diff --git a/novel-crawl/src/main/java/com/java2nb/novel/core/crawl/CrawlBookHandler.java b/novel-crawl/src/main/java/com/java2nb/novel/core/crawl/CrawlBookHandler.java index 7d6a8c0..9fdab12 100644 --- a/novel-crawl/src/main/java/com/java2nb/novel/core/crawl/CrawlBookHandler.java +++ b/novel-crawl/src/main/java/com/java2nb/novel/core/crawl/CrawlBookHandler.java @@ -7,6 +7,6 @@ import com.java2nb.novel.entity.Book; * */ public interface CrawlBookHandler { - void handle(Book book); + void handle(Book book) throws InterruptedException; } diff --git a/novel-crawl/src/main/java/com/java2nb/novel/core/crawl/CrawlParser.java b/novel-crawl/src/main/java/com/java2nb/novel/core/crawl/CrawlParser.java index 312ca43..95b0165 100644 --- a/novel-crawl/src/main/java/com/java2nb/novel/core/crawl/CrawlParser.java +++ b/novel-crawl/src/main/java/com/java2nb/novel/core/crawl/CrawlParser.java @@ -10,9 +10,11 @@ import com.java2nb.novel.utils.CrawlHttpClient; import io.github.xxyopen.util.IdWorker; import lombok.RequiredArgsConstructor; import lombok.SneakyThrows; +import lombok.extern.slf4j.Slf4j; import org.apache.commons.lang3.StringUtils; import org.springframework.stereotype.Component; +import java.text.ParseException; import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Date; @@ -26,6 +28,7 @@ import java.util.regex.Pattern; * * @author Administrator */ +@Slf4j @Component @RequiredArgsConstructor public class CrawlParser { @@ -34,8 +37,8 @@ public class CrawlParser { private final CrawlHttpClient crawlHttpClient; - @SneakyThrows - public void parseBook(RuleBean ruleBean, String bookId, CrawlBookHandler handler) { + public void parseBook(RuleBean ruleBean, String bookId, CrawlBookHandler handler) + throws InterruptedException { Book book = new Book(); String bookDetailUrl = ruleBean.getBookDetailUrl().replace("{bookId}", bookId); String bookDetailHtml = crawlHttpClient.get(bookDetailUrl, ruleBean.getCharset()); @@ -120,8 +123,12 @@ public class CrawlParser { if (isFindUpdateTime) { String updateTime = updateTimeMatch.group(1); //设置更新时间 - book.setLastIndexUpdateTime( - new SimpleDateFormat(ruleBean.getUpadateTimeFormatPatten()).parse(updateTime)); + try { + book.setLastIndexUpdateTime( + new SimpleDateFormat(ruleBean.getUpadateTimeFormatPatten()).parse(updateTime)); + } catch (ParseException e) { + log.error("解析最新章节更新时间出错", e); + } } } @@ -144,7 +151,7 @@ public class CrawlParser { } public boolean parseBookIndexAndContent(String sourceBookId, Book book, RuleBean ruleBean, - Map existBookIndexMap, CrawlBookChapterHandler handler) { + Map existBookIndexMap, CrawlBookChapterHandler handler) throws InterruptedException{ Date currentDate = new Date(); diff --git a/novel-crawl/src/main/java/com/java2nb/novel/core/listener/StarterListener.java b/novel-crawl/src/main/java/com/java2nb/novel/core/listener/StarterListener.java index 6d7f9cf..a062ffc 100644 --- a/novel-crawl/src/main/java/com/java2nb/novel/core/listener/StarterListener.java +++ b/novel-crawl/src/main/java/com/java2nb/novel/core/listener/StarterListener.java @@ -74,10 +74,8 @@ public class StarterListener implements ServletContextInitializer { needUpdateBook.getId()); //解析章节目录 crawlParser.parseBookIndexAndContent(needUpdateBook.getCrawlBookId(), book, - ruleBean, existBookIndexMap, chapter -> { - bookService.updateBookAndIndexAndContent(book, chapter.getBookIndexList(), - chapter.getBookContentList(), existBookIndexMap); - }); + ruleBean, existBookIndexMap, chapter -> bookService.updateBookAndIndexAndContent(book, chapter.getBookIndexList(), + chapter.getBookContentList(), existBookIndexMap)); }); } catch (Exception e) { log.error(e.getMessage(), e); diff --git a/novel-crawl/src/main/java/com/java2nb/novel/service/CrawlService.java b/novel-crawl/src/main/java/com/java2nb/novel/service/CrawlService.java index fdb36a5..f6dbb97 100644 --- a/novel-crawl/src/main/java/com/java2nb/novel/service/CrawlService.java +++ b/novel-crawl/src/main/java/com/java2nb/novel/service/CrawlService.java @@ -53,7 +53,7 @@ public interface CrawlService { * @param ruleBean 采集规则\ * @return true:成功,false:失败 * */ - boolean parseBookAndSave(int catId, RuleBean ruleBean, Integer sourceId, String bookId); + boolean parseBookAndSave(int catId, RuleBean ruleBean, Integer sourceId, String bookId) throws InterruptedException; /** * 根据爬虫状态查询爬虫源集合 diff --git a/novel-crawl/src/main/java/com/java2nb/novel/service/impl/CrawlServiceImpl.java b/novel-crawl/src/main/java/com/java2nb/novel/service/impl/CrawlServiceImpl.java index f8743fe..18d75d3 100644 --- a/novel-crawl/src/main/java/com/java2nb/novel/service/impl/CrawlServiceImpl.java +++ b/novel-crawl/src/main/java/com/java2nb/novel/service/impl/CrawlServiceImpl.java @@ -249,6 +249,11 @@ public class CrawlServiceImpl implements CrawlService { @Override public void parseBookList(int catId, RuleBean ruleBean, Integer sourceId) { + String catIdRule = ruleBean.getCatIdRule().get("catId" + catId); + if (StringUtils.isBlank(catIdRule)) { + return; + } + //当前页码1 int page = 1; int totalPage = page; @@ -256,11 +261,7 @@ public class CrawlServiceImpl implements CrawlService { while (page <= totalPage) { try { - String catIdRule = ruleBean.getCatIdRule().get("catId" + catId); - if (StringUtils.isBlank(catIdRule) || Thread.currentThread().isInterrupted()) { - return; - } - String catBookListUrl = ""; + String catBookListUrl; if (StringUtils.isNotBlank(ruleBean.getBookListUrl())) { // 兼容老规则 // 拼接分类URL @@ -290,6 +291,12 @@ public class CrawlServiceImpl implements CrawlService { String bookId = bookIdMatcher.group(1); parseBookAndSave(catId, ruleBean, sourceId, bookId); + } catch (InterruptedException e) { + log.error(e.getMessage(), e); + //1.阻塞过程(使用了 sleep,同步锁的 wait,socket 中的 receiver,accept 等方法时) + //捕获中断异常InterruptedException来退出线程。 + //2.非阻塞过程中通过判断中断标志来退出线程。 + return; } catch (Exception e) { log.error(e.getMessage(), e); } @@ -306,6 +313,12 @@ public class CrawlServiceImpl implements CrawlService { } } + } catch (InterruptedException e) { + log.error(e.getMessage(), e); + //1.阻塞过程(使用了 sleep,同步锁的 wait,socket 中的 receiver,accept 等方法时) + //捕获中断异常InterruptedException来退出线程。 + //2.非阻塞过程中通过判断中断标志来退出线程。 + return; } catch (Exception e) { log.error(e.getMessage(), e); } @@ -317,8 +330,12 @@ public class CrawlServiceImpl implements CrawlService { Thread.sleep(Duration.ofMinutes(1)); } catch (InterruptedException e) { log.error(e.getMessage(), e); + //1.阻塞过程(使用了 sleep,同步锁的 wait,socket 中的 receiver,accept 等方法时) + //捕获中断异常InterruptedException来退出线程。 + //2.非阻塞过程中通过判断中断标志来退出线程。 + return; } - }else{ + } else { page += 1; } } @@ -327,7 +344,8 @@ public class CrawlServiceImpl implements CrawlService { } @Override - public boolean parseBookAndSave(int catId, RuleBean ruleBean, Integer sourceId, String bookId) { + public boolean parseBookAndSave(int catId, RuleBean ruleBean, Integer sourceId, String bookId) + throws InterruptedException { final AtomicBoolean parseResult = new AtomicBoolean(false); @@ -391,4 +409,5 @@ public class CrawlServiceImpl implements CrawlService { .render(RenderingStrategies.MYBATIS3); return crawlSourceMapper.selectMany(render); } + } diff --git a/novel-crawl/src/main/java/com/java2nb/novel/utils/CrawlHttpClient.java b/novel-crawl/src/main/java/com/java2nb/novel/utils/CrawlHttpClient.java index 38e492b..cdc90f0 100644 --- a/novel-crawl/src/main/java/com/java2nb/novel/utils/CrawlHttpClient.java +++ b/novel-crawl/src/main/java/com/java2nb/novel/utils/CrawlHttpClient.java @@ -25,13 +25,9 @@ public class CrawlHttpClient { private static final ThreadLocal RETRY_COUNT = new ThreadLocal<>(); - public String get(String url, String charset) { + public String get(String url, String charset) throws InterruptedException { if (Objects.nonNull(intervalMin) && Objects.nonNull(intervalMax) && intervalMax > intervalMin) { - try { - Thread.sleep(random.nextInt(intervalMax - intervalMin + 1) + intervalMin); - } catch (InterruptedException e) { - log.error(e.getMessage(), e); - } + Thread.sleep(random.nextInt(intervalMax - intervalMin + 1) + intervalMin); } String body = HttpUtil.getByHttpClientWithChrome(url, charset); if (Objects.isNull(body) || body.length() < Constants.INVALID_HTML_LENGTH) { @@ -41,7 +37,7 @@ public class CrawlHttpClient { return body; } - private String processErrorHttpResult(String url, String charset) { + private String processErrorHttpResult(String url, String charset) throws InterruptedException{ Integer count = RETRY_COUNT.get(); if (count == null) { count = 0;