From 55d5deea745a3f191298e0641b0571c6d0049bd9 Mon Sep 17 00:00:00 2001 From: xiongxiaoyang <1179705413@qq.com> Date: Mon, 12 May 2025 17:48:24 +0800 Subject: [PATCH] =?UTF-8?q?fix(novel-crawl):=20=E4=BF=AE=E5=A4=8D=E9=83=A8?= =?UTF-8?q?=E5=88=86=E6=BA=90=E6=97=A0=E6=B3=95=E5=81=9C=E6=AD=A2=E7=9A=84?= =?UTF-8?q?=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../novel/service/impl/CrawlServiceImpl.java | 79 +++++++++---------- 1 file changed, 39 insertions(+), 40 deletions(-) diff --git a/novel-crawl/src/main/java/com/java2nb/novel/service/impl/CrawlServiceImpl.java b/novel-crawl/src/main/java/com/java2nb/novel/service/impl/CrawlServiceImpl.java index b3bc75b..4610291 100644 --- a/novel-crawl/src/main/java/com/java2nb/novel/service/impl/CrawlServiceImpl.java +++ b/novel-crawl/src/main/java/com/java2nb/novel/service/impl/CrawlServiceImpl.java @@ -256,53 +256,52 @@ public class CrawlServiceImpl implements CrawlService { try { String catIdRule = ruleBean.getCatIdRule().get("catId" + catId); - if (StringUtils.isNotBlank(catIdRule)) { - String catBookListUrl = ""; - if (StringUtils.isNotBlank(ruleBean.getBookListUrl())) { - // 兼容老规则 - // 拼接分类URL - catBookListUrl = ruleBean.getBookListUrl() - .replace("{catId}", catIdRule) - .replace("{page}", page + ""); - } else { - // 新规则 - // 拼接分类URL - catBookListUrl = catIdRule.replace("{page}", page + ""); - } - log.info("catBookListUrl:{}", catBookListUrl); + if (StringUtils.isBlank(catIdRule) || Thread.currentThread().isInterrupted()) { + return; + } + String catBookListUrl = ""; + if (StringUtils.isNotBlank(ruleBean.getBookListUrl())) { + // 兼容老规则 + // 拼接分类URL + catBookListUrl = ruleBean.getBookListUrl() + .replace("{catId}", catIdRule) + .replace("{page}", page + ""); + } else { + // 新规则 + // 拼接分类URL + catBookListUrl = catIdRule.replace("{page}", page + ""); + } + log.info("catBookListUrl:{}", catBookListUrl); - String bookListHtml = crawlHttpClient.get(catBookListUrl, ruleBean.getCharset()); - if (bookListHtml != null) { - Pattern bookIdPatten = Pattern.compile(ruleBean.getBookIdPatten()); - Matcher bookIdMatcher = bookIdPatten.matcher(bookListHtml); - boolean isFindBookId = bookIdMatcher.find(); - while (isFindBookId) { - try { - //1.阻塞过程(使用了 sleep,同步锁的 wait,socket 中的 receiver,accept 等方法时) - //捕获中断异常InterruptedException来退出线程。 - //2.非阻塞过程中通过判断中断标志来退出线程。 - if (Thread.currentThread().isInterrupted()) { - return; - } - - String bookId = bookIdMatcher.group(1); - parseBookAndSave(catId, ruleBean, sourceId, bookId); - } catch (Exception e) { - log.error(e.getMessage(), e); + String bookListHtml = crawlHttpClient.get(catBookListUrl, ruleBean.getCharset()); + if (bookListHtml != null) { + Pattern bookIdPatten = Pattern.compile(ruleBean.getBookIdPatten()); + Matcher bookIdMatcher = bookIdPatten.matcher(bookListHtml); + boolean isFindBookId = bookIdMatcher.find(); + while (isFindBookId) { + try { + //1.阻塞过程(使用了 sleep,同步锁的 wait,socket 中的 receiver,accept 等方法时) + //捕获中断异常InterruptedException来退出线程。 + //2.非阻塞过程中通过判断中断标志来退出线程。 + if (Thread.currentThread().isInterrupted()) { + return; } - isFindBookId = bookIdMatcher.find(); + String bookId = bookIdMatcher.group(1); + parseBookAndSave(catId, ruleBean, sourceId, bookId); + } catch (Exception e) { + log.error(e.getMessage(), e); } - Pattern totalPagePatten = Pattern.compile(ruleBean.getTotalPagePatten()); - Matcher totalPageMatcher = totalPagePatten.matcher(bookListHtml); - boolean isFindTotalPage = totalPageMatcher.find(); - if (isFindTotalPage) { + isFindBookId = bookIdMatcher.find(); + } - totalPage = Integer.parseInt(totalPageMatcher.group(1)); - - } + Pattern totalPagePatten = Pattern.compile(ruleBean.getTotalPagePatten()); + Matcher totalPageMatcher = totalPagePatten.matcher(bookListHtml); + boolean isFindTotalPage = totalPageMatcher.find(); + if (isFindTotalPage) { + totalPage = Integer.parseInt(totalPageMatcher.group(1)); } }