Compare commits

..

4 Commits

7 changed files with 55 additions and 50 deletions

View File

@ -1,5 +1,4 @@
<p align="center">
<a href="https://www.swiftproxy.net/?code=T2WV1VT50"><img src="https://xxyopen.com/images/ad1.png" alt="AD" ></a>
<a href="https://cloud.tencent.com/act/cps/redirect?redirect=2446&cps_key=736e609d66e0ac4e57813316cec6fd0b&from=console"><img src="https://youdoc.github.io/img/tencent.jpg" alt="AD" ></a>
</p>
<p align="center">

View File

@ -5,7 +5,7 @@
<groupId>com.java2nb</groupId>
<artifactId>novel-admin</artifactId>
<version>5.1.1</version>
<version>5.1.2</version>
<packaging>jar</packaging>
<name>novel-admin</name>

View File

@ -5,7 +5,7 @@
<parent>
<artifactId>novel</artifactId>
<groupId>com.java2nb</groupId>
<version>5.1.1</version>
<version>5.1.2</version>
</parent>
<modelVersion>4.0.0</modelVersion>

View File

@ -5,7 +5,7 @@
<parent>
<artifactId>novel</artifactId>
<groupId>com.java2nb</groupId>
<version>5.1.1</version>
<version>5.1.2</version>
</parent>
<modelVersion>4.0.0</modelVersion>

View File

@ -34,6 +34,7 @@ import org.mybatis.dynamic.sql.render.RenderingStrategies;
import org.mybatis.dynamic.sql.select.render.SelectStatementProvider;
import org.springframework.stereotype.Service;
import java.time.Duration;
import java.util.*;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.regex.Matcher;
@ -256,65 +257,70 @@ public class CrawlServiceImpl implements CrawlService {
try {
String catIdRule = ruleBean.getCatIdRule().get("catId" + catId);
if (StringUtils.isNotBlank(catIdRule)) {
String catBookListUrl = "";
if (StringUtils.isNotBlank(ruleBean.getBookListUrl())) {
// 兼容老规则
// 拼接分类URL
catBookListUrl = ruleBean.getBookListUrl()
.replace("{catId}", catIdRule)
.replace("{page}", page + "");
} else {
// 新规则
// 拼接分类URL
catBookListUrl = catIdRule.replace("{page}", page + "");
}
log.info("catBookListUrl{}", catBookListUrl);
if (StringUtils.isBlank(catIdRule) || Thread.currentThread().isInterrupted()) {
return;
}
String catBookListUrl = "";
if (StringUtils.isNotBlank(ruleBean.getBookListUrl())) {
// 兼容老规则
// 拼接分类URL
catBookListUrl = ruleBean.getBookListUrl()
.replace("{catId}", catIdRule)
.replace("{page}", page + "");
} else {
// 新规则
// 拼接分类URL
catBookListUrl = catIdRule.replace("{page}", page + "");
}
log.info("catBookListUrl{}", catBookListUrl);
String bookListHtml = crawlHttpClient.get(catBookListUrl, ruleBean.getCharset());
if (bookListHtml != null) {
Pattern bookIdPatten = Pattern.compile(ruleBean.getBookIdPatten());
Matcher bookIdMatcher = bookIdPatten.matcher(bookListHtml);
boolean isFindBookId = bookIdMatcher.find();
while (isFindBookId) {
try {
//1.阻塞过程(使用了 sleep,同步锁的 wait,socket 中的 receiver,accept 等方法时)
//捕获中断异常InterruptedException来退出线程。
//2.非阻塞过程中通过判断中断标志来退出线程。
if (Thread.currentThread().isInterrupted()) {
return;
}
String bookId = bookIdMatcher.group(1);
parseBookAndSave(catId, ruleBean, sourceId, bookId);
} catch (Exception e) {
log.error(e.getMessage(), e);
String bookListHtml = crawlHttpClient.get(catBookListUrl, ruleBean.getCharset());
if (bookListHtml != null) {
Pattern bookIdPatten = Pattern.compile(ruleBean.getBookIdPatten());
Matcher bookIdMatcher = bookIdPatten.matcher(bookListHtml);
boolean isFindBookId = bookIdMatcher.find();
while (isFindBookId) {
try {
//1.阻塞过程(使用了 sleep,同步锁的 wait,socket 中的 receiver,accept 等方法时)
//捕获中断异常InterruptedException来退出线程。
//2.非阻塞过程中通过判断中断标志来退出线程。
if (Thread.currentThread().isInterrupted()) {
return;
}
isFindBookId = bookIdMatcher.find();
String bookId = bookIdMatcher.group(1);
parseBookAndSave(catId, ruleBean, sourceId, bookId);
} catch (Exception e) {
log.error(e.getMessage(), e);
}
Pattern totalPagePatten = Pattern.compile(ruleBean.getTotalPagePatten());
Matcher totalPageMatcher = totalPagePatten.matcher(bookListHtml);
boolean isFindTotalPage = totalPageMatcher.find();
if (isFindTotalPage) {
isFindBookId = bookIdMatcher.find();
}
totalPage = Integer.parseInt(totalPageMatcher.group(1));
}
Pattern totalPagePatten = Pattern.compile(ruleBean.getTotalPagePatten());
Matcher totalPageMatcher = totalPagePatten.matcher(bookListHtml);
boolean isFindTotalPage = totalPageMatcher.find();
if (isFindTotalPage) {
totalPage = Integer.parseInt(totalPageMatcher.group(1));
}
}
} catch (Exception e) {
log.error(e.getMessage(), e);
}
if (page == totalPage) {
if (page >= totalPage) {
// 第一遍采集完成,翻到第一页,继续第二次采集,适用于分页数比较少的最近更新列表
page = 0;
page = 1;
try {
// 第一遍采集完成休眠1分钟
Thread.sleep(Duration.ofMinutes(1));
} catch (InterruptedException e) {
log.error(e.getMessage(), e);
}
}else{
page += 1;
}
page += 1;
}

View File

@ -5,7 +5,7 @@
<parent>
<artifactId>novel</artifactId>
<groupId>com.java2nb</groupId>
<version>5.1.1</version>
<version>5.1.2</version>
</parent>
<modelVersion>4.0.0</modelVersion>

View File

@ -5,7 +5,7 @@
<groupId>com.java2nb</groupId>
<artifactId>novel</artifactId>
<version>5.1.1</version>
<version>5.1.2</version>
<modules>
<module>novel-common</module>
<module>novel-front</module>