mirror of
https://github.com/201206030/novel-plus.git
synced 2025-05-17 07:18:29 +00:00
fix(novel-crawl): 调整线程终止逻辑
This commit is contained in:
parent
2c86cb9a7d
commit
1f53b56bd6
@ -7,6 +7,6 @@ import com.java2nb.novel.entity.Book;
|
|||||||
* */
|
* */
|
||||||
public interface CrawlBookHandler {
|
public interface CrawlBookHandler {
|
||||||
|
|
||||||
void handle(Book book);
|
void handle(Book book) throws InterruptedException;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -10,9 +10,11 @@ import com.java2nb.novel.utils.CrawlHttpClient;
|
|||||||
import io.github.xxyopen.util.IdWorker;
|
import io.github.xxyopen.util.IdWorker;
|
||||||
import lombok.RequiredArgsConstructor;
|
import lombok.RequiredArgsConstructor;
|
||||||
import lombok.SneakyThrows;
|
import lombok.SneakyThrows;
|
||||||
|
import lombok.extern.slf4j.Slf4j;
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
import org.springframework.stereotype.Component;
|
import org.springframework.stereotype.Component;
|
||||||
|
|
||||||
|
import java.text.ParseException;
|
||||||
import java.text.SimpleDateFormat;
|
import java.text.SimpleDateFormat;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Date;
|
import java.util.Date;
|
||||||
@ -26,6 +28,7 @@ import java.util.regex.Pattern;
|
|||||||
*
|
*
|
||||||
* @author Administrator
|
* @author Administrator
|
||||||
*/
|
*/
|
||||||
|
@Slf4j
|
||||||
@Component
|
@Component
|
||||||
@RequiredArgsConstructor
|
@RequiredArgsConstructor
|
||||||
public class CrawlParser {
|
public class CrawlParser {
|
||||||
@ -34,8 +37,8 @@ public class CrawlParser {
|
|||||||
|
|
||||||
private final CrawlHttpClient crawlHttpClient;
|
private final CrawlHttpClient crawlHttpClient;
|
||||||
|
|
||||||
@SneakyThrows
|
public void parseBook(RuleBean ruleBean, String bookId, CrawlBookHandler handler)
|
||||||
public void parseBook(RuleBean ruleBean, String bookId, CrawlBookHandler handler) {
|
throws InterruptedException {
|
||||||
Book book = new Book();
|
Book book = new Book();
|
||||||
String bookDetailUrl = ruleBean.getBookDetailUrl().replace("{bookId}", bookId);
|
String bookDetailUrl = ruleBean.getBookDetailUrl().replace("{bookId}", bookId);
|
||||||
String bookDetailHtml = crawlHttpClient.get(bookDetailUrl, ruleBean.getCharset());
|
String bookDetailHtml = crawlHttpClient.get(bookDetailUrl, ruleBean.getCharset());
|
||||||
@ -120,8 +123,12 @@ public class CrawlParser {
|
|||||||
if (isFindUpdateTime) {
|
if (isFindUpdateTime) {
|
||||||
String updateTime = updateTimeMatch.group(1);
|
String updateTime = updateTimeMatch.group(1);
|
||||||
//设置更新时间
|
//设置更新时间
|
||||||
|
try {
|
||||||
book.setLastIndexUpdateTime(
|
book.setLastIndexUpdateTime(
|
||||||
new SimpleDateFormat(ruleBean.getUpadateTimeFormatPatten()).parse(updateTime));
|
new SimpleDateFormat(ruleBean.getUpadateTimeFormatPatten()).parse(updateTime));
|
||||||
|
} catch (ParseException e) {
|
||||||
|
log.error("解析最新章节更新时间出错", e);
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -144,7 +151,7 @@ public class CrawlParser {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public boolean parseBookIndexAndContent(String sourceBookId, Book book, RuleBean ruleBean,
|
public boolean parseBookIndexAndContent(String sourceBookId, Book book, RuleBean ruleBean,
|
||||||
Map<Integer, BookIndex> existBookIndexMap, CrawlBookChapterHandler handler) {
|
Map<Integer, BookIndex> existBookIndexMap, CrawlBookChapterHandler handler) throws InterruptedException{
|
||||||
|
|
||||||
Date currentDate = new Date();
|
Date currentDate = new Date();
|
||||||
|
|
||||||
|
@ -74,10 +74,8 @@ public class StarterListener implements ServletContextInitializer {
|
|||||||
needUpdateBook.getId());
|
needUpdateBook.getId());
|
||||||
//解析章节目录
|
//解析章节目录
|
||||||
crawlParser.parseBookIndexAndContent(needUpdateBook.getCrawlBookId(), book,
|
crawlParser.parseBookIndexAndContent(needUpdateBook.getCrawlBookId(), book,
|
||||||
ruleBean, existBookIndexMap, chapter -> {
|
ruleBean, existBookIndexMap, chapter -> bookService.updateBookAndIndexAndContent(book, chapter.getBookIndexList(),
|
||||||
bookService.updateBookAndIndexAndContent(book, chapter.getBookIndexList(),
|
chapter.getBookContentList(), existBookIndexMap));
|
||||||
chapter.getBookContentList(), existBookIndexMap);
|
|
||||||
});
|
|
||||||
});
|
});
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
log.error(e.getMessage(), e);
|
log.error(e.getMessage(), e);
|
||||||
|
@ -53,7 +53,7 @@ public interface CrawlService {
|
|||||||
* @param ruleBean 采集规则\
|
* @param ruleBean 采集规则\
|
||||||
* @return true:成功,false:失败
|
* @return true:成功,false:失败
|
||||||
* */
|
* */
|
||||||
boolean parseBookAndSave(int catId, RuleBean ruleBean, Integer sourceId, String bookId);
|
boolean parseBookAndSave(int catId, RuleBean ruleBean, Integer sourceId, String bookId) throws InterruptedException;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 根据爬虫状态查询爬虫源集合
|
* 根据爬虫状态查询爬虫源集合
|
||||||
|
@ -249,6 +249,11 @@ public class CrawlServiceImpl implements CrawlService {
|
|||||||
@Override
|
@Override
|
||||||
public void parseBookList(int catId, RuleBean ruleBean, Integer sourceId) {
|
public void parseBookList(int catId, RuleBean ruleBean, Integer sourceId) {
|
||||||
|
|
||||||
|
String catIdRule = ruleBean.getCatIdRule().get("catId" + catId);
|
||||||
|
if (StringUtils.isBlank(catIdRule)) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
//当前页码1
|
//当前页码1
|
||||||
int page = 1;
|
int page = 1;
|
||||||
int totalPage = page;
|
int totalPage = page;
|
||||||
@ -256,11 +261,7 @@ public class CrawlServiceImpl implements CrawlService {
|
|||||||
while (page <= totalPage) {
|
while (page <= totalPage) {
|
||||||
|
|
||||||
try {
|
try {
|
||||||
String catIdRule = ruleBean.getCatIdRule().get("catId" + catId);
|
String catBookListUrl;
|
||||||
if (StringUtils.isBlank(catIdRule) || Thread.currentThread().isInterrupted()) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
String catBookListUrl = "";
|
|
||||||
if (StringUtils.isNotBlank(ruleBean.getBookListUrl())) {
|
if (StringUtils.isNotBlank(ruleBean.getBookListUrl())) {
|
||||||
// 兼容老规则
|
// 兼容老规则
|
||||||
// 拼接分类URL
|
// 拼接分类URL
|
||||||
@ -290,6 +291,12 @@ public class CrawlServiceImpl implements CrawlService {
|
|||||||
|
|
||||||
String bookId = bookIdMatcher.group(1);
|
String bookId = bookIdMatcher.group(1);
|
||||||
parseBookAndSave(catId, ruleBean, sourceId, bookId);
|
parseBookAndSave(catId, ruleBean, sourceId, bookId);
|
||||||
|
} catch (InterruptedException e) {
|
||||||
|
log.error(e.getMessage(), e);
|
||||||
|
//1.阻塞过程(使用了 sleep,同步锁的 wait,socket 中的 receiver,accept 等方法时)
|
||||||
|
//捕获中断异常InterruptedException来退出线程。
|
||||||
|
//2.非阻塞过程中通过判断中断标志来退出线程。
|
||||||
|
return;
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
log.error(e.getMessage(), e);
|
log.error(e.getMessage(), e);
|
||||||
}
|
}
|
||||||
@ -306,6 +313,12 @@ public class CrawlServiceImpl implements CrawlService {
|
|||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
} catch (InterruptedException e) {
|
||||||
|
log.error(e.getMessage(), e);
|
||||||
|
//1.阻塞过程(使用了 sleep,同步锁的 wait,socket 中的 receiver,accept 等方法时)
|
||||||
|
//捕获中断异常InterruptedException来退出线程。
|
||||||
|
//2.非阻塞过程中通过判断中断标志来退出线程。
|
||||||
|
return;
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
log.error(e.getMessage(), e);
|
log.error(e.getMessage(), e);
|
||||||
}
|
}
|
||||||
@ -317,6 +330,10 @@ public class CrawlServiceImpl implements CrawlService {
|
|||||||
Thread.sleep(Duration.ofMinutes(1));
|
Thread.sleep(Duration.ofMinutes(1));
|
||||||
} catch (InterruptedException e) {
|
} catch (InterruptedException e) {
|
||||||
log.error(e.getMessage(), e);
|
log.error(e.getMessage(), e);
|
||||||
|
//1.阻塞过程(使用了 sleep,同步锁的 wait,socket 中的 receiver,accept 等方法时)
|
||||||
|
//捕获中断异常InterruptedException来退出线程。
|
||||||
|
//2.非阻塞过程中通过判断中断标志来退出线程。
|
||||||
|
return;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
page += 1;
|
page += 1;
|
||||||
@ -327,7 +344,8 @@ public class CrawlServiceImpl implements CrawlService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public boolean parseBookAndSave(int catId, RuleBean ruleBean, Integer sourceId, String bookId) {
|
public boolean parseBookAndSave(int catId, RuleBean ruleBean, Integer sourceId, String bookId)
|
||||||
|
throws InterruptedException {
|
||||||
|
|
||||||
final AtomicBoolean parseResult = new AtomicBoolean(false);
|
final AtomicBoolean parseResult = new AtomicBoolean(false);
|
||||||
|
|
||||||
@ -391,4 +409,5 @@ public class CrawlServiceImpl implements CrawlService {
|
|||||||
.render(RenderingStrategies.MYBATIS3);
|
.render(RenderingStrategies.MYBATIS3);
|
||||||
return crawlSourceMapper.selectMany(render);
|
return crawlSourceMapper.selectMany(render);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -25,13 +25,9 @@ public class CrawlHttpClient {
|
|||||||
|
|
||||||
private static final ThreadLocal<Integer> RETRY_COUNT = new ThreadLocal<>();
|
private static final ThreadLocal<Integer> RETRY_COUNT = new ThreadLocal<>();
|
||||||
|
|
||||||
public String get(String url, String charset) {
|
public String get(String url, String charset) throws InterruptedException {
|
||||||
if (Objects.nonNull(intervalMin) && Objects.nonNull(intervalMax) && intervalMax > intervalMin) {
|
if (Objects.nonNull(intervalMin) && Objects.nonNull(intervalMax) && intervalMax > intervalMin) {
|
||||||
try {
|
|
||||||
Thread.sleep(random.nextInt(intervalMax - intervalMin + 1) + intervalMin);
|
Thread.sleep(random.nextInt(intervalMax - intervalMin + 1) + intervalMin);
|
||||||
} catch (InterruptedException e) {
|
|
||||||
log.error(e.getMessage(), e);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
String body = HttpUtil.getByHttpClientWithChrome(url, charset);
|
String body = HttpUtil.getByHttpClientWithChrome(url, charset);
|
||||||
if (Objects.isNull(body) || body.length() < Constants.INVALID_HTML_LENGTH) {
|
if (Objects.isNull(body) || body.length() < Constants.INVALID_HTML_LENGTH) {
|
||||||
@ -41,7 +37,7 @@ public class CrawlHttpClient {
|
|||||||
return body;
|
return body;
|
||||||
}
|
}
|
||||||
|
|
||||||
private String processErrorHttpResult(String url, String charset) {
|
private String processErrorHttpResult(String url, String charset) throws InterruptedException{
|
||||||
Integer count = RETRY_COUNT.get();
|
Integer count = RETRY_COUNT.get();
|
||||||
if (count == null) {
|
if (count == null) {
|
||||||
count = 0;
|
count = 0;
|
||||||
|
Loading…
x
Reference in New Issue
Block a user