mirror of
https://github.com/201206030/novel-plus.git
synced 2025-04-26 17:20:52 +00:00
部分爬虫代码优化
This commit is contained in:
parent
b5df86d5c7
commit
4fe36a8f4f
@ -0,0 +1,25 @@
|
||||
package com.java2nb.novel.core.crawl;
|
||||
|
||||
import com.java2nb.novel.entity.BookContent;
|
||||
import com.java2nb.novel.entity.BookIndex;
|
||||
import lombok.Data;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Chapter data wrapper bean: holds a list of chapter index entries together
* with a list of chapter contents. Accessors are generated by Lombok's @Data.
|
||||
* @author Administrator
|
||||
*/
|
||||
@Data
|
||||
public class ChapterBean {
|
||||
|
||||
/**
|
||||
* Collection of chapter index entries.
|
||||
* */
|
||||
List<BookIndex> bookIndexList;
|
||||
|
||||
/**
|
||||
* Collection of chapter contents.
|
||||
* */
|
||||
List<BookContent> bookContentList;
|
||||
}
|
@ -26,15 +26,11 @@ import static java.util.regex.Pattern.compile;
|
||||
@Slf4j
|
||||
public class CrawlParser {
|
||||
|
||||
private static IdWorker idWorker = new IdWorker();
|
||||
private static final IdWorker idWorker = new IdWorker();
|
||||
|
||||
public static final Integer BOOK_INDEX_LIST_KEY = 1;
|
||||
private static final RestTemplate restTemplate = RestTemplateUtil.getInstance("utf-8");
|
||||
|
||||
public static final Integer BOOK_CONTENT_LIST_KEY = 2;
|
||||
|
||||
private static RestTemplate restTemplate = RestTemplateUtil.getInstance("utf-8");
|
||||
|
||||
private static ThreadLocal<Integer> retryCount = new ThreadLocal<>();
|
||||
private static final ThreadLocal<Integer> retryCount = new ThreadLocal<>();
|
||||
|
||||
@SneakyThrows
|
||||
public static Book parseBook(RuleBean ruleBean, String bookId) {
|
||||
@ -113,14 +109,14 @@ public class CrawlParser {
|
||||
}
|
||||
}
|
||||
|
||||
if (StringUtils.isNotBlank(ruleBean.getUpadateTimePatten()) && StringUtils.isNotBlank(ruleBean.getUpadateTimeFormatPatten())) {
|
||||
Pattern updateTimePatten = compile(ruleBean.getUpadateTimePatten());
|
||||
if (StringUtils.isNotBlank(ruleBean.getUpdateTimePatten()) && StringUtils.isNotBlank(ruleBean.getUpdateTimeFormatPatten())) {
|
||||
Pattern updateTimePatten = compile(ruleBean.getUpdateTimePatten());
|
||||
Matcher updateTimeMatch = updateTimePatten.matcher(bookDetailHtml);
|
||||
boolean isFindUpdateTime = updateTimeMatch.find();
|
||||
if (isFindUpdateTime) {
|
||||
String updateTime = updateTimeMatch.group(1);
|
||||
//设置更新时间
|
||||
book.setLastIndexUpdateTime(new SimpleDateFormat(ruleBean.getUpadateTimeFormatPatten()).parse(updateTime));
|
||||
book.setLastIndexUpdateTime(new SimpleDateFormat(ruleBean.getUpdateTimeFormatPatten()).parse(updateTime));
|
||||
|
||||
}
|
||||
}
|
||||
@ -142,10 +138,7 @@ public class CrawlParser {
|
||||
return book;
|
||||
}
|
||||
|
||||
public static Map<Integer, List> parseBookIndexAndContent(String sourceBookId, Book book, RuleBean ruleBean, Map<Integer, BookIndex> hasIndexs) {
|
||||
Map<Integer, List> result = new HashMap<>(2);
|
||||
result.put(BOOK_INDEX_LIST_KEY, new ArrayList(0));
|
||||
result.put(BOOK_CONTENT_LIST_KEY, new ArrayList(0));
|
||||
public static ChapterBean parseBookIndexAndContent(String sourceBookId, Book book, RuleBean ruleBean, Map<Integer, BookIndex> existBookIndexMap) {
|
||||
|
||||
Date currentDate = new Date();
|
||||
|
||||
@ -171,11 +164,11 @@ public class CrawlParser {
|
||||
int indexNum = 0;
|
||||
|
||||
//总字数
|
||||
Integer totalWordCount = book.getWordCount() == null ? 0 : book.getWordCount();
|
||||
int totalWordCount = book.getWordCount() == null ? 0 : book.getWordCount();
|
||||
|
||||
while (isFindIndex) {
|
||||
|
||||
BookIndex hasIndex = hasIndexs.get(indexNum);
|
||||
BookIndex hasIndex = existBookIndexMap.get(indexNum);
|
||||
String indexName = indexNameMatch.group(1);
|
||||
|
||||
if (hasIndex == null || !StringUtils.deleteWhitespace(hasIndex.getIndexName()).equals(StringUtils.deleteWhitespace(indexName))) {
|
||||
@ -221,7 +214,7 @@ public class CrawlParser {
|
||||
BookIndex bookIndex = new BookIndex();
|
||||
bookIndex.setIndexName(indexName);
|
||||
bookIndex.setIndexNum(indexNum);
|
||||
Integer wordCount = StringUtil.getStrValidWordCount(content);
|
||||
int wordCount = StringUtil.getStrValidWordCount(content);
|
||||
bookIndex.setWordCount(wordCount);
|
||||
indexList.add(bookIndex);
|
||||
|
||||
@ -277,15 +270,20 @@ public class CrawlParser {
|
||||
|
||||
if (indexList.size() == contentList.size() && indexList.size() > 0) {
|
||||
|
||||
result.put(BOOK_INDEX_LIST_KEY, indexList);
|
||||
result.put(BOOK_CONTENT_LIST_KEY, contentList);
|
||||
return new ChapterBean(){{
|
||||
setBookIndexList(indexList);
|
||||
setBookContentList(contentList);
|
||||
}};
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
return result;
|
||||
return new ChapterBean(){{
|
||||
setBookIndexList(new ArrayList<>(0));
|
||||
setBookContentList(new ArrayList<>(0));
|
||||
}};
|
||||
}
|
||||
|
||||
|
||||
@ -294,6 +292,7 @@ public class CrawlParser {
|
||||
ResponseEntity<String> forEntity = restTemplate.getForEntity(url, String.class);
|
||||
if (forEntity.getStatusCode() == HttpStatus.OK) {
|
||||
String body = forEntity.getBody();
|
||||
assert body != null;
|
||||
if (body.length() < Constants.INVALID_HTML_LENGTH) {
|
||||
return processErrorHttpResult(url);
|
||||
}
|
||||
|
@ -37,8 +37,8 @@ public class RuleBean {
|
||||
private String visitCountPatten;
|
||||
private String descStart;;
|
||||
private String descEnd;
|
||||
private String upadateTimePatten;
|
||||
private String upadateTimeFormatPatten;
|
||||
private String updateTimePatten;
|
||||
private String updateTimeFormatPatten;
|
||||
private String bookIndexUrl;
|
||||
private String indexIdPatten;
|
||||
private String indexNamePatten;
|
||||
|
@ -1,6 +1,7 @@
|
||||
package com.java2nb.novel.core.listener;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.java2nb.novel.core.crawl.ChapterBean;
|
||||
import com.java2nb.novel.core.crawl.CrawlParser;
|
||||
import com.java2nb.novel.core.crawl.RuleBean;
|
||||
import com.java2nb.novel.entity.*;
|
||||
@ -16,9 +17,9 @@ import javax.servlet.ServletContextEvent;
|
||||
import javax.servlet.ServletContextListener;
|
||||
import javax.servlet.annotation.WebListener;
|
||||
import java.util.Date;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
/**
|
||||
* @author Administrator
|
||||
@ -66,15 +67,15 @@ public class StarterListener implements ServletContextListener {
|
||||
//查询已存在的章节
|
||||
Map<Integer, BookIndex> existBookIndexMap = bookService.queryExistBookIndexMap(needUpdateBook.getId());
|
||||
//解析章节目录
|
||||
Map<Integer, List> indexAndContentList = CrawlParser.parseBookIndexAndContent(needUpdateBook.getCrawlBookId(), book, ruleBean, existBookIndexMap);
|
||||
bookService.updateBookAndIndexAndContent(book, (List<BookIndex>) indexAndContentList.get(CrawlParser.BOOK_INDEX_LIST_KEY), (List<BookContent>) indexAndContentList.get(CrawlParser.BOOK_CONTENT_LIST_KEY), existBookIndexMap);
|
||||
ChapterBean chapter = CrawlParser.parseBookIndexAndContent(needUpdateBook.getCrawlBookId(), book, ruleBean, existBookIndexMap);
|
||||
bookService.updateBookAndIndexAndContent(book, chapter.getBookIndexList(), chapter.getBookContentList(), existBookIndexMap);
|
||||
} catch (Exception e) {
|
||||
log.error(e.getMessage(), e);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
Thread.sleep(1000 * 60 * 10);
|
||||
// 休眠10分钟
|
||||
TimeUnit.MINUTES.sleep(10);
|
||||
} catch (Exception e) {
|
||||
log.error(e.getMessage(), e);
|
||||
}
|
||||
@ -107,7 +108,8 @@ public class StarterListener implements ServletContextListener {
|
||||
|
||||
}
|
||||
|
||||
Thread.sleep(1000 * 60);
|
||||
//休眠1分钟
|
||||
TimeUnit.MINUTES.sleep(1);
|
||||
|
||||
} catch (Exception e) {
|
||||
log.error(e.getMessage(), e);
|
||||
|
@ -7,6 +7,7 @@ import com.github.pagehelper.PageInfo;
|
||||
import com.java2nb.novel.core.bean.PageBean;
|
||||
import com.java2nb.novel.core.cache.CacheKey;
|
||||
import com.java2nb.novel.core.cache.CacheService;
|
||||
import com.java2nb.novel.core.crawl.ChapterBean;
|
||||
import com.java2nb.novel.core.crawl.CrawlParser;
|
||||
import com.java2nb.novel.core.crawl.RuleBean;
|
||||
import com.java2nb.novel.core.enums.ResponseStatus;
|
||||
@ -303,9 +304,9 @@ public class CrawlServiceImpl implements CrawlService {
|
||||
book.setCrawlLastTime(new Date());
|
||||
book.setId(new IdWorker().nextId());
|
||||
//解析章节目录
|
||||
Map<Integer, List> indexAndContentList = CrawlParser.parseBookIndexAndContent(bookId, book, ruleBean, new HashMap<>(0));
|
||||
ChapterBean chapter = CrawlParser.parseBookIndexAndContent(bookId, book, ruleBean, new HashMap<>(0));
|
||||
|
||||
bookService.saveBookAndIndexAndContent(book, (List<BookIndex>) indexAndContentList.get(CrawlParser.BOOK_INDEX_LIST_KEY), (List<BookContent>) indexAndContentList.get(CrawlParser.BOOK_CONTENT_LIST_KEY));
|
||||
bookService.saveBookAndIndexAndContent(book, chapter.getBookIndexList(), chapter.getBookContentList());
|
||||
|
||||
} else {
|
||||
//只更新书籍的爬虫相关字段
|
||||
|
Loading…
x
Reference in New Issue
Block a user