部分爬虫代码优化

This commit is contained in:
xiaoyang 2021-07-24 15:51:54 +08:00
parent b5df86d5c7
commit 4fe36a8f4f
5 changed files with 57 additions and 30 deletions

View File

@ -0,0 +1,25 @@
package com.java2nb.novel.core.crawl;
import com.java2nb.novel.entity.BookContent;
import com.java2nb.novel.entity.BookIndex;
import lombok.Data;
import java.util.List;
/**
 * Wrapper bean for the result of parsing a book's chapters from a crawl
 * source: the chapter index (table of contents) entries and their matching
 * content bodies. Lombok {@code @Data} generates getters, setters,
 * {@code equals}/{@code hashCode} and {@code toString}.
 *
 * @author Administrator
 */
@Data
public class ChapterBean {

    /**
     * Chapter index (table-of-contents) entries.
     * Made private so access goes through the Lombok-generated accessors.
     */
    private List<BookIndex> bookIndexList;

    /**
     * Chapter content entries; expected to be parallel to
     * {@link #bookIndexList} (same size, same order) — the parser only
     * returns a populated bean when the two list sizes match.
     */
    private List<BookContent> bookContentList;
}

View File

@ -26,15 +26,11 @@ import static java.util.regex.Pattern.compile;
@Slf4j
public class CrawlParser {
private static IdWorker idWorker = new IdWorker();
private static final IdWorker idWorker = new IdWorker();
public static final Integer BOOK_INDEX_LIST_KEY = 1;
private static final RestTemplate restTemplate = RestTemplateUtil.getInstance("utf-8");
public static final Integer BOOK_CONTENT_LIST_KEY = 2;
private static RestTemplate restTemplate = RestTemplateUtil.getInstance("utf-8");
private static ThreadLocal<Integer> retryCount = new ThreadLocal<>();
private static final ThreadLocal<Integer> retryCount = new ThreadLocal<>();
@SneakyThrows
public static Book parseBook(RuleBean ruleBean, String bookId) {
@ -113,14 +109,14 @@ public class CrawlParser {
}
}
if (StringUtils.isNotBlank(ruleBean.getUpadateTimePatten()) && StringUtils.isNotBlank(ruleBean.getUpadateTimeFormatPatten())) {
Pattern updateTimePatten = compile(ruleBean.getUpadateTimePatten());
if (StringUtils.isNotBlank(ruleBean.getUpdateTimePatten()) && StringUtils.isNotBlank(ruleBean.getUpdateTimeFormatPatten())) {
Pattern updateTimePatten = compile(ruleBean.getUpdateTimePatten());
Matcher updateTimeMatch = updateTimePatten.matcher(bookDetailHtml);
boolean isFindUpdateTime = updateTimeMatch.find();
if (isFindUpdateTime) {
String updateTime = updateTimeMatch.group(1);
//设置更新时间
book.setLastIndexUpdateTime(new SimpleDateFormat(ruleBean.getUpadateTimeFormatPatten()).parse(updateTime));
book.setLastIndexUpdateTime(new SimpleDateFormat(ruleBean.getUpdateTimeFormatPatten()).parse(updateTime));
}
}
@ -142,10 +138,7 @@ public class CrawlParser {
return book;
}
public static Map<Integer, List> parseBookIndexAndContent(String sourceBookId, Book book, RuleBean ruleBean, Map<Integer, BookIndex> hasIndexs) {
Map<Integer, List> result = new HashMap<>(2);
result.put(BOOK_INDEX_LIST_KEY, new ArrayList(0));
result.put(BOOK_CONTENT_LIST_KEY, new ArrayList(0));
public static ChapterBean parseBookIndexAndContent(String sourceBookId, Book book, RuleBean ruleBean, Map<Integer, BookIndex> existBookIndexMap) {
Date currentDate = new Date();
@ -171,11 +164,11 @@ public class CrawlParser {
int indexNum = 0;
//总字数
Integer totalWordCount = book.getWordCount() == null ? 0 : book.getWordCount();
int totalWordCount = book.getWordCount() == null ? 0 : book.getWordCount();
while (isFindIndex) {
BookIndex hasIndex = hasIndexs.get(indexNum);
BookIndex hasIndex = existBookIndexMap.get(indexNum);
String indexName = indexNameMatch.group(1);
if (hasIndex == null || !StringUtils.deleteWhitespace(hasIndex.getIndexName()).equals(StringUtils.deleteWhitespace(indexName))) {
@ -221,7 +214,7 @@ public class CrawlParser {
BookIndex bookIndex = new BookIndex();
bookIndex.setIndexName(indexName);
bookIndex.setIndexNum(indexNum);
Integer wordCount = StringUtil.getStrValidWordCount(content);
int wordCount = StringUtil.getStrValidWordCount(content);
bookIndex.setWordCount(wordCount);
indexList.add(bookIndex);
@ -277,15 +270,20 @@ public class CrawlParser {
if (indexList.size() == contentList.size() && indexList.size() > 0) {
result.put(BOOK_INDEX_LIST_KEY, indexList);
result.put(BOOK_CONTENT_LIST_KEY, contentList);
return new ChapterBean(){{
setBookIndexList(indexList);
setBookContentList(contentList);
}};
}
}
return result;
return new ChapterBean(){{
setBookIndexList(new ArrayList<>(0));
setBookContentList(new ArrayList<>(0));
}};
}
@ -294,6 +292,7 @@ public class CrawlParser {
ResponseEntity<String> forEntity = restTemplate.getForEntity(url, String.class);
if (forEntity.getStatusCode() == HttpStatus.OK) {
String body = forEntity.getBody();
assert body != null;
if (body.length() < Constants.INVALID_HTML_LENGTH) {
return processErrorHttpResult(url);
}

View File

@ -37,8 +37,8 @@ public class RuleBean {
private String visitCountPatten;
private String descStart;;
private String descEnd;
private String upadateTimePatten;
private String upadateTimeFormatPatten;
private String updateTimePatten;
private String updateTimeFormatPatten;
private String bookIndexUrl;
private String indexIdPatten;
private String indexNamePatten;

View File

@ -1,6 +1,7 @@
package com.java2nb.novel.core.listener;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.java2nb.novel.core.crawl.ChapterBean;
import com.java2nb.novel.core.crawl.CrawlParser;
import com.java2nb.novel.core.crawl.RuleBean;
import com.java2nb.novel.entity.*;
@ -16,9 +17,9 @@ import javax.servlet.ServletContextEvent;
import javax.servlet.ServletContextListener;
import javax.servlet.annotation.WebListener;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;
/**
* @author Administrator
@ -66,15 +67,15 @@ public class StarterListener implements ServletContextListener {
//查询已存在的章节
Map<Integer, BookIndex> existBookIndexMap = bookService.queryExistBookIndexMap(needUpdateBook.getId());
//解析章节目录
Map<Integer, List> indexAndContentList = CrawlParser.parseBookIndexAndContent(needUpdateBook.getCrawlBookId(), book, ruleBean, existBookIndexMap);
bookService.updateBookAndIndexAndContent(book, (List<BookIndex>) indexAndContentList.get(CrawlParser.BOOK_INDEX_LIST_KEY), (List<BookContent>) indexAndContentList.get(CrawlParser.BOOK_CONTENT_LIST_KEY), existBookIndexMap);
ChapterBean chapter = CrawlParser.parseBookIndexAndContent(needUpdateBook.getCrawlBookId(), book, ruleBean, existBookIndexMap);
bookService.updateBookAndIndexAndContent(book, chapter.getBookIndexList(), chapter.getBookContentList(), existBookIndexMap);
} catch (Exception e) {
log.error(e.getMessage(), e);
}
}
Thread.sleep(1000 * 60 * 10);
// 休眠10分钟
TimeUnit.MINUTES.sleep(10);
} catch (Exception e) {
log.error(e.getMessage(), e);
}
@ -107,7 +108,8 @@ public class StarterListener implements ServletContextListener {
}
Thread.sleep(1000 * 60);
//休眠1分钟
TimeUnit.MINUTES.sleep(1);
} catch (Exception e) {
log.error(e.getMessage(), e);

View File

@ -7,6 +7,7 @@ import com.github.pagehelper.PageInfo;
import com.java2nb.novel.core.bean.PageBean;
import com.java2nb.novel.core.cache.CacheKey;
import com.java2nb.novel.core.cache.CacheService;
import com.java2nb.novel.core.crawl.ChapterBean;
import com.java2nb.novel.core.crawl.CrawlParser;
import com.java2nb.novel.core.crawl.RuleBean;
import com.java2nb.novel.core.enums.ResponseStatus;
@ -303,9 +304,9 @@ public class CrawlServiceImpl implements CrawlService {
book.setCrawlLastTime(new Date());
book.setId(new IdWorker().nextId());
//解析章节目录
Map<Integer, List> indexAndContentList = CrawlParser.parseBookIndexAndContent(bookId, book, ruleBean, new HashMap<>(0));
ChapterBean chapter = CrawlParser.parseBookIndexAndContent(bookId, book, ruleBean, new HashMap<>(0));
bookService.saveBookAndIndexAndContent(book, (List<BookIndex>) indexAndContentList.get(CrawlParser.BOOK_INDEX_LIST_KEY), (List<BookContent>) indexAndContentList.get(CrawlParser.BOOK_CONTENT_LIST_KEY));
bookService.saveBookAndIndexAndContent(book, chapter.getBookIndexList(), chapter.getBookContentList());
} else {
//只更新书籍的爬虫相关字段