爬虫部分代码重构,准备适配TXT文本存储方案

This commit is contained in:
xiaoyang 2021-08-17 11:08:51 +08:00
parent 7f0331e095
commit cbfd0b049f
5 changed files with 89 additions and 55 deletions

View File

@ -0,0 +1,12 @@
package com.java2nb.novel.core.crawl;
import com.java2nb.novel.entity.Book;
/**
 * Callback for crawled novel chapter content.
 *
 * <p>Invoked by the crawl parser once a batch of chapter indexes and their
 * contents has been parsed, so the caller can persist or post-process the
 * result (e.g. write it to the database or to TXT storage).
 */
@FunctionalInterface
public interface CrawlBookChapterHandler {

    /**
     * Processes one parsed chapter batch.
     *
     * @param chapterBean the parsed chapter index list and content list;
     *                    the parser always passes a non-null bean (its lists
     *                    may be empty when nothing new was crawled)
     */
    void handle(ChapterBean chapterBean);
}

View File

@ -0,0 +1,12 @@
package com.java2nb.novel.core.crawl;
import com.java2nb.novel.entity.Book;
/**
 * Callback for crawled novel metadata.
 *
 * <p>Invoked by the crawl parser after a book's detail page has been parsed,
 * so the caller can enrich or persist the resulting {@link Book}.
 */
@FunctionalInterface
public interface CrawlBookHandler {

    /**
     * Processes one parsed book.
     *
     * @param book the book metadata extracted from the source site's detail
     *             page; individual fields may be null when the page did not
     *             yield them (callers are seen null-checking name/author)
     */
    void handle(Book book);
}

View File

@ -33,7 +33,7 @@ public class CrawlParser {
private static final ThreadLocal<Integer> retryCount = new ThreadLocal<>(); private static final ThreadLocal<Integer> retryCount = new ThreadLocal<>();
@SneakyThrows @SneakyThrows
public static Book parseBook(RuleBean ruleBean, String bookId) { public static void parseBook(RuleBean ruleBean, String bookId, CrawlBookHandler handler) {
Book book = new Book(); Book book = new Book();
String bookDetailUrl = ruleBean.getBookDetailUrl().replace("{bookId}", bookId); String bookDetailUrl = ruleBean.getBookDetailUrl().replace("{bookId}", bookId);
String bookDetailHtml = getByHttpClientWithChrome(bookDetailUrl); String bookDetailHtml = getByHttpClientWithChrome(bookDetailUrl);
@ -135,10 +135,10 @@ public class CrawlParser {
} }
} }
} }
return book; handler.handle(book);
} }
public static ChapterBean parseBookIndexAndContent(String sourceBookId, Book book, RuleBean ruleBean, Map<Integer, BookIndex> existBookIndexMap) { public static void parseBookIndexAndContent(String sourceBookId, Book book, RuleBean ruleBean, Map<Integer, BookIndex> existBookIndexMap, CrawlBookChapterHandler handler) {
Date currentDate = new Date(); Date currentDate = new Date();
@ -228,7 +228,7 @@ public class CrawlParser {
bookContent.setIndexId(hasIndex.getId()); bookContent.setIndexId(hasIndex.getId());
//计算总字数 //计算总字数
totalWordCount = (totalWordCount+wordCount-hasIndex.getWordCount()); totalWordCount = (totalWordCount + wordCount - hasIndex.getWordCount());
} else { } else {
//章节插入 //章节插入
//设置目录和章节内容 //设置目录和章节内容
@ -246,7 +246,6 @@ public class CrawlParser {
bookIndex.setUpdateTime(currentDate); bookIndex.setUpdateTime(currentDate);
} }
@ -259,7 +258,7 @@ public class CrawlParser {
if (indexList.size() > 0) { if (indexList.size() > 0) {
//如果有爬到最新章节则设置小说主表的最新章节信息 //如果有爬到最新章节则设置小说主表的最新章节信息
//获取爬取到的最新章节 //获取爬取到的最新章节
BookIndex lastIndex = indexList.get(indexList.size()-1); BookIndex lastIndex = indexList.get(indexList.size() - 1);
book.setLastIndexId(lastIndex.getId()); book.setLastIndexId(lastIndex.getId());
book.setLastIndexName(lastIndex.getIndexName()); book.setLastIndexName(lastIndex.getIndexName());
book.setLastIndexUpdateTime(currentDate); book.setLastIndexUpdateTime(currentDate);
@ -270,20 +269,22 @@ public class CrawlParser {
if (indexList.size() == contentList.size() && indexList.size() > 0) { if (indexList.size() == contentList.size() && indexList.size() > 0) {
return new ChapterBean(){{ handler.handle(new ChapterBean() {{
setBookIndexList(indexList); setBookIndexList(indexList);
setBookContentList(contentList); setBookContentList(contentList);
}}; }});
return;
} }
} }
handler.handle(new ChapterBean() {{
return new ChapterBean(){{
setBookIndexList(new ArrayList<>(0)); setBookIndexList(new ArrayList<>(0));
setBookContentList(new ArrayList<>(0)); setBookContentList(new ArrayList<>(0));
}}; }});
} }

View File

@ -56,19 +56,21 @@ public class StarterListener implements ServletContextListener {
CrawlSource source = crawlService.queryCrawlSource(needUpdateBook.getCrawlSourceId()); CrawlSource source = crawlService.queryCrawlSource(needUpdateBook.getCrawlSourceId());
RuleBean ruleBean = new ObjectMapper().readValue(source.getCrawlRule(), RuleBean.class); RuleBean ruleBean = new ObjectMapper().readValue(source.getCrawlRule(), RuleBean.class);
//解析小说基本信息 //解析小说基本信息
Book book = CrawlParser.parseBook(ruleBean, needUpdateBook.getCrawlBookId()); CrawlParser.parseBook(ruleBean, needUpdateBook.getCrawlBookId(),book -> {
//这里只做老书更新 //这里只做老书更新
book.setId(needUpdateBook.getId()); book.setId(needUpdateBook.getId());
book.setWordCount(needUpdateBook.getWordCount()); book.setWordCount(needUpdateBook.getWordCount());
if (needUpdateBook.getPicUrl() != null && needUpdateBook.getPicUrl().contains(Constants.LOCAL_PIC_PREFIX)) { if (needUpdateBook.getPicUrl() != null && needUpdateBook.getPicUrl().contains(Constants.LOCAL_PIC_PREFIX)) {
//本地图片则不更新 //本地图片则不更新
book.setPicUrl(null); book.setPicUrl(null);
} }
//查询已存在的章节 //查询已存在的章节
Map<Integer, BookIndex> existBookIndexMap = bookService.queryExistBookIndexMap(needUpdateBook.getId()); Map<Integer, BookIndex> existBookIndexMap = bookService.queryExistBookIndexMap(needUpdateBook.getId());
//解析章节目录 //解析章节目录
ChapterBean chapter = CrawlParser.parseBookIndexAndContent(needUpdateBook.getCrawlBookId(), book, ruleBean, existBookIndexMap); CrawlParser.parseBookIndexAndContent(needUpdateBook.getCrawlBookId(), book, ruleBean, existBookIndexMap,chapter -> {
bookService.updateBookAndIndexAndContent(book, chapter.getBookIndexList(), chapter.getBookContentList(), existBookIndexMap); bookService.updateBookAndIndexAndContent(book, chapter.getBookIndexList(), chapter.getBookContentList(), existBookIndexMap);
});
});
} catch (Exception e) { } catch (Exception e) {
log.error(e.getMessage(), e); log.error(e.getMessage(), e);
} }

View File

@ -5,7 +5,6 @@ import com.github.pagehelper.PageHelper;
import com.java2nb.novel.core.bean.PageBean; import com.java2nb.novel.core.bean.PageBean;
import com.java2nb.novel.core.cache.CacheKey; import com.java2nb.novel.core.cache.CacheKey;
import com.java2nb.novel.core.cache.CacheService; import com.java2nb.novel.core.cache.CacheService;
import com.java2nb.novel.core.crawl.ChapterBean;
import com.java2nb.novel.core.crawl.CrawlParser; import com.java2nb.novel.core.crawl.CrawlParser;
import com.java2nb.novel.core.crawl.RuleBean; import com.java2nb.novel.core.crawl.RuleBean;
import com.java2nb.novel.core.enums.ResponseStatus; import com.java2nb.novel.core.enums.ResponseStatus;
@ -34,6 +33,7 @@ import org.mybatis.dynamic.sql.select.render.SelectStatementProvider;
import org.springframework.stereotype.Service; import org.springframework.stereotype.Service;
import java.util.*; import java.util.*;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.regex.Matcher; import java.util.regex.Matcher;
import java.util.regex.Pattern; import java.util.regex.Pattern;
@ -274,39 +274,46 @@ public class CrawlServiceImpl implements CrawlService {
@Override @Override
public boolean parseBookAndSave(int catId, RuleBean ruleBean, Integer sourceId, String bookId) { public boolean parseBookAndSave(int catId, RuleBean ruleBean, Integer sourceId, String bookId) {
Book book = CrawlParser.parseBook(ruleBean, bookId);
if(book.getBookName() == null || book.getAuthorName() == null){ final AtomicBoolean parseResult = new AtomicBoolean(false);
return false;
} CrawlParser.parseBook(ruleBean, bookId, book -> {
//这里只做新书入库查询是否存在这本书 if(book.getBookName() == null || book.getAuthorName() == null){
Book existBook = bookService.queryBookByBookNameAndAuthorName(book.getBookName(), book.getAuthorName()); return;
//如果该小说不存在则可以解析入库但是标记该小说正在入库30分钟之后才允许再次入库
if (existBook == null) {
//没有该书可以入库
book.setCatId(catId);
//根据分类ID查询分类
book.setCatName(bookService.queryCatNameByCatId(catId));
if (catId == 7) {
//女频
book.setWorkDirection((byte) 1);
} else {
//男频
book.setWorkDirection((byte) 0);
} }
book.setCrawlBookId(bookId); //这里只做新书入库查询是否存在这本书
book.setCrawlSourceId(sourceId); Book existBook = bookService.queryBookByBookNameAndAuthorName(book.getBookName(), book.getAuthorName());
book.setCrawlLastTime(new Date()); //如果该小说不存在则可以解析入库但是标记该小说正在入库30分钟之后才允许再次入库
book.setId(new IdWorker().nextId()); if (existBook == null) {
//解析章节目录 //没有该书可以入库
ChapterBean chapter = CrawlParser.parseBookIndexAndContent(bookId, book, ruleBean, new HashMap<>(0)); book.setCatId(catId);
//根据分类ID查询分类
book.setCatName(bookService.queryCatNameByCatId(catId));
if (catId == 7) {
//女频
book.setWorkDirection((byte) 1);
} else {
//男频
book.setWorkDirection((byte) 0);
}
book.setCrawlBookId(bookId);
book.setCrawlSourceId(sourceId);
book.setCrawlLastTime(new Date());
book.setId(new IdWorker().nextId());
//解析章节目录
CrawlParser.parseBookIndexAndContent(bookId, book, ruleBean, new HashMap<>(0),chapter -> {
bookService.saveBookAndIndexAndContent(book, chapter.getBookIndexList(), chapter.getBookContentList());
});
bookService.saveBookAndIndexAndContent(book, chapter.getBookIndexList(), chapter.getBookContentList()); } else {
//只更新书籍的爬虫相关字段
bookService.updateCrawlProperties(existBook.getId(), sourceId, bookId);
}
parseResult.set(true);
});
return parseResult.get();
} else {
//只更新书籍的爬虫相关字段
bookService.updateCrawlProperties(existBook.getId(), sourceId, bookId);
}
return true;
} }
@Override @Override