mirror of
https://github.com/201206030/novel-plus.git
synced 2025-04-27 01:30:51 +00:00
爬虫部分代码重构,准备适配TXT文本存储方案
This commit is contained in:
parent
7f0331e095
commit
cbfd0b049f
@ -0,0 +1,12 @@
|
|||||||
|
package com.java2nb.novel.core.crawl;
|
||||||
|
|
||||||
|
import com.java2nb.novel.entity.Book;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Handler for the chapter content of a crawled novel.
|
||||||
|
* */
|
||||||
|
public interface CrawlBookChapterHandler {
|
||||||
|
|
||||||
|
// Callback that receives the ChapterBean (index list + content list) produced by chapter parsing.
void handle(ChapterBean chapterBean);
|
||||||
|
|
||||||
|
}
|
@ -0,0 +1,12 @@
|
|||||||
|
package com.java2nb.novel.core.crawl;
|
||||||
|
|
||||||
|
import com.java2nb.novel.entity.Book;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Handler for a crawled novel.
|
||||||
|
* */
|
||||||
|
public interface CrawlBookHandler {
|
||||||
|
|
||||||
|
// Callback that receives the Book object built by the parser.
void handle(Book book);
|
||||||
|
|
||||||
|
}
|
@ -33,7 +33,7 @@ public class CrawlParser {
|
|||||||
private static final ThreadLocal<Integer> retryCount = new ThreadLocal<>();
|
private static final ThreadLocal<Integer> retryCount = new ThreadLocal<>();
|
||||||
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public static Book parseBook(RuleBean ruleBean, String bookId) {
|
public static void parseBook(RuleBean ruleBean, String bookId, CrawlBookHandler handler) {
|
||||||
Book book = new Book();
|
Book book = new Book();
|
||||||
String bookDetailUrl = ruleBean.getBookDetailUrl().replace("{bookId}", bookId);
|
String bookDetailUrl = ruleBean.getBookDetailUrl().replace("{bookId}", bookId);
|
||||||
String bookDetailHtml = getByHttpClientWithChrome(bookDetailUrl);
|
String bookDetailHtml = getByHttpClientWithChrome(bookDetailUrl);
|
||||||
@ -135,10 +135,10 @@ public class CrawlParser {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return book;
|
handler.handle(book);
|
||||||
}
|
}
|
||||||
|
|
||||||
public static ChapterBean parseBookIndexAndContent(String sourceBookId, Book book, RuleBean ruleBean, Map<Integer, BookIndex> existBookIndexMap) {
|
public static void parseBookIndexAndContent(String sourceBookId, Book book, RuleBean ruleBean, Map<Integer, BookIndex> existBookIndexMap, CrawlBookChapterHandler handler) {
|
||||||
|
|
||||||
Date currentDate = new Date();
|
Date currentDate = new Date();
|
||||||
|
|
||||||
@ -228,7 +228,7 @@ public class CrawlParser {
|
|||||||
bookContent.setIndexId(hasIndex.getId());
|
bookContent.setIndexId(hasIndex.getId());
|
||||||
|
|
||||||
//计算总字数
|
//计算总字数
|
||||||
totalWordCount = (totalWordCount+wordCount-hasIndex.getWordCount());
|
totalWordCount = (totalWordCount + wordCount - hasIndex.getWordCount());
|
||||||
} else {
|
} else {
|
||||||
//章节插入
|
//章节插入
|
||||||
//设置目录和章节内容
|
//设置目录和章节内容
|
||||||
@ -246,7 +246,6 @@ public class CrawlParser {
|
|||||||
bookIndex.setUpdateTime(currentDate);
|
bookIndex.setUpdateTime(currentDate);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -259,7 +258,7 @@ public class CrawlParser {
|
|||||||
if (indexList.size() > 0) {
|
if (indexList.size() > 0) {
|
||||||
//如果有爬到最新章节,则设置小说主表的最新章节信息
|
//如果有爬到最新章节,则设置小说主表的最新章节信息
|
||||||
//获取爬取到的最新章节
|
//获取爬取到的最新章节
|
||||||
BookIndex lastIndex = indexList.get(indexList.size()-1);
|
BookIndex lastIndex = indexList.get(indexList.size() - 1);
|
||||||
book.setLastIndexId(lastIndex.getId());
|
book.setLastIndexId(lastIndex.getId());
|
||||||
book.setLastIndexName(lastIndex.getIndexName());
|
book.setLastIndexName(lastIndex.getIndexName());
|
||||||
book.setLastIndexUpdateTime(currentDate);
|
book.setLastIndexUpdateTime(currentDate);
|
||||||
@ -270,20 +269,22 @@ public class CrawlParser {
|
|||||||
|
|
||||||
if (indexList.size() == contentList.size() && indexList.size() > 0) {
|
if (indexList.size() == contentList.size() && indexList.size() > 0) {
|
||||||
|
|
||||||
return new ChapterBean(){{
|
handler.handle(new ChapterBean() {{
|
||||||
setBookIndexList(indexList);
|
setBookIndexList(indexList);
|
||||||
setBookContentList(contentList);
|
setBookContentList(contentList);
|
||||||
}};
|
}});
|
||||||
|
|
||||||
|
return;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
handler.handle(new ChapterBean() {{
|
||||||
return new ChapterBean(){{
|
|
||||||
setBookIndexList(new ArrayList<>(0));
|
setBookIndexList(new ArrayList<>(0));
|
||||||
setBookContentList(new ArrayList<>(0));
|
setBookContentList(new ArrayList<>(0));
|
||||||
}};
|
}});
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -56,19 +56,21 @@ public class StarterListener implements ServletContextListener {
|
|||||||
CrawlSource source = crawlService.queryCrawlSource(needUpdateBook.getCrawlSourceId());
|
CrawlSource source = crawlService.queryCrawlSource(needUpdateBook.getCrawlSourceId());
|
||||||
RuleBean ruleBean = new ObjectMapper().readValue(source.getCrawlRule(), RuleBean.class);
|
RuleBean ruleBean = new ObjectMapper().readValue(source.getCrawlRule(), RuleBean.class);
|
||||||
//解析小说基本信息
|
//解析小说基本信息
|
||||||
Book book = CrawlParser.parseBook(ruleBean, needUpdateBook.getCrawlBookId());
|
CrawlParser.parseBook(ruleBean, needUpdateBook.getCrawlBookId(),book -> {
|
||||||
//这里只做老书更新
|
//这里只做老书更新
|
||||||
book.setId(needUpdateBook.getId());
|
book.setId(needUpdateBook.getId());
|
||||||
book.setWordCount(needUpdateBook.getWordCount());
|
book.setWordCount(needUpdateBook.getWordCount());
|
||||||
if (needUpdateBook.getPicUrl() != null && needUpdateBook.getPicUrl().contains(Constants.LOCAL_PIC_PREFIX)) {
|
if (needUpdateBook.getPicUrl() != null && needUpdateBook.getPicUrl().contains(Constants.LOCAL_PIC_PREFIX)) {
|
||||||
//本地图片则不更新
|
//本地图片则不更新
|
||||||
book.setPicUrl(null);
|
book.setPicUrl(null);
|
||||||
}
|
}
|
||||||
//查询已存在的章节
|
//查询已存在的章节
|
||||||
Map<Integer, BookIndex> existBookIndexMap = bookService.queryExistBookIndexMap(needUpdateBook.getId());
|
Map<Integer, BookIndex> existBookIndexMap = bookService.queryExistBookIndexMap(needUpdateBook.getId());
|
||||||
//解析章节目录
|
//解析章节目录
|
||||||
ChapterBean chapter = CrawlParser.parseBookIndexAndContent(needUpdateBook.getCrawlBookId(), book, ruleBean, existBookIndexMap);
|
CrawlParser.parseBookIndexAndContent(needUpdateBook.getCrawlBookId(), book, ruleBean, existBookIndexMap,chapter -> {
|
||||||
bookService.updateBookAndIndexAndContent(book, chapter.getBookIndexList(), chapter.getBookContentList(), existBookIndexMap);
|
bookService.updateBookAndIndexAndContent(book, chapter.getBookIndexList(), chapter.getBookContentList(), existBookIndexMap);
|
||||||
|
});
|
||||||
|
});
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
log.error(e.getMessage(), e);
|
log.error(e.getMessage(), e);
|
||||||
}
|
}
|
||||||
|
@ -5,7 +5,6 @@ import com.github.pagehelper.PageHelper;
|
|||||||
import com.java2nb.novel.core.bean.PageBean;
|
import com.java2nb.novel.core.bean.PageBean;
|
||||||
import com.java2nb.novel.core.cache.CacheKey;
|
import com.java2nb.novel.core.cache.CacheKey;
|
||||||
import com.java2nb.novel.core.cache.CacheService;
|
import com.java2nb.novel.core.cache.CacheService;
|
||||||
import com.java2nb.novel.core.crawl.ChapterBean;
|
|
||||||
import com.java2nb.novel.core.crawl.CrawlParser;
|
import com.java2nb.novel.core.crawl.CrawlParser;
|
||||||
import com.java2nb.novel.core.crawl.RuleBean;
|
import com.java2nb.novel.core.crawl.RuleBean;
|
||||||
import com.java2nb.novel.core.enums.ResponseStatus;
|
import com.java2nb.novel.core.enums.ResponseStatus;
|
||||||
@ -34,6 +33,7 @@ import org.mybatis.dynamic.sql.select.render.SelectStatementProvider;
|
|||||||
import org.springframework.stereotype.Service;
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
import java.util.concurrent.atomic.AtomicBoolean;
|
||||||
import java.util.regex.Matcher;
|
import java.util.regex.Matcher;
|
||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
@ -274,39 +274,46 @@ public class CrawlServiceImpl implements CrawlService {
|
|||||||
|
|
||||||
@Override
|
@Override
|
||||||
public boolean parseBookAndSave(int catId, RuleBean ruleBean, Integer sourceId, String bookId) {
|
public boolean parseBookAndSave(int catId, RuleBean ruleBean, Integer sourceId, String bookId) {
|
||||||
Book book = CrawlParser.parseBook(ruleBean, bookId);
|
|
||||||
if(book.getBookName() == null || book.getAuthorName() == null){
|
final AtomicBoolean parseResult = new AtomicBoolean(false);
|
||||||
return false;
|
|
||||||
}
|
CrawlParser.parseBook(ruleBean, bookId, book -> {
|
||||||
//这里只做新书入库,查询是否存在这本书
|
if(book.getBookName() == null || book.getAuthorName() == null){
|
||||||
Book existBook = bookService.queryBookByBookNameAndAuthorName(book.getBookName(), book.getAuthorName());
|
return;
|
||||||
//如果该小说不存在,则可以解析入库,但是标记该小说正在入库,30分钟之后才允许再次入库
|
|
||||||
if (existBook == null) {
|
|
||||||
//没有该书,可以入库
|
|
||||||
book.setCatId(catId);
|
|
||||||
//根据分类ID查询分类
|
|
||||||
book.setCatName(bookService.queryCatNameByCatId(catId));
|
|
||||||
if (catId == 7) {
|
|
||||||
//女频
|
|
||||||
book.setWorkDirection((byte) 1);
|
|
||||||
} else {
|
|
||||||
//男频
|
|
||||||
book.setWorkDirection((byte) 0);
|
|
||||||
}
|
}
|
||||||
book.setCrawlBookId(bookId);
|
//这里只做新书入库,查询是否存在这本书
|
||||||
book.setCrawlSourceId(sourceId);
|
Book existBook = bookService.queryBookByBookNameAndAuthorName(book.getBookName(), book.getAuthorName());
|
||||||
book.setCrawlLastTime(new Date());
|
//如果该小说不存在,则可以解析入库,但是标记该小说正在入库,30分钟之后才允许再次入库
|
||||||
book.setId(new IdWorker().nextId());
|
if (existBook == null) {
|
||||||
//解析章节目录
|
//没有该书,可以入库
|
||||||
ChapterBean chapter = CrawlParser.parseBookIndexAndContent(bookId, book, ruleBean, new HashMap<>(0));
|
book.setCatId(catId);
|
||||||
|
//根据分类ID查询分类
|
||||||
|
book.setCatName(bookService.queryCatNameByCatId(catId));
|
||||||
|
if (catId == 7) {
|
||||||
|
//女频
|
||||||
|
book.setWorkDirection((byte) 1);
|
||||||
|
} else {
|
||||||
|
//男频
|
||||||
|
book.setWorkDirection((byte) 0);
|
||||||
|
}
|
||||||
|
book.setCrawlBookId(bookId);
|
||||||
|
book.setCrawlSourceId(sourceId);
|
||||||
|
book.setCrawlLastTime(new Date());
|
||||||
|
book.setId(new IdWorker().nextId());
|
||||||
|
//解析章节目录
|
||||||
|
CrawlParser.parseBookIndexAndContent(bookId, book, ruleBean, new HashMap<>(0),chapter -> {
|
||||||
|
bookService.saveBookAndIndexAndContent(book, chapter.getBookIndexList(), chapter.getBookContentList());
|
||||||
|
});
|
||||||
|
|
||||||
bookService.saveBookAndIndexAndContent(book, chapter.getBookIndexList(), chapter.getBookContentList());
|
} else {
|
||||||
|
//只更新书籍的爬虫相关字段
|
||||||
|
bookService.updateCrawlProperties(existBook.getId(), sourceId, bookId);
|
||||||
|
}
|
||||||
|
parseResult.set(true);
|
||||||
|
});
|
||||||
|
|
||||||
|
return parseResult.get();
|
||||||
|
|
||||||
} else {
|
|
||||||
//只更新书籍的爬虫相关字段
|
|
||||||
bookService.updateCrawlProperties(existBook.getId(), sourceId, bookId);
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
Loading…
x
Reference in New Issue
Block a user