mirror of
https://github.com/201206030/novel-plus.git
synced 2025-04-27 01:30:51 +00:00
优化章节字数算法,优化爬虫代码
This commit is contained in:
parent
1046a7ffc1
commit
0a10504461
@ -1,9 +1,6 @@
|
|||||||
package com.java2nb.novel.core.crawl;
|
package com.java2nb.novel.core.crawl;
|
||||||
|
|
||||||
import com.java2nb.novel.core.utils.HttpUtil;
|
import com.java2nb.novel.core.utils.*;
|
||||||
import com.java2nb.novel.core.utils.IdWorker;
|
|
||||||
import com.java2nb.novel.core.utils.RandomBookInfoUtil;
|
|
||||||
import com.java2nb.novel.core.utils.RestTemplateUtil;
|
|
||||||
import com.java2nb.novel.entity.Book;
|
import com.java2nb.novel.entity.Book;
|
||||||
import com.java2nb.novel.entity.BookContent;
|
import com.java2nb.novel.entity.BookContent;
|
||||||
import com.java2nb.novel.entity.BookIndex;
|
import com.java2nb.novel.entity.BookIndex;
|
||||||
@ -37,7 +34,7 @@ public class CrawlParser {
|
|||||||
|
|
||||||
private static RestTemplate restTemplate = RestTemplateUtil.getInstance("utf-8");
|
private static RestTemplate restTemplate = RestTemplateUtil.getInstance("utf-8");
|
||||||
|
|
||||||
private static ThreadLocal <Integer> retryCount = new ThreadLocal<>();
|
private static ThreadLocal<Integer> retryCount = new ThreadLocal<>();
|
||||||
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public static Book parseBook(RuleBean ruleBean, String bookId) {
|
public static Book parseBook(RuleBean ruleBean, String bookId) {
|
||||||
@ -65,7 +62,7 @@ public class CrawlParser {
|
|||||||
boolean isFindPicUrl = picUrlMatch.find();
|
boolean isFindPicUrl = picUrlMatch.find();
|
||||||
if (isFindPicUrl) {
|
if (isFindPicUrl) {
|
||||||
String picUrl = picUrlMatch.group(1);
|
String picUrl = picUrlMatch.group(1);
|
||||||
if(StringUtils.isNotBlank(picUrl) && StringUtils.isNotBlank(ruleBean.getPicUrlPrefix())) {
|
if (StringUtils.isNotBlank(picUrl) && StringUtils.isNotBlank(ruleBean.getPicUrlPrefix())) {
|
||||||
picUrl = ruleBean.getPicUrlPrefix() + picUrl;
|
picUrl = ruleBean.getPicUrlPrefix() + picUrl;
|
||||||
}
|
}
|
||||||
//设置封面图片路径
|
//设置封面图片路径
|
||||||
@ -96,11 +93,11 @@ public class CrawlParser {
|
|||||||
String desc = bookDetailHtml.substring(bookDetailHtml.indexOf(ruleBean.getDescStart()) + ruleBean.getDescStart().length());
|
String desc = bookDetailHtml.substring(bookDetailHtml.indexOf(ruleBean.getDescStart()) + ruleBean.getDescStart().length());
|
||||||
desc = desc.substring(0, desc.indexOf(ruleBean.getDescEnd()));
|
desc = desc.substring(0, desc.indexOf(ruleBean.getDescEnd()));
|
||||||
//过滤掉简介中的特殊标签
|
//过滤掉简介中的特殊标签
|
||||||
desc = desc.replaceAll("<a[^<]+</a>","")
|
desc = desc.replaceAll("<a[^<]+</a>", "")
|
||||||
.replaceAll("<font[^<]+</font>","")
|
.replaceAll("<font[^<]+</font>", "")
|
||||||
.replaceAll("<p>\\s*</p>","")
|
.replaceAll("<p>\\s*</p>", "")
|
||||||
.replaceAll("<p>","")
|
.replaceAll("<p>", "")
|
||||||
.replaceAll("</p>","<br/>");
|
.replaceAll("</p>", "<br/>");
|
||||||
//设置书籍简介
|
//设置书籍简介
|
||||||
book.setBookDesc(desc);
|
book.setBookDesc(desc);
|
||||||
if (StringUtils.isNotBlank(ruleBean.getStatusPatten())) {
|
if (StringUtils.isNotBlank(ruleBean.getStatusPatten())) {
|
||||||
@ -146,9 +143,9 @@ public class CrawlParser {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public static Map<Integer, List> parseBookIndexAndContent(String sourceBookId, Book book, RuleBean ruleBean, Map<Integer, BookIndex> hasIndexs) {
|
public static Map<Integer, List> parseBookIndexAndContent(String sourceBookId, Book book, RuleBean ruleBean, Map<Integer, BookIndex> hasIndexs) {
|
||||||
Map<Integer,List> result = new HashMap<>(2);
|
Map<Integer, List> result = new HashMap<>(2);
|
||||||
result.put(BOOK_INDEX_LIST_KEY,new ArrayList(0));
|
result.put(BOOK_INDEX_LIST_KEY, new ArrayList(0));
|
||||||
result.put(BOOK_CONTENT_LIST_KEY,new ArrayList(0));
|
result.put(BOOK_CONTENT_LIST_KEY, new ArrayList(0));
|
||||||
|
|
||||||
Date currentDate = new Date();
|
Date currentDate = new Date();
|
||||||
|
|
||||||
@ -159,7 +156,7 @@ public class CrawlParser {
|
|||||||
String indexListHtml = getByHttpClientWithChrome(indexListUrl);
|
String indexListHtml = getByHttpClientWithChrome(indexListUrl);
|
||||||
|
|
||||||
if (indexListHtml != null) {
|
if (indexListHtml != null) {
|
||||||
if(StringUtils.isNotBlank(ruleBean.getBookIndexStart())){
|
if (StringUtils.isNotBlank(ruleBean.getBookIndexStart())) {
|
||||||
indexListHtml = indexListHtml.substring(indexListHtml.indexOf(ruleBean.getBookIndexStart()) + ruleBean.getBookIndexStart().length());
|
indexListHtml = indexListHtml.substring(indexListHtml.indexOf(ruleBean.getBookIndexStart()) + ruleBean.getBookIndexStart().length());
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -189,12 +186,12 @@ public class CrawlParser {
|
|||||||
String sourceIndexId = indexIdMatch.group(1);
|
String sourceIndexId = indexIdMatch.group(1);
|
||||||
String bookContentUrl = ruleBean.getBookContentUrl();
|
String bookContentUrl = ruleBean.getBookContentUrl();
|
||||||
int calStart = bookContentUrl.indexOf("{cal_");
|
int calStart = bookContentUrl.indexOf("{cal_");
|
||||||
if(calStart != -1){
|
if (calStart != -1) {
|
||||||
//内容页URL需要进行计算才能得到
|
//内容页URL需要进行计算才能得到
|
||||||
String calStr = bookContentUrl.substring(calStart,calStart+bookContentUrl.substring(calStart).indexOf("}"));
|
String calStr = bookContentUrl.substring(calStart, calStart + bookContentUrl.substring(calStart).indexOf("}"));
|
||||||
String[] calArr = calStr.split("_");
|
String[] calArr = calStr.split("_");
|
||||||
int calType = Integer.parseInt(calArr[1]);
|
int calType = Integer.parseInt(calArr[1]);
|
||||||
if(calType == 1) {
|
if (calType == 1) {
|
||||||
///{cal_1_1_3}_{bookId}/{indexId}.html
|
///{cal_1_1_3}_{bookId}/{indexId}.html
|
||||||
//第一种计算规则,去除第x个参数的最后y个字母
|
//第一种计算规则,去除第x个参数的最后y个字母
|
||||||
int x = Integer.parseInt(calArr[2]);
|
int x = Integer.parseInt(calArr[2]);
|
||||||
@ -206,12 +203,12 @@ public class CrawlParser {
|
|||||||
calResult = sourceIndexId.substring(0, sourceBookId.length() - y);
|
calResult = sourceIndexId.substring(0, sourceBookId.length() - y);
|
||||||
}
|
}
|
||||||
|
|
||||||
if(calResult.length() == 0){
|
if (calResult.length() == 0) {
|
||||||
calResult = "0";
|
calResult = "0";
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
bookContentUrl = bookContentUrl.replace(calStr+"}", calResult);
|
bookContentUrl = bookContentUrl.replace(calStr + "}", calResult);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
@ -223,52 +220,40 @@ public class CrawlParser {
|
|||||||
if (contentHtml != null && !contentHtml.contains("正在手打中")) {
|
if (contentHtml != null && !contentHtml.contains("正在手打中")) {
|
||||||
String content = contentHtml.substring(contentHtml.indexOf(ruleBean.getContentStart()) + ruleBean.getContentStart().length());
|
String content = contentHtml.substring(contentHtml.indexOf(ruleBean.getContentStart()) + ruleBean.getContentStart().length());
|
||||||
content = content.substring(0, content.indexOf(ruleBean.getContentEnd()));
|
content = content.substring(0, content.indexOf(ruleBean.getContentEnd()));
|
||||||
//TODO插入章节目录和章节内容
|
//插入章节目录和章节内容
|
||||||
BookIndex bookIndex = new BookIndex();
|
BookIndex bookIndex = new BookIndex();
|
||||||
|
|
||||||
bookIndex.setIndexName(indexName);
|
bookIndex.setIndexName(indexName);
|
||||||
bookIndex.setIndexNum(indexNum);
|
bookIndex.setIndexNum(indexNum);
|
||||||
|
Integer wordCount = StringUtil.getStrValidWordCount(content);
|
||||||
|
bookIndex.setWordCount(wordCount);
|
||||||
indexList.add(bookIndex);
|
indexList.add(bookIndex);
|
||||||
BookContent bookContent = new BookContent();
|
|
||||||
|
|
||||||
|
BookContent bookContent = new BookContent();
|
||||||
bookContent.setContent(content);
|
bookContent.setContent(content);
|
||||||
contentList.add(bookContent);
|
contentList.add(bookContent);
|
||||||
|
|
||||||
//判断是新增还是更新
|
|
||||||
if(hasIndexs.size() == 0){
|
if (hasIndex != null) {
|
||||||
//新书入库
|
//章节更新
|
||||||
|
bookIndex.setId(hasIndex.getId());
|
||||||
|
bookContent.setIndexId(hasIndex.getId());
|
||||||
|
} else {
|
||||||
|
//章节插入
|
||||||
//设置目录和章节内容
|
//设置目录和章节内容
|
||||||
Long indexId = idWorker.nextId();
|
Long indexId = idWorker.nextId();
|
||||||
lastIndexId = indexId;
|
lastIndexId = indexId;
|
||||||
lastIndexName = indexName;
|
lastIndexName = indexName;
|
||||||
bookIndex.setId(indexId);
|
bookIndex.setId(indexId);
|
||||||
bookIndex.setBookId(book.getId());
|
bookIndex.setBookId(book.getId());
|
||||||
Integer wordCount = bookContent.getContent().length();
|
|
||||||
totalWordCount += wordCount;
|
|
||||||
bookIndex.setWordCount(wordCount);
|
|
||||||
bookIndex.setCreateTime(currentDate);
|
bookIndex.setCreateTime(currentDate);
|
||||||
bookIndex.setUpdateTime(currentDate);
|
|
||||||
|
|
||||||
bookContent.setIndexId(indexId);
|
bookContent.setIndexId(indexId);
|
||||||
|
|
||||||
//设置小说基础信息
|
|
||||||
book.setWordCount(totalWordCount);
|
|
||||||
book.setLastIndexId(lastIndexId);
|
|
||||||
book.setLastIndexName(lastIndexName);
|
|
||||||
book.setLastIndexUpdateTime(currentDate);
|
|
||||||
book.setCreateTime(currentDate);
|
|
||||||
book.setUpdateTime(currentDate);
|
|
||||||
|
|
||||||
}else{
|
|
||||||
//老书更新
|
|
||||||
}
|
}
|
||||||
|
bookIndex.setUpdateTime(currentDate);
|
||||||
|
|
||||||
|
//计算总字数
|
||||||
|
totalWordCount += wordCount;
|
||||||
if(hasIndex != null){
|
|
||||||
bookIndex.setId(hasIndex.getId());
|
|
||||||
bookContent.setIndexId(hasIndex.getId());
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
@ -279,15 +264,30 @@ public class CrawlParser {
|
|||||||
isFindIndex = indexIdMatch.find() & indexNameMatch.find();
|
isFindIndex = indexIdMatch.find() & indexNameMatch.find();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//判断是新书入库还是老书更新
|
||||||
|
if (hasIndexs.size() == 0) {
|
||||||
|
//新书入库
|
||||||
|
|
||||||
|
//设置小说基础信息
|
||||||
|
book.setWordCount(totalWordCount);
|
||||||
|
book.setLastIndexId(lastIndexId);
|
||||||
|
book.setLastIndexName(lastIndexName);
|
||||||
|
book.setLastIndexUpdateTime(currentDate);
|
||||||
|
book.setCreateTime(currentDate);
|
||||||
|
|
||||||
|
}
|
||||||
|
book.setUpdateTime(currentDate);
|
||||||
|
|
||||||
if (indexList.size() == contentList.size() && indexList.size() > 0) {
|
if (indexList.size() == contentList.size() && indexList.size() > 0) {
|
||||||
|
|
||||||
result.put(BOOK_INDEX_LIST_KEY,indexList);
|
result.put(BOOK_INDEX_LIST_KEY, indexList);
|
||||||
result.put(BOOK_CONTENT_LIST_KEY,contentList);
|
result.put(BOOK_CONTENT_LIST_KEY, contentList);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -297,7 +297,7 @@ public class CrawlParser {
|
|||||||
ResponseEntity<String> forEntity = restTemplate.getForEntity(url, String.class);
|
ResponseEntity<String> forEntity = restTemplate.getForEntity(url, String.class);
|
||||||
if (forEntity.getStatusCode() == HttpStatus.OK) {
|
if (forEntity.getStatusCode() == HttpStatus.OK) {
|
||||||
String body = forEntity.getBody();
|
String body = forEntity.getBody();
|
||||||
if(body.length() < Constants.INVALID_HTML_LENGTH){
|
if (body.length() < Constants.INVALID_HTML_LENGTH) {
|
||||||
return processErrorHttpResult(url);
|
return processErrorHttpResult(url);
|
||||||
}
|
}
|
||||||
//成功获得html内容
|
//成功获得html内容
|
||||||
@ -314,7 +314,7 @@ public class CrawlParser {
|
|||||||
try {
|
try {
|
||||||
|
|
||||||
String body = HttpUtil.getByHttpClientWithChrome(url);
|
String body = HttpUtil.getByHttpClientWithChrome(url);
|
||||||
if(body != null && body.length() < Constants.INVALID_HTML_LENGTH){
|
if (body != null && body.length() < Constants.INVALID_HTML_LENGTH) {
|
||||||
return processErrorHttpResult(url);
|
return processErrorHttpResult(url);
|
||||||
}
|
}
|
||||||
//成功获得html内容
|
//成功获得html内容
|
||||||
@ -327,13 +327,13 @@ public class CrawlParser {
|
|||||||
}
|
}
|
||||||
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
private static String processErrorHttpResult(String url){
|
private static String processErrorHttpResult(String url) {
|
||||||
Integer count = retryCount.get();
|
Integer count = retryCount.get();
|
||||||
if(count == null){
|
if (count == null) {
|
||||||
count = 0;
|
count = 0;
|
||||||
}
|
}
|
||||||
if(count < Constants.HTTP_FAIL_RETRY_COUNT){
|
if (count < Constants.HTTP_FAIL_RETRY_COUNT) {
|
||||||
Thread.sleep( new Random().nextInt(10*1000));
|
Thread.sleep(new Random().nextInt(10 * 1000));
|
||||||
retryCount.set(++count);
|
retryCount.set(++count);
|
||||||
return getByHttpClient(url);
|
return getByHttpClient(url);
|
||||||
}
|
}
|
||||||
|
@ -65,8 +65,7 @@ public interface BookService {
|
|||||||
* @param book 小说数据
|
* @param book 小说数据
|
||||||
* @param bookIndexList 目录集合
|
* @param bookIndexList 目录集合
|
||||||
* @param bookContentList 内容集合
|
* @param bookContentList 内容集合
|
||||||
* @param existBookIndexMap 已存在的章节Map
|
* @param existBookIndexMap 已存在的章节Map */
|
||||||
* */
|
|
||||||
void updateBookAndIndexAndContent(Book book, List<BookIndex> bookIndexList, List<BookContent> bookContentList, Map<Integer, BookIndex> existBookIndexMap);
|
void updateBookAndIndexAndContent(Book book, List<BookIndex> bookIndexList, List<BookContent> bookContentList, Map<Integer, BookIndex> existBookIndexMap);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -1,6 +1,5 @@
|
|||||||
package com.java2nb.novel.service.impl;
|
package com.java2nb.novel.service.impl;
|
||||||
|
|
||||||
import com.java2nb.novel.core.utils.IdWorker;
|
|
||||||
import com.java2nb.novel.entity.Book;
|
import com.java2nb.novel.entity.Book;
|
||||||
import com.java2nb.novel.entity.BookContent;
|
import com.java2nb.novel.entity.BookContent;
|
||||||
import com.java2nb.novel.entity.BookIndex;
|
import com.java2nb.novel.entity.BookIndex;
|
||||||
@ -79,10 +78,6 @@ public class BookServiceImpl implements BookService {
|
|||||||
|
|
||||||
if(bookIndexList.size()>0) {
|
if(bookIndexList.size()>0) {
|
||||||
|
|
||||||
if (book.getId() == null) {
|
|
||||||
book.setId(new IdWorker().nextId());
|
|
||||||
}
|
|
||||||
|
|
||||||
//保存小说主表
|
//保存小说主表
|
||||||
|
|
||||||
bookMapper.insertSelective(book);
|
bookMapper.insertSelective(book);
|
||||||
@ -128,30 +123,14 @@ public class BookServiceImpl implements BookService {
|
|||||||
BookIndex bookIndex = bookIndexList.get(i);
|
BookIndex bookIndex = bookIndexList.get(i);
|
||||||
BookContent bookContent = bookContentList.get(i);
|
BookContent bookContent = bookContentList.get(i);
|
||||||
|
|
||||||
//插入或更新目录
|
|
||||||
Integer wordCount = bookContent.getContent().length();
|
|
||||||
bookIndex.setWordCount(wordCount);
|
|
||||||
bookIndex.setUpdateTime(currentDate);
|
|
||||||
|
|
||||||
if(bookIndex.getId() == null) {
|
if(!existBookIndexMap.containsKey(bookIndex.getIndexNum())) {
|
||||||
//插入
|
//插入
|
||||||
bookIndex.setBookId(book.getId());
|
|
||||||
Long indexId = new IdWorker().nextId();
|
|
||||||
bookIndex.setId(indexId);
|
|
||||||
bookIndex.setCreateTime(currentDate);
|
|
||||||
bookIndexMapper.insertSelective(bookIndex);
|
bookIndexMapper.insertSelective(bookIndex);
|
||||||
}else{
|
|
||||||
//更新
|
|
||||||
bookIndexMapper.updateByPrimaryKeySelective(bookIndex);
|
|
||||||
}
|
|
||||||
|
|
||||||
if(bookContent.getIndexId() == null) {
|
|
||||||
//插入
|
|
||||||
bookContent.setIndexId(bookIndex.getId());
|
|
||||||
bookContentMapper.insertSelective(bookContent);
|
bookContentMapper.insertSelective(bookContent);
|
||||||
}else{
|
}else{
|
||||||
//更新
|
//更新
|
||||||
|
bookIndexMapper.updateByPrimaryKeySelective(bookIndex);
|
||||||
bookContentMapper.update(update(BookContentDynamicSqlSupport.bookContent)
|
bookContentMapper.update(update(BookContentDynamicSqlSupport.bookContent)
|
||||||
.set(BookContentDynamicSqlSupport.content)
|
.set(BookContentDynamicSqlSupport.content)
|
||||||
.equalTo(bookContent.getContent())
|
.equalTo(bookContent.getContent())
|
||||||
@ -160,6 +139,7 @@ public class BookServiceImpl implements BookService {
|
|||||||
.render(RenderingStrategies.MYBATIS3));
|
.render(RenderingStrategies.MYBATIS3));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
//更新小说主表
|
//更新小说主表
|
||||||
@ -174,7 +154,6 @@ public class BookServiceImpl implements BookService {
|
|||||||
book.setLastIndexUpdateTime(currentDate);
|
book.setLastIndexUpdateTime(currentDate);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
book.setUpdateTime(currentDate);
|
|
||||||
book.setBookName(null);
|
book.setBookName(null);
|
||||||
book.setAuthorName(null);
|
book.setAuthorName(null);
|
||||||
if(Constants.VISIT_COUNT_DEFAULT.equals(book.getVisitCount())) {
|
if(Constants.VISIT_COUNT_DEFAULT.equals(book.getVisitCount())) {
|
||||||
|
@ -15,8 +15,7 @@
|
|||||||
|
|
||||||
<select id="queryTotalWordCount" parameterType="long" resultType="int">
|
<select id="queryTotalWordCount" parameterType="long" resultType="int">
|
||||||
|
|
||||||
select sum(t2.word_count) from book t1 inner join book_index t2
|
select sum(word_count) from book_index where book_id = #{bookId}
|
||||||
on t1.id = t2.book_id and t1.id = #{bookId}
|
|
||||||
</select>
|
</select>
|
||||||
|
|
||||||
<update id="updateCrawlLastTime">
|
<update id="updateCrawlLastTime">
|
||||||
|
Loading…
x
Reference in New Issue
Block a user