优化章节字数算法,优化爬虫代码

This commit is contained in:
xiongxiaoyang 2020-12-23 23:48:34 +08:00
parent 1046a7ffc1
commit 0a10504461
4 changed files with 66 additions and 89 deletions

View File

@ -1,9 +1,6 @@
package com.java2nb.novel.core.crawl; package com.java2nb.novel.core.crawl;
import com.java2nb.novel.core.utils.HttpUtil; import com.java2nb.novel.core.utils.*;
import com.java2nb.novel.core.utils.IdWorker;
import com.java2nb.novel.core.utils.RandomBookInfoUtil;
import com.java2nb.novel.core.utils.RestTemplateUtil;
import com.java2nb.novel.entity.Book; import com.java2nb.novel.entity.Book;
import com.java2nb.novel.entity.BookContent; import com.java2nb.novel.entity.BookContent;
import com.java2nb.novel.entity.BookIndex; import com.java2nb.novel.entity.BookIndex;
@ -37,7 +34,7 @@ public class CrawlParser {
private static RestTemplate restTemplate = RestTemplateUtil.getInstance("utf-8"); private static RestTemplate restTemplate = RestTemplateUtil.getInstance("utf-8");
private static ThreadLocal <Integer> retryCount = new ThreadLocal<>(); private static ThreadLocal<Integer> retryCount = new ThreadLocal<>();
@SneakyThrows @SneakyThrows
public static Book parseBook(RuleBean ruleBean, String bookId) { public static Book parseBook(RuleBean ruleBean, String bookId) {
@ -65,7 +62,7 @@ public class CrawlParser {
boolean isFindPicUrl = picUrlMatch.find(); boolean isFindPicUrl = picUrlMatch.find();
if (isFindPicUrl) { if (isFindPicUrl) {
String picUrl = picUrlMatch.group(1); String picUrl = picUrlMatch.group(1);
if(StringUtils.isNotBlank(picUrl) && StringUtils.isNotBlank(ruleBean.getPicUrlPrefix())) { if (StringUtils.isNotBlank(picUrl) && StringUtils.isNotBlank(ruleBean.getPicUrlPrefix())) {
picUrl = ruleBean.getPicUrlPrefix() + picUrl; picUrl = ruleBean.getPicUrlPrefix() + picUrl;
} }
//设置封面图片路径 //设置封面图片路径
@ -96,11 +93,11 @@ public class CrawlParser {
String desc = bookDetailHtml.substring(bookDetailHtml.indexOf(ruleBean.getDescStart()) + ruleBean.getDescStart().length()); String desc = bookDetailHtml.substring(bookDetailHtml.indexOf(ruleBean.getDescStart()) + ruleBean.getDescStart().length());
desc = desc.substring(0, desc.indexOf(ruleBean.getDescEnd())); desc = desc.substring(0, desc.indexOf(ruleBean.getDescEnd()));
//过滤掉简介中的特殊标签 //过滤掉简介中的特殊标签
desc = desc.replaceAll("<a[^<]+</a>","") desc = desc.replaceAll("<a[^<]+</a>", "")
.replaceAll("<font[^<]+</font>","") .replaceAll("<font[^<]+</font>", "")
.replaceAll("<p>\\s*</p>","") .replaceAll("<p>\\s*</p>", "")
.replaceAll("<p>","") .replaceAll("<p>", "")
.replaceAll("</p>","<br/>"); .replaceAll("</p>", "<br/>");
//设置书籍简介 //设置书籍简介
book.setBookDesc(desc); book.setBookDesc(desc);
if (StringUtils.isNotBlank(ruleBean.getStatusPatten())) { if (StringUtils.isNotBlank(ruleBean.getStatusPatten())) {
@ -146,9 +143,9 @@ public class CrawlParser {
} }
public static Map<Integer, List> parseBookIndexAndContent(String sourceBookId, Book book, RuleBean ruleBean, Map<Integer, BookIndex> hasIndexs) { public static Map<Integer, List> parseBookIndexAndContent(String sourceBookId, Book book, RuleBean ruleBean, Map<Integer, BookIndex> hasIndexs) {
Map<Integer,List> result = new HashMap<>(2); Map<Integer, List> result = new HashMap<>(2);
result.put(BOOK_INDEX_LIST_KEY,new ArrayList(0)); result.put(BOOK_INDEX_LIST_KEY, new ArrayList(0));
result.put(BOOK_CONTENT_LIST_KEY,new ArrayList(0)); result.put(BOOK_CONTENT_LIST_KEY, new ArrayList(0));
Date currentDate = new Date(); Date currentDate = new Date();
@ -159,7 +156,7 @@ public class CrawlParser {
String indexListHtml = getByHttpClientWithChrome(indexListUrl); String indexListHtml = getByHttpClientWithChrome(indexListUrl);
if (indexListHtml != null) { if (indexListHtml != null) {
if(StringUtils.isNotBlank(ruleBean.getBookIndexStart())){ if (StringUtils.isNotBlank(ruleBean.getBookIndexStart())) {
indexListHtml = indexListHtml.substring(indexListHtml.indexOf(ruleBean.getBookIndexStart()) + ruleBean.getBookIndexStart().length()); indexListHtml = indexListHtml.substring(indexListHtml.indexOf(ruleBean.getBookIndexStart()) + ruleBean.getBookIndexStart().length());
} }
@ -189,12 +186,12 @@ public class CrawlParser {
String sourceIndexId = indexIdMatch.group(1); String sourceIndexId = indexIdMatch.group(1);
String bookContentUrl = ruleBean.getBookContentUrl(); String bookContentUrl = ruleBean.getBookContentUrl();
int calStart = bookContentUrl.indexOf("{cal_"); int calStart = bookContentUrl.indexOf("{cal_");
if(calStart != -1){ if (calStart != -1) {
//内容页URL需要进行计算才能得到 //内容页URL需要进行计算才能得到
String calStr = bookContentUrl.substring(calStart,calStart+bookContentUrl.substring(calStart).indexOf("}")); String calStr = bookContentUrl.substring(calStart, calStart + bookContentUrl.substring(calStart).indexOf("}"));
String[] calArr = calStr.split("_"); String[] calArr = calStr.split("_");
int calType = Integer.parseInt(calArr[1]); int calType = Integer.parseInt(calArr[1]);
if(calType == 1) { if (calType == 1) {
///{cal_1_1_3}_{bookId}/{indexId}.html ///{cal_1_1_3}_{bookId}/{indexId}.html
//第一种计算规则去除第x个参数的最后y个字母 //第一种计算规则去除第x个参数的最后y个字母
int x = Integer.parseInt(calArr[2]); int x = Integer.parseInt(calArr[2]);
@ -206,12 +203,12 @@ public class CrawlParser {
calResult = sourceIndexId.substring(0, sourceBookId.length() - y); calResult = sourceIndexId.substring(0, sourceBookId.length() - y);
} }
if(calResult.length() == 0){ if (calResult.length() == 0) {
calResult = "0"; calResult = "0";
} }
bookContentUrl = bookContentUrl.replace(calStr+"}", calResult); bookContentUrl = bookContentUrl.replace(calStr + "}", calResult);
} }
} }
@ -223,52 +220,40 @@ public class CrawlParser {
if (contentHtml != null && !contentHtml.contains("正在手打中")) { if (contentHtml != null && !contentHtml.contains("正在手打中")) {
String content = contentHtml.substring(contentHtml.indexOf(ruleBean.getContentStart()) + ruleBean.getContentStart().length()); String content = contentHtml.substring(contentHtml.indexOf(ruleBean.getContentStart()) + ruleBean.getContentStart().length());
content = content.substring(0, content.indexOf(ruleBean.getContentEnd())); content = content.substring(0, content.indexOf(ruleBean.getContentEnd()));
//TODO插入章节目录和章节内容 //插入章节目录和章节内容
BookIndex bookIndex = new BookIndex(); BookIndex bookIndex = new BookIndex();
bookIndex.setIndexName(indexName); bookIndex.setIndexName(indexName);
bookIndex.setIndexNum(indexNum); bookIndex.setIndexNum(indexNum);
Integer wordCount = StringUtil.getStrValidWordCount(content);
bookIndex.setWordCount(wordCount);
indexList.add(bookIndex); indexList.add(bookIndex);
BookContent bookContent = new BookContent();
BookContent bookContent = new BookContent();
bookContent.setContent(content); bookContent.setContent(content);
contentList.add(bookContent); contentList.add(bookContent);
//判断是新增还是更新
if(hasIndexs.size() == 0){ if (hasIndex != null) {
//新书入库 //章节更新
bookIndex.setId(hasIndex.getId());
bookContent.setIndexId(hasIndex.getId());
} else {
//章节插入
//设置目录和章节内容 //设置目录和章节内容
Long indexId = idWorker.nextId(); Long indexId = idWorker.nextId();
lastIndexId = indexId; lastIndexId = indexId;
lastIndexName = indexName; lastIndexName = indexName;
bookIndex.setId(indexId); bookIndex.setId(indexId);
bookIndex.setBookId(book.getId()); bookIndex.setBookId(book.getId());
Integer wordCount = bookContent.getContent().length();
totalWordCount += wordCount;
bookIndex.setWordCount(wordCount);
bookIndex.setCreateTime(currentDate); bookIndex.setCreateTime(currentDate);
bookIndex.setUpdateTime(currentDate);
bookContent.setIndexId(indexId); bookContent.setIndexId(indexId);
//设置小说基础信息
book.setWordCount(totalWordCount);
book.setLastIndexId(lastIndexId);
book.setLastIndexName(lastIndexName);
book.setLastIndexUpdateTime(currentDate);
book.setCreateTime(currentDate);
book.setUpdateTime(currentDate);
}else{
//老书更新
} }
bookIndex.setUpdateTime(currentDate);
//计算总字数
totalWordCount += wordCount;
if(hasIndex != null){
bookIndex.setId(hasIndex.getId());
bookContent.setIndexId(hasIndex.getId());
}
} }
@ -279,15 +264,30 @@ public class CrawlParser {
isFindIndex = indexIdMatch.find() & indexNameMatch.find(); isFindIndex = indexIdMatch.find() & indexNameMatch.find();
} }
//判断是新书入库还是老书更新
if (hasIndexs.size() == 0) {
//新书入库
//设置小说基础信息
book.setWordCount(totalWordCount);
book.setLastIndexId(lastIndexId);
book.setLastIndexName(lastIndexName);
book.setLastIndexUpdateTime(currentDate);
book.setCreateTime(currentDate);
}
book.setUpdateTime(currentDate);
if (indexList.size() == contentList.size() && indexList.size() > 0) { if (indexList.size() == contentList.size() && indexList.size() > 0) {
result.put(BOOK_INDEX_LIST_KEY,indexList); result.put(BOOK_INDEX_LIST_KEY, indexList);
result.put(BOOK_CONTENT_LIST_KEY,contentList); result.put(BOOK_CONTENT_LIST_KEY, contentList);
} }
} }
return result; return result;
} }
@ -297,7 +297,7 @@ public class CrawlParser {
ResponseEntity<String> forEntity = restTemplate.getForEntity(url, String.class); ResponseEntity<String> forEntity = restTemplate.getForEntity(url, String.class);
if (forEntity.getStatusCode() == HttpStatus.OK) { if (forEntity.getStatusCode() == HttpStatus.OK) {
String body = forEntity.getBody(); String body = forEntity.getBody();
if(body.length() < Constants.INVALID_HTML_LENGTH){ if (body.length() < Constants.INVALID_HTML_LENGTH) {
return processErrorHttpResult(url); return processErrorHttpResult(url);
} }
//成功获得html内容 //成功获得html内容
@ -314,7 +314,7 @@ public class CrawlParser {
try { try {
String body = HttpUtil.getByHttpClientWithChrome(url); String body = HttpUtil.getByHttpClientWithChrome(url);
if(body != null && body.length() < Constants.INVALID_HTML_LENGTH){ if (body != null && body.length() < Constants.INVALID_HTML_LENGTH) {
return processErrorHttpResult(url); return processErrorHttpResult(url);
} }
//成功获得html内容 //成功获得html内容
@ -327,13 +327,13 @@ public class CrawlParser {
} }
@SneakyThrows @SneakyThrows
private static String processErrorHttpResult(String url){ private static String processErrorHttpResult(String url) {
Integer count = retryCount.get(); Integer count = retryCount.get();
if(count == null){ if (count == null) {
count = 0; count = 0;
} }
if(count < Constants.HTTP_FAIL_RETRY_COUNT){ if (count < Constants.HTTP_FAIL_RETRY_COUNT) {
Thread.sleep( new Random().nextInt(10*1000)); Thread.sleep(new Random().nextInt(10 * 1000));
retryCount.set(++count); retryCount.set(++count);
return getByHttpClient(url); return getByHttpClient(url);
} }

View File

@ -65,8 +65,7 @@ public interface BookService {
* @param book 小说数据 * @param book 小说数据
* @param bookIndexList 目录集合 * @param bookIndexList 目录集合
* @param bookContentList 内容集合 * @param bookContentList 内容集合
* @param existBookIndexMap 已存在的章节Map * @param existBookIndexMap 已存在的章节Map */
* */
void updateBookAndIndexAndContent(Book book, List<BookIndex> bookIndexList, List<BookContent> bookContentList, Map<Integer, BookIndex> existBookIndexMap); void updateBookAndIndexAndContent(Book book, List<BookIndex> bookIndexList, List<BookContent> bookContentList, Map<Integer, BookIndex> existBookIndexMap);
/** /**

View File

@ -1,6 +1,5 @@
package com.java2nb.novel.service.impl; package com.java2nb.novel.service.impl;
import com.java2nb.novel.core.utils.IdWorker;
import com.java2nb.novel.entity.Book; import com.java2nb.novel.entity.Book;
import com.java2nb.novel.entity.BookContent; import com.java2nb.novel.entity.BookContent;
import com.java2nb.novel.entity.BookIndex; import com.java2nb.novel.entity.BookIndex;
@ -79,10 +78,6 @@ public class BookServiceImpl implements BookService {
if(bookIndexList.size()>0) { if(bookIndexList.size()>0) {
if (book.getId() == null) {
book.setId(new IdWorker().nextId());
}
//保存小说主表 //保存小说主表
bookMapper.insertSelective(book); bookMapper.insertSelective(book);
@ -128,30 +123,14 @@ public class BookServiceImpl implements BookService {
BookIndex bookIndex = bookIndexList.get(i); BookIndex bookIndex = bookIndexList.get(i);
BookContent bookContent = bookContentList.get(i); BookContent bookContent = bookContentList.get(i);
//插入或更新目录
Integer wordCount = bookContent.getContent().length();
bookIndex.setWordCount(wordCount);
bookIndex.setUpdateTime(currentDate);
if(bookIndex.getId() == null) { if(!existBookIndexMap.containsKey(bookIndex.getIndexNum())) {
//插入 //插入
bookIndex.setBookId(book.getId());
Long indexId = new IdWorker().nextId();
bookIndex.setId(indexId);
bookIndex.setCreateTime(currentDate);
bookIndexMapper.insertSelective(bookIndex); bookIndexMapper.insertSelective(bookIndex);
}else{
//更新
bookIndexMapper.updateByPrimaryKeySelective(bookIndex);
}
if(bookContent.getIndexId() == null) {
//插入
bookContent.setIndexId(bookIndex.getId());
bookContentMapper.insertSelective(bookContent); bookContentMapper.insertSelective(bookContent);
}else{ }else{
//更新 //更新
bookIndexMapper.updateByPrimaryKeySelective(bookIndex);
bookContentMapper.update(update(BookContentDynamicSqlSupport.bookContent) bookContentMapper.update(update(BookContentDynamicSqlSupport.bookContent)
.set(BookContentDynamicSqlSupport.content) .set(BookContentDynamicSqlSupport.content)
.equalTo(bookContent.getContent()) .equalTo(bookContent.getContent())
@ -160,6 +139,7 @@ public class BookServiceImpl implements BookService {
.render(RenderingStrategies.MYBATIS3)); .render(RenderingStrategies.MYBATIS3));
} }
} }
//更新小说主表 //更新小说主表
@ -174,7 +154,6 @@ public class BookServiceImpl implements BookService {
book.setLastIndexUpdateTime(currentDate); book.setLastIndexUpdateTime(currentDate);
} }
} }
book.setUpdateTime(currentDate);
book.setBookName(null); book.setBookName(null);
book.setAuthorName(null); book.setAuthorName(null);
if(Constants.VISIT_COUNT_DEFAULT.equals(book.getVisitCount())) { if(Constants.VISIT_COUNT_DEFAULT.equals(book.getVisitCount())) {

View File

@ -15,8 +15,7 @@
<select id="queryTotalWordCount" parameterType="long" resultType="int"> <select id="queryTotalWordCount" parameterType="long" resultType="int">
select sum(t2.word_count) from book t1 inner join book_index t2 select sum(word_count) from book_index where book_id = #{bookId}
on t1.id = t2.book_id and t1.id = #{bookId}
</select> </select>
<update id="updateCrawlLastTime"> <update id="updateCrawlLastTime">