爬虫自动更新程序优化,增加自动修复错误章节

This commit is contained in:
xiongxiaoyang 2019-12-19 11:26:56 +08:00
parent 8f5f141316
commit eac0ce9302
6 changed files with 158 additions and 154 deletions

View File

@ -522,88 +522,85 @@ public class BookCrawlServiceImpl implements BookCrawlService {
if (picMather.find()) {
String picSrc = picMather.group(1);
Pattern descPatten = compile("class=\"review\">([^<]+)</p>");
Matcher descMatch = descPatten.matcher(body);
if (descMatch.find()) {
String desc = descMatch.group(1);
String desc = body.substring(body.indexOf("<p class=\"review\">") + "<p class=\"review\">".length());
desc = desc.substring(0, desc.indexOf("</p>"));
BookDO book = new BookDO();
book.setAuthor(author);
book.setCatid(catNum);
book.setBookDesc(desc);
book.setBookName(bookName);
book.setScore(score > 10 ? 8.0f : score);
book.setPicUrl(picSrc);
book.setBookStatus(status);
book.setUpdateTime(updateTime);
BookDO book = new BookDO();
book.setAuthor(author);
book.setCatid(catNum);
book.setBookDesc(desc);
book.setBookName(bookName);
book.setScore(score > 10 ? 8.0f : score);
book.setPicUrl(picSrc);
book.setBookStatus(status);
book.setUpdateTime(updateTime);
List<BookIndexDO> indexList = new ArrayList<>();
List<BookContentDO> contentList = new ArrayList<>();
List<BookIndexDO> indexList = new ArrayList<>();
List<BookContentDO> contentList = new ArrayList<>();
//读取目录
Pattern indexPatten = compile("<a\\s+href=\"(/du/\\d+_\\d+/)\">查看完整目录</a>");
Matcher indexMatch = indexPatten.matcher(body);
if (indexMatch.find()) {
String indexUrl = baseUrl + indexMatch.group(1);
String body2 = getByTemplate(indexUrl);
if (body2 != null) {
Pattern indexListPatten = compile("<a\\s+style=\"\"\\s+href=\"(/\\d+_\\d+/\\d+\\.html)\">([^/]+)</a>");
Matcher indexListMatch = indexListPatten.matcher(body2);
//读取目录
Pattern indexPatten = compile("<a\\s+href=\"(/du/\\d+_\\d+/)\">查看完整目录</a>");
Matcher indexMatch = indexPatten.matcher(body);
if (indexMatch.find()) {
String indexUrl = baseUrl + indexMatch.group(1);
String body2 = getByTemplate(indexUrl);
if (body2 != null) {
Pattern indexListPatten = compile("<a\\s+style=\"\"\\s+href=\"(/\\d+_\\d+/\\d+\\.html)\">([^/]+)</a>");
Matcher indexListMatch = indexListPatten.matcher(body2);
boolean isFindIndex = indexListMatch.find();
boolean isFindIndex = indexListMatch.find();
int indexNum = 0;
//查询该书籍已存在目录号
List<Integer> hasIndexNum = queryIndexCountByBookNameAndBAuthor(bookName, author);
int indexNum = 0;
//查询该书籍已存在目录号
List<Integer> hasIndexNum = queryIndexCountByBookNameAndBAuthor(bookName, author);
while (isFindIndex) {
if (isInteruptBiquTaCrawl) {
return;
}
if (!hasIndexNum.contains(indexNum)) {
String contentUrl = baseUrl + indexListMatch.group(1);
String indexName = indexListMatch.group(2);
//查询章节内容
String body3 = getByTemplate(contentUrl.replace("//m.","//www."));
if (body3 != null) {
String start = "id=\"content\">";
String end = "<script>";
String content = body3.substring(body3.indexOf(start) + start.length());
content = "<div class=\"article-content font16\" id=\"ChapterBody\" data-class=\"font16\">"+content.substring(0,content.indexOf(end))+"</div>";
//TODO插入章节目录和章节内容
BookIndexDO bookIndex = new BookIndexDO();
bookIndex.setIndexName(indexName);
bookIndex.setIndexNum(indexNum);
indexList.add(bookIndex);
BookContentDO bookContent = new BookContentDO();
bookContent.setContent(content);
bookContent.setIndexNum(indexNum);
contentList.add(bookContent);
}
}
indexNum++;
isFindIndex = indexListMatch.find();
while (isFindIndex) {
if (isInteruptBiquTaCrawl) {
return;
}
if (indexList.size() == contentList.size() && indexList.size() > 0) {
bookService.saveBookAndIndexAndContent(book, indexList, contentList);
if (!hasIndexNum.contains(indexNum)) {
String contentUrl = baseUrl + indexListMatch.group(1);
String indexName = indexListMatch.group(2);
//查询章节内容
String body3 = getByTemplate(contentUrl.replace("//m.", "//www."));
if (body3 != null) {
String start = "id=\"content\">";
String end = "<script>";
String content = body3.substring(body3.indexOf(start) + start.length());
content = "<div class=\"article-content font16\" id=\"ChapterBody\" data-class=\"font16\">" + content.substring(0, content.indexOf(end)) + "</div>";
//TODO插入章节目录和章节内容
BookIndexDO bookIndex = new BookIndexDO();
bookIndex.setIndexName(indexName);
bookIndex.setIndexNum(indexNum);
indexList.add(bookIndex);
BookContentDO bookContent = new BookContentDO();
bookContent.setContent(content);
bookContent.setIndexNum(indexNum);
contentList.add(bookContent);
}
}
indexNum++;
isFindIndex = indexListMatch.find();
}
if (indexList.size() == contentList.size() && indexList.size() > 0) {
bookService.saveBookAndIndexAndContent(book, indexList, contentList);
}
}
}
}
}
@ -857,9 +854,6 @@ public class BookCrawlServiceImpl implements BookCrawlService {
}
private String getByTemplate(String catBookListUrl) {
try {
ResponseEntity<String> forEntity = restTemplate.getForEntity(catBookListUrl, String.class);

View File

@ -1,6 +1,7 @@
package xyz.zinglizingli.books.core.crawl;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.springframework.beans.factory.annotation.Autowired;
import xyz.zinglizingli.books.po.Book;
import xyz.zinglizingli.books.po.BookContent;
@ -14,6 +15,7 @@ import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
@ -94,96 +96,89 @@ public class BiquCrawlSource extends BaseHtmlCrawlSource {
Matcher picMather = picPatten.matcher(body);
if (picMather.find()) {
String picSrc = picMather.group(1);
String desc = body.substring(body.indexOf("<p class=\"review\">") + "<p class=\"review\">".length());
desc = desc.substring(0, desc.indexOf("</p>"));
Pattern descPatten = compile(getIntroPattern());
Matcher descMatch = descPatten.matcher(body);
if (descMatch.find()) {
String desc = descMatch.group(1);
Book book = new Book();
book.setAuthor(author);
book.setCatid(catNum);
book.setBookDesc(desc);
book.setBookName(bookName);
book.setScore(score > 10 ? 8.0f : score);
book.setPicUrl(picSrc);
book.setBookStatus(status);
book.setUpdateTime(updateTime);
List<BookIndex> indexList = new ArrayList<>();
List<BookContent> contentList = new ArrayList<>();
//读取目录
Pattern indexPatten = compile(getCatalogUrlPattern());
Matcher indexMatch = indexPatten.matcher(body);
if (indexMatch.find()) {
String indexUrl = getIndexUrl() + indexMatch.group(1);
String body2 = RestTemplateUtil.getBodyByUtf8(indexUrl);
if (body2 != null) {
Pattern indexListPatten = compile(getCatalogPattern());
Matcher indexListMatch = indexListPatten.matcher(body2);
boolean isFindIndex = indexListMatch.find();
int indexNum = 0;
//查询该书籍已存在目录号
List<Integer> hasIndexNum = bookService.queryIndexNumByBookNameAndAuthor(bookName, author);
//更新和插入分别开插入只在凌晨做一次
if (hasIndexNum.size() > 0) {
while (isFindIndex) {
if (!hasIndexNum.contains(indexNum)) {
String contentUrl = getIndexUrl() + indexListMatch.group(1);
String indexName = indexListMatch.group(2);
//查询章节内容
String body3 = RestTemplateUtil.getBodyByUtf8(contentUrl.replace("//m.","//www.").replace("//wap.","//www."));
if (body3 != null) {
String start = "id=\"content\">";
String end = "<script>";
String content = body3.substring(body3.indexOf(start) + start.length());
content = "<div class=\"article-content font16\" id=\"ChapterBody\" data-class=\"font16\">"+content.substring(0,content.indexOf(end))+"</div>";
//TODO插入章节目录和章节内容
BookIndex bookIndex = new BookIndex();
bookIndex.setIndexName(indexName);
bookIndex.setIndexNum(indexNum);
indexList.add(bookIndex);
BookContent bookContent = new BookContent();
bookContent.setContent(content);
bookContent.setIndexNum(indexNum);
contentList.add(bookContent);
} else {
break;
}
Book book = new Book();
book.setAuthor(author);
book.setCatid(catNum);
book.setBookDesc(desc);
book.setBookName(bookName);
book.setScore(score > 10 ? 8.0f : score);
book.setPicUrl(picSrc);
book.setBookStatus(status);
book.setUpdateTime(updateTime);
List<BookIndex> indexList = new ArrayList<>();
List<BookContent> contentList = new ArrayList<>();
//读取目录
Pattern indexPatten = compile(getCatalogUrlPattern());
Matcher indexMatch = indexPatten.matcher(body);
if (indexMatch.find()) {
String indexUrl = getIndexUrl() + indexMatch.group(1);
String body2 = RestTemplateUtil.getBodyByUtf8(indexUrl);
if (body2 != null) {
Pattern indexListPatten = compile(getCatalogPattern());
Matcher indexListMatch = indexListPatten.matcher(body2);
boolean isFindIndex = indexListMatch.find();
int indexNum = 0;
//查询该书籍已存在目录号
Map<Integer, BookIndex> hasIndexs = bookService.queryIndexByBookNameAndAuthor(bookName, author);
//更新和插入分别开此处只做更新
if (hasIndexs.size() > 0) {
while (isFindIndex) {
BookIndex hasIndex = hasIndexs.get(indexNum);
String indexName = indexListMatch.group(2);
if (hasIndex == null || !StringUtils.deleteWhitespace(hasIndex.getIndexName()).equals(StringUtils.deleteWhitespace(indexName))) {
String contentUrl = getIndexUrl() + indexListMatch.group(1);
//查询章节内容
String body3 = RestTemplateUtil.getBodyByUtf8(contentUrl.replace("//m.", "//www.").replace("//wap.", "//www."));
if (body3 != null) {
String start = "id=\"content\">";
String end = "<script>";
String content = body3.substring(body3.indexOf(start) + start.length());
content = "<div class=\"article-content font16\" id=\"ChapterBody\" data-class=\"font16\">" + content.substring(0, content.indexOf(end)) + "</div>";
//TODO插入章节目录和章节内容
BookIndex bookIndex = new BookIndex();
bookIndex.setIndexName(indexName);
bookIndex.setIndexNum(indexNum);
indexList.add(bookIndex);
BookContent bookContent = new BookContent();
bookContent.setContent(content);
bookContent.setIndexNum(indexNum);
contentList.add(bookContent);
} else {
break;
}
indexNum++;
isFindIndex = indexListMatch.find();
}
if (indexList.size() == contentList.size() && indexList.size() > 0) {
ExcutorUtils.excuteFixedTask(() ->
bookService.saveBookAndIndexAndContent(book, indexList, contentList)
);
}
indexNum++;
isFindIndex = indexListMatch.find();
}
if (indexList.size() == contentList.size() && indexList.size() > 0) {
bookService.saveBookAndIndexAndContent(book, indexList, contentList);
}
}
}
}
}
}
}
}
@ -212,5 +207,4 @@ public class BiquCrawlSource extends BaseHtmlCrawlSource {
}
}

View File

@ -30,6 +30,7 @@ import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.*;
import java.util.function.Function;
import java.util.stream.Collectors;
/**
@ -109,13 +110,13 @@ public class BookService {
newBookIndexList.add(bookIndexItem);
newContentList.add(bookContentItem);
}
//一次最多只允许插入20条记录,否则影响服务器响应
if (isUpdate && i % 20 == 0 && newBookIndexList.size() > 0) {
//一次最多只允许插入100条记录,否则影响服务器响应
if (isUpdate && i % 100 == 0 && newBookIndexList.size() > 0) {
bookService.insertIndexListAndContentList(newBookIndexList, newContentList);
newBookIndexList = new ArrayList<>();
newContentList = new ArrayList<>();
try {
Thread.sleep(1000 * 60 * 5);
Thread.sleep(1000 * 60 * 1);
} catch (InterruptedException e) {
log.error(e.getMessage(), e);
throw new RuntimeException(e.getMessage());
@ -175,12 +176,27 @@ public class BookService {
}
/**
 * Batch-inserts rows into the chapter index table and the chapter content table,
 * automatically repairing erroneous chapters.
 *
 * <p>When {@code newBookIndexList} is non-empty, any existing index rows and content rows
 * for the same book whose index numbers appear in {@code newBookIndexList} are deleted
 * first, and the new rows are then inserted. The elapsed time in milliseconds is logged.
 *
 * <p>Declared {@code @Transactional} with {@code rollbackFor = Exception.class}.
 *
 * @param newBookIndexList chapter index rows to write; the book id is read from the first
 *                         element only (assumes every entry belongs to the same book — TODO confirm)
 * @param newContentList   chapter content rows to write
 */
@Transactional(rollbackFor = Exception.class)
public void insertIndexListAndContentList(List<BookIndex> newBookIndexList, List<BookContent> newContentList) {
// NOTE(review): this text is a rendered commit diff without +/- markers; the two
// insertBatch calls directly below look like the removed pre-change lines rather than
// live code (as written they would run unconditionally, before the delete) — confirm
// against the repository.
bookIndexMapper.insertBatch(newBookIndexList);
bookContentMapper.insertBatch(newContentList);
// Start of the timed section reported by the log line at the end.
long start = System.currentTimeMillis();
if(newBookIndexList.size() > 0) {
// Delete the already-existing (erroneous) chapters that are about to be replaced.
List<Integer> indexNumberList = newBookIndexList.stream().map(BookIndex::getIndexNum).collect(Collectors.toList());
// The book id is taken from the first entry and used for both delete filters.
Long bookId = newBookIndexList.get(0).getBookId();
BookIndexExample bookIndexExample = new BookIndexExample();
bookIndexExample.createCriteria().andBookIdEqualTo(bookId).andIndexNumIn(indexNumberList);
bookIndexMapper.deleteByExample(bookIndexExample);
BookContentExample bookContentExample = new BookContentExample();
bookContentExample.createCriteria().andBookIdEqualTo(bookId).andIndexNumIn(indexNumberList);
bookContentMapper.deleteByExample(bookContentExample);
// Insert the new chapters.
bookIndexMapper.insertBatch(newBookIndexList);
bookContentMapper.insertBatch(newContentList);
}
// Log how long the chapter update took, in milliseconds.
log.info("更新章节耗时:"+(System.currentTimeMillis()-start));
}
@ -307,7 +323,7 @@ public class BookService {
/**
* 查询该书籍已存在目录号
*/
public List<Integer> queryIndexNumByBookNameAndAuthor(String bookName, String author) {
public Map<Integer,BookIndex> queryIndexByBookNameAndAuthor(String bookName, String author) {
BookExample example = new BookExample();
example.createCriteria().andBookNameEqualTo(bookName).andAuthorEqualTo(author);
List<Book> books = bookMapper.selectByExample(example);
@ -317,13 +333,13 @@ public class BookService {
BookIndexExample bookIndexExample = new BookIndexExample();
bookIndexExample.createCriteria().andBookIdEqualTo(bookId);
List<BookIndex> bookIndices = bookIndexMapper.selectByExample(bookIndexExample);
if(bookIndices.size()>0) {
return bookIndices.stream().map(BookIndex::getIndexNum).collect(Collectors.toList());
if(bookIndices.size() > 0) {
return bookIndices.stream().collect(Collectors.toMap(BookIndex::getIndexNum, Function.identity()));
}
}
return new ArrayList<>(0);
return new HashMap<>(0);
}

View File

@ -45,6 +45,6 @@ dingdian:
cat-pattern: 类别:([^/]+)</li>
update-time-pattern: 更新:(\d+-\d+-\d+\s\d+:\d+:\d+)</a>
pic-pattern: <img src="([^>]+)"\s+onerror="this.src=
intro-pattern: class="review">([^<]+)</p>
intro-pattern: class="review">([^/]+)</p>
catalog-url-pattern: <a\s+href="(/ddk\d+/all.html)">查看完整目录</a>
catalog-pattern: <a\s+style=""\s+href="(/ddk\d+/\d+\.html)">([^/]+)</a>

View File

@ -9,6 +9,6 @@ mybatis:
mysql: {charset: utf8mb4}
books: {lowestScore: '8.5'}
crawl:
website: {type: '3'}
website: {type: '2'}
soft-novel: '0'
manhua: '0'