mirror of
https://github.com/201206030/novel.git
synced 2025-04-27 07:30:50 +00:00
爬虫自动更新程序优化,增加自动修复错误章节
This commit is contained in:
parent
8f5f141316
commit
eac0ce9302
@ -522,10 +522,8 @@ public class BookCrawlServiceImpl implements BookCrawlService {
|
|||||||
if (picMather.find()) {
|
if (picMather.find()) {
|
||||||
String picSrc = picMather.group(1);
|
String picSrc = picMather.group(1);
|
||||||
|
|
||||||
Pattern descPatten = compile("class=\"review\">([^<]+)</p>");
|
String desc = body.substring(body.indexOf("<p class=\"review\">") + "<p class=\"review\">".length());
|
||||||
Matcher descMatch = descPatten.matcher(body);
|
desc = desc.substring(0, desc.indexOf("</p>"));
|
||||||
if (descMatch.find()) {
|
|
||||||
String desc = descMatch.group(1);
|
|
||||||
|
|
||||||
|
|
||||||
BookDO book = new BookDO();
|
BookDO book = new BookDO();
|
||||||
@ -569,12 +567,12 @@ public class BookCrawlServiceImpl implements BookCrawlService {
|
|||||||
|
|
||||||
|
|
||||||
//查询章节内容
|
//查询章节内容
|
||||||
String body3 = getByTemplate(contentUrl.replace("//m.","//www."));
|
String body3 = getByTemplate(contentUrl.replace("//m.", "//www."));
|
||||||
if (body3 != null) {
|
if (body3 != null) {
|
||||||
String start = "id=\"content\">";
|
String start = "id=\"content\">";
|
||||||
String end = "<script>";
|
String end = "<script>";
|
||||||
String content = body3.substring(body3.indexOf(start) + start.length());
|
String content = body3.substring(body3.indexOf(start) + start.length());
|
||||||
content = "<div class=\"article-content font16\" id=\"ChapterBody\" data-class=\"font16\">"+content.substring(0,content.indexOf(end))+"</div>";
|
content = "<div class=\"article-content font16\" id=\"ChapterBody\" data-class=\"font16\">" + content.substring(0, content.indexOf(end)) + "</div>";
|
||||||
//TODO插入章节目录和章节内容
|
//TODO插入章节目录和章节内容
|
||||||
BookIndexDO bookIndex = new BookIndexDO();
|
BookIndexDO bookIndex = new BookIndexDO();
|
||||||
bookIndex.setIndexName(indexName);
|
bookIndex.setIndexName(indexName);
|
||||||
@ -604,7 +602,6 @@ public class BookCrawlServiceImpl implements BookCrawlService {
|
|||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
@ -857,9 +854,6 @@ public class BookCrawlServiceImpl implements BookCrawlService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
private String getByTemplate(String catBookListUrl) {
|
private String getByTemplate(String catBookListUrl) {
|
||||||
try {
|
try {
|
||||||
ResponseEntity<String> forEntity = restTemplate.getForEntity(catBookListUrl, String.class);
|
ResponseEntity<String> forEntity = restTemplate.getForEntity(catBookListUrl, String.class);
|
||||||
|
@ -1,6 +1,7 @@
|
|||||||
package xyz.zinglizingli.books.core.crawl;
|
package xyz.zinglizingli.books.core.crawl;
|
||||||
|
|
||||||
import lombok.extern.slf4j.Slf4j;
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
import org.apache.commons.lang3.StringUtils;
|
||||||
import org.springframework.beans.factory.annotation.Autowired;
|
import org.springframework.beans.factory.annotation.Autowired;
|
||||||
import xyz.zinglizingli.books.po.Book;
|
import xyz.zinglizingli.books.po.Book;
|
||||||
import xyz.zinglizingli.books.po.BookContent;
|
import xyz.zinglizingli.books.po.BookContent;
|
||||||
@ -14,6 +15,7 @@ import java.text.SimpleDateFormat;
|
|||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Date;
|
import java.util.Date;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
import java.util.regex.Matcher;
|
import java.util.regex.Matcher;
|
||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
@ -94,12 +96,8 @@ public class BiquCrawlSource extends BaseHtmlCrawlSource {
|
|||||||
Matcher picMather = picPatten.matcher(body);
|
Matcher picMather = picPatten.matcher(body);
|
||||||
if (picMather.find()) {
|
if (picMather.find()) {
|
||||||
String picSrc = picMather.group(1);
|
String picSrc = picMather.group(1);
|
||||||
|
String desc = body.substring(body.indexOf("<p class=\"review\">") + "<p class=\"review\">".length());
|
||||||
|
desc = desc.substring(0, desc.indexOf("</p>"));
|
||||||
Pattern descPatten = compile(getIntroPattern());
|
|
||||||
Matcher descMatch = descPatten.matcher(body);
|
|
||||||
if (descMatch.find()) {
|
|
||||||
String desc = descMatch.group(1);
|
|
||||||
|
|
||||||
|
|
||||||
Book book = new Book();
|
Book book = new Book();
|
||||||
@ -130,23 +128,23 @@ public class BiquCrawlSource extends BaseHtmlCrawlSource {
|
|||||||
int indexNum = 0;
|
int indexNum = 0;
|
||||||
|
|
||||||
//查询该书籍已存在目录号
|
//查询该书籍已存在目录号
|
||||||
List<Integer> hasIndexNum = bookService.queryIndexNumByBookNameAndAuthor(bookName, author);
|
Map<Integer, BookIndex> hasIndexs = bookService.queryIndexByBookNameAndAuthor(bookName, author);
|
||||||
//更新和插入分别开,插入只在凌晨做一次
|
//更新和插入分别开,此处只做更新
|
||||||
if (hasIndexNum.size() > 0) {
|
if (hasIndexs.size() > 0) {
|
||||||
while (isFindIndex) {
|
while (isFindIndex) {
|
||||||
if (!hasIndexNum.contains(indexNum)) {
|
BookIndex hasIndex = hasIndexs.get(indexNum);
|
||||||
|
|
||||||
String contentUrl = getIndexUrl() + indexListMatch.group(1);
|
|
||||||
String indexName = indexListMatch.group(2);
|
String indexName = indexListMatch.group(2);
|
||||||
|
|
||||||
|
if (hasIndex == null || !StringUtils.deleteWhitespace(hasIndex.getIndexName()).equals(StringUtils.deleteWhitespace(indexName))) {
|
||||||
|
String contentUrl = getIndexUrl() + indexListMatch.group(1);
|
||||||
|
|
||||||
//查询章节内容
|
//查询章节内容
|
||||||
String body3 = RestTemplateUtil.getBodyByUtf8(contentUrl.replace("//m.","//www.").replace("//wap.","//www."));
|
String body3 = RestTemplateUtil.getBodyByUtf8(contentUrl.replace("//m.", "//www.").replace("//wap.", "//www."));
|
||||||
if (body3 != null) {
|
if (body3 != null) {
|
||||||
String start = "id=\"content\">";
|
String start = "id=\"content\">";
|
||||||
String end = "<script>";
|
String end = "<script>";
|
||||||
String content = body3.substring(body3.indexOf(start) + start.length());
|
String content = body3.substring(body3.indexOf(start) + start.length());
|
||||||
content = "<div class=\"article-content font16\" id=\"ChapterBody\" data-class=\"font16\">"+content.substring(0,content.indexOf(end))+"</div>";
|
content = "<div class=\"article-content font16\" id=\"ChapterBody\" data-class=\"font16\">" + content.substring(0, content.indexOf(end)) + "</div>";
|
||||||
//TODO插入章节目录和章节内容
|
//TODO插入章节目录和章节内容
|
||||||
BookIndex bookIndex = new BookIndex();
|
BookIndex bookIndex = new BookIndex();
|
||||||
bookIndex.setIndexName(indexName);
|
bookIndex.setIndexName(indexName);
|
||||||
@ -169,9 +167,7 @@ public class BiquCrawlSource extends BaseHtmlCrawlSource {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (indexList.size() == contentList.size() && indexList.size() > 0) {
|
if (indexList.size() == contentList.size() && indexList.size() > 0) {
|
||||||
ExcutorUtils.excuteFixedTask(() ->
|
bookService.saveBookAndIndexAndContent(book, indexList, contentList);
|
||||||
bookService.saveBookAndIndexAndContent(book, indexList, contentList)
|
|
||||||
);
|
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -186,7 +182,6 @@ public class BiquCrawlSource extends BaseHtmlCrawlSource {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
@ -212,5 +207,4 @@ public class BiquCrawlSource extends BaseHtmlCrawlSource {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -30,6 +30,7 @@ import java.io.FileOutputStream;
|
|||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
import java.io.OutputStream;
|
import java.io.OutputStream;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
import java.util.function.Function;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -109,13 +110,13 @@ public class BookService {
|
|||||||
newBookIndexList.add(bookIndexItem);
|
newBookIndexList.add(bookIndexItem);
|
||||||
newContentList.add(bookContentItem);
|
newContentList.add(bookContentItem);
|
||||||
}
|
}
|
||||||
//一次最多只允许插入20条记录,否则影响服务器响应
|
//一次最多只允许插入100条记录,否则影响服务器响应
|
||||||
if (isUpdate && i % 20 == 0 && newBookIndexList.size() > 0) {
|
if (isUpdate && i % 100 == 0 && newBookIndexList.size() > 0) {
|
||||||
bookService.insertIndexListAndContentList(newBookIndexList, newContentList);
|
bookService.insertIndexListAndContentList(newBookIndexList, newContentList);
|
||||||
newBookIndexList = new ArrayList<>();
|
newBookIndexList = new ArrayList<>();
|
||||||
newContentList = new ArrayList<>();
|
newContentList = new ArrayList<>();
|
||||||
try {
|
try {
|
||||||
Thread.sleep(1000 * 60 * 5);
|
Thread.sleep(1000 * 60 * 1);
|
||||||
} catch (InterruptedException e) {
|
} catch (InterruptedException e) {
|
||||||
log.error(e.getMessage(), e);
|
log.error(e.getMessage(), e);
|
||||||
throw new RuntimeException(e.getMessage());
|
throw new RuntimeException(e.getMessage());
|
||||||
@ -175,13 +176,28 @@ public class BookService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 批量插入章节目录表和章节内容表
|
* 批量插入章节目录表和章节内容表(自动修复错误章节)
|
||||||
* */
|
* */
|
||||||
@Transactional(rollbackFor = Exception.class)
|
@Transactional(rollbackFor = Exception.class)
|
||||||
public void insertIndexListAndContentList(List<BookIndex> newBookIndexList, List<BookContent> newContentList) {
|
public void insertIndexListAndContentList(List<BookIndex> newBookIndexList, List<BookContent> newContentList) {
|
||||||
|
long start = System.currentTimeMillis();
|
||||||
|
if(newBookIndexList.size() > 0) {
|
||||||
|
//删除已存在的错误章节
|
||||||
|
List<Integer> indexNumberList = newBookIndexList.stream().map(BookIndex::getIndexNum).collect(Collectors.toList());
|
||||||
|
Long bookId = newBookIndexList.get(0).getBookId();
|
||||||
|
BookIndexExample bookIndexExample = new BookIndexExample();
|
||||||
|
bookIndexExample.createCriteria().andBookIdEqualTo(bookId).andIndexNumIn(indexNumberList);
|
||||||
|
bookIndexMapper.deleteByExample(bookIndexExample);
|
||||||
|
BookContentExample bookContentExample = new BookContentExample();
|
||||||
|
bookContentExample.createCriteria().andBookIdEqualTo(bookId).andIndexNumIn(indexNumberList);
|
||||||
|
bookContentMapper.deleteByExample(bookContentExample);
|
||||||
|
|
||||||
|
//插入新的章节
|
||||||
bookIndexMapper.insertBatch(newBookIndexList);
|
bookIndexMapper.insertBatch(newBookIndexList);
|
||||||
bookContentMapper.insertBatch(newContentList);
|
bookContentMapper.insertBatch(newContentList);
|
||||||
}
|
}
|
||||||
|
log.info("更新章节耗时:"+(System.currentTimeMillis()-start));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -307,7 +323,7 @@ public class BookService {
|
|||||||
/**
|
/**
|
||||||
* 查询该书籍已存在目录号
|
* 查询该书籍已存在目录号
|
||||||
*/
|
*/
|
||||||
public List<Integer> queryIndexNumByBookNameAndAuthor(String bookName, String author) {
|
public Map<Integer,BookIndex> queryIndexByBookNameAndAuthor(String bookName, String author) {
|
||||||
BookExample example = new BookExample();
|
BookExample example = new BookExample();
|
||||||
example.createCriteria().andBookNameEqualTo(bookName).andAuthorEqualTo(author);
|
example.createCriteria().andBookNameEqualTo(bookName).andAuthorEqualTo(author);
|
||||||
List<Book> books = bookMapper.selectByExample(example);
|
List<Book> books = bookMapper.selectByExample(example);
|
||||||
@ -317,13 +333,13 @@ public class BookService {
|
|||||||
BookIndexExample bookIndexExample = new BookIndexExample();
|
BookIndexExample bookIndexExample = new BookIndexExample();
|
||||||
bookIndexExample.createCriteria().andBookIdEqualTo(bookId);
|
bookIndexExample.createCriteria().andBookIdEqualTo(bookId);
|
||||||
List<BookIndex> bookIndices = bookIndexMapper.selectByExample(bookIndexExample);
|
List<BookIndex> bookIndices = bookIndexMapper.selectByExample(bookIndexExample);
|
||||||
if(bookIndices.size()>0) {
|
if(bookIndices.size() > 0) {
|
||||||
return bookIndices.stream().map(BookIndex::getIndexNum).collect(Collectors.toList());
|
return bookIndices.stream().collect(Collectors.toMap(BookIndex::getIndexNum, Function.identity()));
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return new ArrayList<>(0);
|
return new HashMap<>(0);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -45,6 +45,6 @@ dingdian:
|
|||||||
cat-pattern: 类别:([^/]+)</li>
|
cat-pattern: 类别:([^/]+)</li>
|
||||||
update-time-pattern: 更新:(\d+-\d+-\d+\s\d+:\d+:\d+)</a>
|
update-time-pattern: 更新:(\d+-\d+-\d+\s\d+:\d+:\d+)</a>
|
||||||
pic-pattern: <img src="([^>]+)"\s+onerror="this.src=
|
pic-pattern: <img src="([^>]+)"\s+onerror="this.src=
|
||||||
intro-pattern: class="review">([^<]+)</p>
|
intro-pattern: class="review">([^/]+)</p>
|
||||||
catalog-url-pattern: <a\s+href="(/ddk\d+/all.html)">查看完整目录</a>
|
catalog-url-pattern: <a\s+href="(/ddk\d+/all.html)">查看完整目录</a>
|
||||||
catalog-pattern: <a\s+style=""\s+href="(/ddk\d+/\d+\.html)">([^/]+)</a>
|
catalog-pattern: <a\s+style=""\s+href="(/ddk\d+/\d+\.html)">([^/]+)</a>
|
@ -9,6 +9,6 @@ mybatis:
|
|||||||
mysql: {charset: utf8mb4}
|
mysql: {charset: utf8mb4}
|
||||||
books: {lowestScore: '8.5'}
|
books: {lowestScore: '8.5'}
|
||||||
crawl:
|
crawl:
|
||||||
website: {type: '3'}
|
website: {type: '2'}
|
||||||
soft-novel: '0'
|
soft-novel: '0'
|
||||||
manhua: '0'
|
manhua: '0'
|
||||||
|
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user