爬虫代码重构,增加可维护性

This commit is contained in:
xiongxiaoyang 2019-12-12 11:37:21 +08:00
parent be3cf1bb91
commit 64f6dc393e
15 changed files with 644 additions and 623 deletions

View File

@ -19,6 +19,7 @@ import xyz.zinglizingli.common.constant.CacheKeyConstans;
import xyz.zinglizingli.common.enums.PicSaveType;
import xyz.zinglizingli.books.mapper.*;
import xyz.zinglizingli.books.po.*;
import xyz.zinglizingli.common.utils.Constants;
import xyz.zinglizingli.common.utils.UUIDUtils;
import xyz.zinglizingli.common.cache.CommonCacheUtil;
import xyz.zinglizingli.common.utils.RestTemplateUtil;
@ -96,7 +97,7 @@ public class BookService {
List<BookContent> newContentList = new ArrayList<>();
for (int i = 0; i < bookIndex.size(); i++) {
BookContent bookContentItem = bookContent.get(i);
if (!bookContentItem.getContent().contains("正在手打中,请稍等片刻,内容更新后,需要重新刷新页面,才能获取最新更新")) {
if (!bookContentItem.getContent().contains(Constants.NO_CONTENT_DESC)) {
BookIndex bookIndexItem = bookIndex.get(i);
bookIndexItem.setBookId(bookId);
bookContentItem.setBookId(bookId);
@ -207,57 +208,7 @@ public class BookService {
}
/**
* 获取分类名
* */
public String getCatNameById(Integer catid) {
String catName = "其他";
switch (catid) {
case 1: {
catName = "玄幻奇幻";
break;
}
case 2: {
catName = "武侠仙侠";
break;
}
case 3: {
catName = "都市言情";
break;
}
case 4: {
catName = "历史军事";
break;
}
case 5: {
catName = "科幻灵异";
break;
}
case 6: {
catName = "网游竞技";
break;
}
case 7: {
catName = "女生频道";
break;
}
case 8: {
catName = "轻小说";
break;
}
case 9: {
catName = "漫画";
break;
}
default: {
break;
}
}
return catName;
}
/**
* 查询书籍的基础数据
@ -374,87 +325,6 @@ public class BookService {
/**
* 查询轻小说分类名
* */
public String getSoftCatNameById(Integer softCat) {
String catName = "其他";
switch (softCat) {
case 21: {
catName = "魔幻";
break;
}
case 22: {
catName = "玄幻";
break;
}
case 23: {
catName = "古风";
break;
}
case 24: {
catName = "科幻";
break;
}
case 25: {
catName = "校园";
break;
}
case 26: {
catName = "都市";
break;
}
case 27: {
catName = "游戏";
break;
}
case 28: {
catName = "同人";
break;
}
case 29: {
catName = "悬疑";
break;
}
case 0: {
catName = "动漫";
break;
}
default: {
break;
}
}
return catName;
}
/**
* 查询漫画分类名
* */
public String getMhCatNameById(Integer softCat) {
String catName = "其他";
switch (softCat) {
case 3262: {
catName = "少年漫";
break;
}
case 3263: {
catName = "少女漫";
break;
}
default: {
break;
}
}
return catName;
}
/**
* 保存弹幕

View File

@ -15,6 +15,8 @@ import xyz.zinglizingli.books.po.BookIndex;
import xyz.zinglizingli.books.service.BookService;
import xyz.zinglizingli.books.vo.BookVO;
import xyz.zinglizingli.common.cache.CommonCacheUtil;
import xyz.zinglizingli.common.utils.CatUtil;
import xyz.zinglizingli.common.utils.Constants;
import java.util.*;
@ -71,7 +73,7 @@ public class ApiBookController {
String userId = null;
String titleType = "最近更新";
if (catId != null) {
titleType = bookService.getCatNameById(catId);
titleType = CatUtil.getCatNameById(catId);
} else if (keyword != null) {
titleType = "搜索";
} else if ("score".equals(sortBy)) {
@ -90,7 +92,7 @@ public class ApiBookController {
for (Book book : books) {
BookVO bookvo = new BookVO();
BeanUtils.copyProperties(book, bookvo);
bookvo.setCateName(bookService.getCatNameById(bookvo.getCatid()));
bookvo.setCateName(CatUtil.getCatNameById(bookvo.getCatid()));
bookVOList.add(bookvo);
}
@ -103,7 +105,7 @@ public class ApiBookController {
int index = idsArr.indexOf(book.getId() + "");
BookVO bookvo = new BookVO();
BeanUtils.copyProperties(book, bookvo);
bookvo.setCateName(bookService.getCatNameById(bookvo.getCatid()));
bookvo.setCateName(CatUtil.getCatNameById(bookvo.getCatid()));
bookVOArr[length - index - 1] = bookvo;
}
bookVOList = Arrays.asList(bookVOArr);
@ -139,7 +141,7 @@ public class ApiBookController {
BookVO bookvo = new BookVO();
BeanUtils.copyProperties(book, bookvo);
bookvo.setCateName(bookService.getCatNameById(bookvo.getCatid()));
bookvo.setCateName(CatUtil.getCatNameById(bookvo.getCatid()));
modelMap.put("bookId", bookId);
modelMap.put("book", bookvo);
modelMap.put("indexList", indexList);
@ -185,7 +187,7 @@ public class ApiBookController {
bookContent.setId(-1L);
bookContent.setBookId(bookId);
bookContent.setIndexNum(indexNum);
bookContent.setContent("正在手打中,请稍等片刻,内容更新后,需要重新刷新页面,才能获取最新更新");
bookContent.setContent(Constants.NO_CONTENT_DESC);
indexName="";
}else{
indexName = bookService.queryIndexNameByBookIdAndIndexNum(bookId, indexNum);

View File

@ -20,6 +20,7 @@ import xyz.zinglizingli.books.service.BookService;
import xyz.zinglizingli.books.service.UserService;
import xyz.zinglizingli.books.vo.BookVO;
import xyz.zinglizingli.common.cache.CommonCacheUtil;
import xyz.zinglizingli.common.utils.CatUtil;
import xyz.zinglizingli.common.utils.Constants;
import javax.servlet.http.HttpServletResponse;
@ -62,7 +63,7 @@ public class BookController {
String userId = null;
String titleType = "最近更新";
if (catId != null) {
titleType = bookService.getCatNameById(catId) + "分类频道";
titleType = CatUtil.getCatNameById(catId) + "分类频道";
} else if (Constants.NOVEL_TOP_FIELD.equals(sortBy)) {
titleType = "小说排行";
} else if (ids != null) {
@ -84,7 +85,7 @@ public class BookController {
for (Book book : books) {
BookVO bookvo = new BookVO();
BeanUtils.copyProperties(book, bookvo);
bookvo.setCateName(bookService.getCatNameById(bookvo.getCatid()));
bookvo.setCateName(CatUtil.getCatNameById(bookvo.getCatid()));
bookVoList.add(bookvo);
}
@ -97,7 +98,7 @@ public class BookController {
int index = idsArr.indexOf(book.getId() + "");
BookVO bookvo = new BookVO();
BeanUtils.copyProperties(book, bookvo);
bookvo.setCateName(bookService.getCatNameById(bookvo.getCatid()));
bookvo.setCateName(CatUtil.getCatNameById(bookvo.getCatid()));
bookVoArr[books.size() - index - 1] = bookvo;
}
bookVoList = Arrays.asList(bookVoArr);
@ -144,10 +145,10 @@ public class BookController {
BeanUtils.copyProperties(book, bookvo);
if(catId == Constants.SOFT_NOVEL_CAT) {
//轻小说
bookvo.setCateName(bookService.getSoftCatNameById(bookvo.getSoftCat()));
bookvo.setCateName(CatUtil.getSoftCatNameById(bookvo.getSoftCat()));
}else if(catId == Constants.MH_NOVEL_CAT){
//漫画
bookvo.setCateName(bookService.getMhCatNameById(bookvo.getSoftCat()));
bookvo.setCateName(CatUtil.getMhCatNameById(bookvo.getSoftCat()));
}
bookVoList.add(bookvo);
}
@ -204,7 +205,7 @@ public class BookController {
BookVO bookvo = new BookVO();
BeanUtils.copyProperties(book, bookvo);
bookvo.setCateName(bookService.getCatNameById(bookvo.getCatid()));
bookvo.setCateName(CatUtil.getCatNameById(bookvo.getCatid()));
modelMap.put("bookId", bookId);
modelMap.put("book", bookvo);
@ -243,7 +244,7 @@ public class BookController {
bookContent.setId(-1L);
bookContent.setBookId(bookId);
bookContent.setIndexNum(indexNum);
bookContent.setContent("正在手打中,请稍等片刻,内容更新后,需要重新刷新页面,才能获取最新更新");
bookContent.setContent(Constants.NO_CONTENT_DESC);
indexName = "更新中。。。";
} else {
indexName = bookService.queryIndexNameByBookIdAndIndexNum(bookId, indexNum);

View File

@ -0,0 +1,27 @@
package xyz.zinglizingli.common.config;
import lombok.extern.slf4j.Slf4j;
import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
import org.springframework.boot.context.properties.ConfigurationProperties;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import xyz.zinglizingli.common.crawl.BaseHtmlCrawlSource;
import xyz.zinglizingli.common.crawl.BiquCrawlSource;
/**
 * Spring configuration that registers the Biqudao crawl source.
 *
 * @author 11797
 */
@Slf4j
@Configuration
public class CrawlBiqudaoConfig {

    /**
     * Creates the crawl source whose properties are bound from the
     * {@code biqudao.crawlsource} section of the YAML configuration.
     * The bean is only registered when {@code biqudao.crawlsource.enabled} is {@code true}.
     *
     * <p>The method name doubles as the Spring bean name. It used to be
     * {@code BiqutaCrawlSource}, which is the same name the bean method in
     * {@code CrawlBiqutaConfig} uses; two bean definitions with one name either
     * silently replace each other or fail at startup, depending on the Spring Boot
     * version. A distinct name lets both sources be registered side by side.
     *
     * @return a new, not yet configured crawl source; Spring binds its properties afterwards
     */
    @Bean
    // The prefix must match the property prefix used in the YAML configuration.
    @ConfigurationProperties(prefix = "biqudao.crawlsource")
    @ConditionalOnProperty(prefix = "biqudao.crawlsource", name = "enabled", havingValue = "true")
    public BaseHtmlCrawlSource biqudaoCrawlSource() {
        return new BiquCrawlSource();
    }
}

View File

@ -0,0 +1,29 @@
package xyz.zinglizingli.common.config;
import lombok.extern.slf4j.Slf4j;
import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
import org.springframework.boot.context.properties.ConfigurationProperties;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import org.springframework.context.annotation.Primary;
import xyz.zinglizingli.common.crawl.BaseHtmlCrawlSource;
import xyz.zinglizingli.common.crawl.BiquCrawlSource;
/**
 * Spring configuration that registers the Biquta crawl source.
 *
 * @author 11797
 */
@Configuration
@Slf4j
public class CrawlBiqutaConfig {

    /**
     * Builds the crawl source bound to the {@code biquta.crawlsource} properties.
     * Registered only when {@code biquta.crawlsource.enabled} is {@code true}.
     *
     * <p>Original author's note on {@code @Primary}: the annotation is required here,
     * otherwise an error is raised; the other crawl-source configuration class does
     * not need it.
     *
     * @return a new crawl source instance; its properties are bound by Spring
     */
    @Primary
    @Bean
    @ConditionalOnProperty(prefix = "biquta.crawlsource", name = "enabled", havingValue = "true")
    // The prefix must match the property prefix used in the YAML configuration.
    @ConfigurationProperties(prefix = "biquta.crawlsource")
    public BaseHtmlCrawlSource BiqutaCrawlSource() {
        BaseHtmlCrawlSource source = new BiquCrawlSource();
        return source;
    }
}

View File

@ -0,0 +1,20 @@
package xyz.zinglizingli.common.crawl;
import lombok.Data;
import org.springframework.beans.factory.annotation.Value;
/**
 * Base type for every crawl source.
 *
 * <p>Holds the settings shared by all sources and declares the single entry
 * point, {@link #parse()}, that concrete sources implement.
 *
 * @author 11797
 */
@Data
public abstract class BaseCrawlSource {
/**
 * Score threshold injected from the {@code books.lowestScore} property.
 * BiquCrawlSource skips every book whose parsed score is below this value.
 */
@Value("${books.lowestScore}")
private Float lowestScore;
/**
 * Parses the data of this crawl source.
 */
public abstract void parse();
}

View File

@ -0,0 +1,81 @@
package xyz.zinglizingli.common.crawl;
import lombok.Data;
/**
 * Crawl source that scrapes HTML pages.
 *
 * <p>Every {@code *Pattern} field holds the text of a regular expression.
 * BiquCrawlSource compiles each one and reads capture group 1 from the match
 * (and additionally group 2 for {@link #catalogPattern}).
 *
 * @author 11797
 */
@Data
public abstract class BaseHtmlCrawlSource extends BaseCrawlSource{
/**
 * Home page URL of the site; BiquCrawlSource prefixes it to the book,
 * catalog and chapter paths it extracts.
 */
private String indexUrl;
/**
 * List page URL. BiquCrawlSource replaces the placeholders {@code {0}} and
 * {@code {1}} in it before fetching the page.
 */
private String listPageUrl;
/**
 * Pattern for the book URL.
 */
private String bookUrlPattern;
/**
 * Pattern for the score.
 */
private String scorePattern;
/**
 * Pattern for the book name.
 */
private String bookNamePattern;
/**
 * Pattern for the author.
 */
private String authorPattern;
/**
 * Pattern for the book status.
 */
private String statusPattern;
/**
 * Pattern for the category.
 */
private String catPattern;
/**
 * Pattern for the update time.
 */
private String updateTimePattern;
/**
 * Pattern for the cover picture.
 */
private String picPattern;
/**
 * Pattern for the introduction.
 */
private String introPattern;
/**
 * Pattern for the URL of the full catalog page.
 */
private String catalogUrlPattern;
/**
 * Pattern for one catalog entry; BiquCrawlSource reads group 1 as the
 * chapter path and group 2 as the chapter name.
 */
private String catalogPattern;
}

View File

@ -0,0 +1,8 @@
package xyz.zinglizingli.common.crawl;
/**
 * Crawl source that reads JSON. It currently declares no members of its own;
 * concrete JSON sources are expected to extend it.
 *
 * @author 11797
 */
public abstract class BaseJsonCrawlSource extends BaseCrawlSource{
}

View File

@ -0,0 +1,216 @@
package xyz.zinglizingli.common.crawl;
import lombok.extern.slf4j.Slf4j;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import xyz.zinglizingli.books.po.Book;
import xyz.zinglizingli.books.po.BookContent;
import xyz.zinglizingli.books.po.BookIndex;
import xyz.zinglizingli.books.service.BookService;
import xyz.zinglizingli.common.utils.CatUtil;
import xyz.zinglizingli.common.utils.ExcutorUtils;
import xyz.zinglizingli.common.utils.RestTemplateUtil;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import static java.util.regex.Pattern.compile;
/**
* @author 11797
*/
@Slf4j
public class BiquCrawlSource extends BaseHtmlCrawlSource {
@Autowired
private BookService bookService;
@Override
public void parse() {
String catBookListUrl = getListPageUrl().replace("{0}", "0").replace("{1}", "1");
String forObject = RestTemplateUtil.getBodyByUtf8(catBookListUrl);
if (forObject != null) {
//解析第一页书籍的数据
Pattern bookPatten = compile(getBookUrlPattern());
Matcher bookMatcher = bookPatten.matcher(forObject);
boolean isFind = bookMatcher.find();
Pattern scorePatten = compile(getScorePattern());
Matcher scoreMatch = scorePatten.matcher(forObject);
boolean scoreFind = scoreMatch.find();
Pattern bookNamePatten = compile(getBookNamePattern());
Matcher bookNameMatch = bookNamePatten.matcher(forObject);
boolean isBookNameMatch = bookNameMatch.find();
while (isFind && scoreFind && isBookNameMatch) {
try {
Float score = Float.parseFloat(scoreMatch.group(1));
if (score < getLowestScore()) {
continue;
}
String bokNum = bookMatcher.group(1);
String bookUrl = getIndexUrl() + "/" + bokNum + "/";
String body = RestTemplateUtil.getBodyByUtf8(bookUrl);
if (body != null) {
String bookName = bookNameMatch.group(1);
Pattern authorPatten = compile(getAuthorPattern());
Matcher authoreMatch = authorPatten.matcher(body);
if (authoreMatch.find()) {
String author = authoreMatch.group(1);
Pattern statusPatten = compile(getStatusPattern());
Matcher statusMatch = statusPatten.matcher(body);
if (statusMatch.find()) {
String status = statusMatch.group(1);
Pattern catPatten = compile(getCatPattern());
Matcher catMatch = catPatten.matcher(body);
if (catMatch.find()) {
String catName = catMatch.group(1);
int catNum = CatUtil.getCatNum(catName);
Pattern updateTimePatten = compile(getUpdateTimePattern());
Matcher updateTimeMatch = updateTimePatten.matcher(body);
if (updateTimeMatch.find()) {
String updateTimeStr = updateTimeMatch.group(1);
SimpleDateFormat format = new SimpleDateFormat("yy-MM-dd HH:mm:ss");
Date updateTime = format.parse(updateTimeStr);
Pattern picPatten = compile(getPicPattern());
Matcher picMather = picPatten.matcher(body);
if (picMather.find()) {
String picSrc = picMather.group(1);
Pattern descPatten = compile(getIntroPattern());
Matcher descMatch = descPatten.matcher(body);
if (descMatch.find()) {
String desc = descMatch.group(1);
Book book = new Book();
book.setAuthor(author);
book.setCatid(catNum);
book.setBookDesc(desc);
book.setBookName(bookName);
book.setScore(score > 10 ? 8.0f : score);
book.setPicUrl(picSrc);
book.setBookStatus(status);
book.setUpdateTime(updateTime);
List<BookIndex> indexList = new ArrayList<>();
List<BookContent> contentList = new ArrayList<>();
//读取目录
Pattern indexPatten = compile(getCatalogUrlPattern());
Matcher indexMatch = indexPatten.matcher(body);
if (indexMatch.find()) {
String indexUrl = getIndexUrl() + indexMatch.group(1);
String body2 = RestTemplateUtil.getBodyByUtf8(indexUrl);
if (body2 != null) {
Pattern indexListPatten = compile(getCatalogPattern());
Matcher indexListMatch = indexListPatten.matcher(body2);
boolean isFindIndex = indexListMatch.find();
int indexNum = 0;
//查询该书籍已存在目录号
List<Integer> hasIndexNum = bookService.queryIndexNumByBookNameAndAuthor(bookName, author);
//更新和插入分别开插入只在凌晨做一次
if (hasIndexNum.size() > 0) {
while (isFindIndex) {
if (!hasIndexNum.contains(indexNum)) {
String contentUrl = getIndexUrl() + indexListMatch.group(1);
String indexName = indexListMatch.group(2);
//查询章节内容
String body3 = RestTemplateUtil.getBodyByUtf8(contentUrl);
if (body3 != null) {
String start = "『章节错误,点此举报』";
String end = "『加入书签,方便阅读』";
String content = body3.substring(body3.indexOf(start) + start.length(), body3.indexOf(end));
//TODO插入章节目录和章节内容
BookIndex bookIndex = new BookIndex();
bookIndex.setIndexName(indexName);
bookIndex.setIndexNum(indexNum);
indexList.add(bookIndex);
BookContent bookContent = new BookContent();
bookContent.setContent(content);
bookContent.setIndexNum(indexNum);
contentList.add(bookContent);
} else {
break;
}
}
indexNum++;
isFindIndex = indexListMatch.find();
}
if (indexList.size() == contentList.size() && indexList.size() > 0) {
ExcutorUtils.excuteFixedTask(() ->
bookService.saveBookAndIndexAndContent(book, indexList, contentList)
);
}
}
}
}
}
}
}
}
}
}
}
} catch (Exception e) {
e.printStackTrace();
} finally {
bookMatcher.find();
isFind = bookMatcher.find();
scoreFind = scoreMatch.find();
isBookNameMatch = bookNameMatch.find();
}
}
}
}
}

View File

@ -3,28 +3,13 @@ package xyz.zinglizingli.common.schedule;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.codec.Charsets;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.http.HttpStatus;
import org.springframework.http.ResponseEntity;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Service;
import org.springframework.web.client.RestTemplate;
import xyz.zinglizingli.books.po.Book;
import xyz.zinglizingli.books.po.BookContent;
import xyz.zinglizingli.books.po.BookIndex;
import xyz.zinglizingli.books.service.BookService;
import xyz.zinglizingli.common.utils.ExcutorUtils;
import xyz.zinglizingli.common.crawl.BaseCrawlSource;
import xyz.zinglizingli.common.utils.RestTemplateUtil;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import static java.util.regex.Pattern.compile;
/**
* 更新书籍章节内容定时任务
*
@ -35,19 +20,8 @@ import static java.util.regex.Pattern.compile;
@Slf4j
public class CrawlBooksSchedule {
private final BookService bookService;
private RestTemplate utf8RestTemplate = RestTemplateUtil.getInstance(Charsets.UTF_8);
@Value("${books.lowestScore}")
private Float lowestScore;
@Value("${crawl.website.type}")
private Byte websiteType;
@Value("${pic.save.path}")
private String picSavePath;
private final BaseCrawlSource crawlSource;
/**
@ -58,459 +32,11 @@ public class CrawlBooksSchedule {
log.debug("crawlBooksSchedule执行中。。。。。。。。。。。。");
switch (websiteType) {
case 1: {
updateBiqudaoBooks(0);
break;
}
case 2: {
updateBiquTaBooks(0);
break;
}
default: {
break;
}
}
}
/**
* 从笔趣塔更新
*/
private void updateBiquTaBooks(int bookClass) {
String baseUrl = "https://m.biquta.la";
String catBookListUrlBase = baseUrl + "/class/";
String catBookListUrl = catBookListUrlBase + bookClass + "/" + 1 + ".html";
String forObject = getByRestTemplate(catBookListUrl);
if (forObject != null) {
Pattern pattern = compile("value=\"(\\d+)/(\\d+)\"");
Matcher matcher = pattern.matcher(forObject);
boolean isFind = matcher.find();
if (isFind) {
//解析第一页书籍的数据
Pattern bookPatten = compile("href=\"/(\\d+_\\d+)/\"");
parseBiquTaBook(bookPatten, forObject, baseUrl);
}
}
}
/**
* 解析笔趣塔数据
*/
private void parseBiquTaBook(Pattern bookPatten, String forObject, String baseUrl) {
Matcher bookMatcher = bookPatten.matcher(forObject);
boolean isFind = bookMatcher.find();
Pattern scorePatten = compile("<div\\s+class=\"score\">(\\d+\\.\\d+)分</div>");
Matcher scoreMatch = scorePatten.matcher(forObject);
boolean scoreFind = scoreMatch.find();
Pattern bookNamePatten = compile("<p class=\"title\">([^/]+)</p>");
Matcher bookNameMatch = bookNamePatten.matcher(forObject);
boolean isBookNameMatch = bookNameMatch.find();
while (isFind && scoreFind && isBookNameMatch) {
try {
Float score = Float.parseFloat(scoreMatch.group(1));
if (score < lowestScore) {
continue;
}
String bokNum = bookMatcher.group(1);
String bookUrl = baseUrl + "/" + bokNum + "/";
String body = getByRestTemplate(bookUrl);
if (body != null) {
String bookName = bookNameMatch.group(1);
Pattern authorPatten = compile(">作者:([^/]+)<");
Matcher authoreMatch = authorPatten.matcher(body);
if (authoreMatch.find()) {
String author = authoreMatch.group(1);
Pattern statusPatten = compile("状态:([^/]+)</li>");
Matcher statusMatch = statusPatten.matcher(body);
if (statusMatch.find()) {
String status = statusMatch.group(1);
Pattern catPatten = compile("类别:([^/]+)</li>");
Matcher catMatch = catPatten.matcher(body);
if (catMatch.find()) {
String catName = catMatch.group(1);
int catNum = getCatNum(catName);
Pattern updateTimePatten = compile("更新:(\\d+-\\d+-\\d+\\s\\d+:\\d+:\\d+)</a>");
Matcher updateTimeMatch = updateTimePatten.matcher(body);
if (updateTimeMatch.find()) {
String updateTimeStr = updateTimeMatch.group(1);
SimpleDateFormat format = new SimpleDateFormat("yy-MM-dd HH:mm:ss");
Date updateTime = format.parse(updateTimeStr);
Pattern picPatten = compile("<img src=\"([^>]+)\"\\s+onerror=\"this.src=");
Matcher picMather = picPatten.matcher(body);
if (picMather.find()) {
String picSrc = picMather.group(1);
Pattern descPatten = compile("class=\"review\">([^<]+)</p>");
Matcher descMatch = descPatten.matcher(body);
if (descMatch.find()) {
String desc = descMatch.group(1);
Book book = new Book();
book.setAuthor(author);
book.setCatid(catNum);
book.setBookDesc(desc);
book.setBookName(bookName);
book.setScore(score > 10 ? 8.0f : score);
book.setPicUrl(picSrc);
book.setBookStatus(status);
book.setUpdateTime(updateTime);
List<BookIndex> indexList = new ArrayList<>();
List<BookContent> contentList = new ArrayList<>();
//读取目录
Pattern indexPatten = compile("<a\\s+href=\"(/du/\\d+_\\d+/)\">查看完整目录</a>");
Matcher indexMatch = indexPatten.matcher(body);
if (indexMatch.find()) {
String indexUrl = baseUrl + indexMatch.group(1);
String body2 = getByRestTemplate(indexUrl);
if (body2 != null) {
Pattern indexListPatten = compile("<a\\s+style=\"\"\\s+href=\"(/\\d+_\\d+/\\d+\\.html)\">([^/]+)</a>");
Matcher indexListMatch = indexListPatten.matcher(body2);
boolean isFindIndex = indexListMatch.find();
int indexNum = 0;
//查询该书籍已存在目录号
List<Integer> hasIndexNum = bookService.queryIndexNumByBookNameAndAuthor(bookName, author);
//更新和插入分别开插入只在凌晨做一次
if (hasIndexNum.size() > 0) {
while (isFindIndex) {
if (!hasIndexNum.contains(indexNum)) {
String contentUrl = baseUrl + indexListMatch.group(1);
String indexName = indexListMatch.group(2);
//查询章节内容
String body3 = getByRestTemplate(contentUrl);
if (body3 != null) {
String start = "『章节错误,点此举报』";
String end = "『加入书签,方便阅读』";
String content = body3.substring(body3.indexOf(start) + start.length(), body3.indexOf(end));
//TODO插入章节目录和章节内容
BookIndex bookIndex = new BookIndex();
bookIndex.setIndexName(indexName);
bookIndex.setIndexNum(indexNum);
indexList.add(bookIndex);
BookContent bookContent = new BookContent();
bookContent.setContent(content);
bookContent.setIndexNum(indexNum);
contentList.add(bookContent);
} else {
break;
}
}
indexNum++;
isFindIndex = indexListMatch.find();
}
if (indexList.size() == contentList.size() && indexList.size() > 0) {
ExcutorUtils.excuteFixedTask(() ->
bookService.saveBookAndIndexAndContent(book, indexList, contentList)
);
}
}
}
crawlSource.parse();
}
}
}
}
}
}
}
}
} catch (Exception e) {
e.printStackTrace();
} finally {
bookMatcher.find();
isFind = bookMatcher.find();
scoreFind = scoreMatch.find();
isBookNameMatch = bookNameMatch.find();
}
}
}
/**
* 从笔趣岛更新
*/
private void updateBiqudaoBooks(int bookClass) {
String baseUrl = "https://m.biqudao.com";
String catBookListUrlBase = baseUrl + "/bqgeclass/";
int page = 1;
String catBookListUrl = catBookListUrlBase + bookClass + "/" + page + ".html";
String forObject = getByRestTemplate(catBookListUrl);
if (forObject != null) {
Pattern pattern = compile("value=\"(\\d+)/(\\d+)\"");
Matcher matcher = pattern.matcher(forObject);
boolean isFind = matcher.find();
if (isFind) {
//解析第一页书籍的数据
Pattern bookPatten = compile("href=\"/(bqge\\d+)/\"");
parseBiquDaoBook(bookPatten, forObject, baseUrl);
}
}
}
/**
* 解析笔趣岛数据
*/
private void parseBiquDaoBook(Pattern bookPatten, String forObject, String baseUrl) {
Matcher bookMatcher = bookPatten.matcher(forObject);
boolean isFind = bookMatcher.find();
Pattern scorePatten = compile("<div\\s+class=\"score\">(\\d+\\.\\d+)分</div>");
Matcher scoreMatch = scorePatten.matcher(forObject);
boolean scoreFind = scoreMatch.find();
Pattern bookNamePatten = compile("<p class=\"title\">([^/]+)</p>");
Matcher bookNameMatch = bookNamePatten.matcher(forObject);
boolean isBookNameMatch = bookNameMatch.find();
while (isFind && scoreFind && isBookNameMatch) {
try {
Float score = Float.parseFloat(scoreMatch.group(1));
if (score < lowestScore) {
continue;
}
String bokNum = bookMatcher.group(1);
String bookUrl = baseUrl + "/" + bokNum + "/";
String body = getByRestTemplate(bookUrl);
if (body != null) {
String bookName = bookNameMatch.group(1);
Pattern authorPatten = compile("<li class=\"author\">作者:([^/]+)</li>");
Matcher authoreMatch = authorPatten.matcher(body);
if (authoreMatch.find()) {
String author = authoreMatch.group(1);
Pattern statusPatten = compile("状态:([^/]+)</li>");
Matcher statusMatch = statusPatten.matcher(body);
if (statusMatch.find()) {
String status = statusMatch.group(1);
Pattern catPatten = compile("类别:([^/]+)</li>");
Matcher catMatch = catPatten.matcher(body);
if (catMatch.find()) {
String catName = catMatch.group(1);
int catNum = getCatNum(catName);
Pattern updateTimePatten = compile("更新:(\\d+-\\d+-\\d+\\s\\d+:\\d+:\\d+)</a>");
Matcher updateTimeMatch = updateTimePatten.matcher(body);
if (updateTimeMatch.find()) {
String updateTimeStr = updateTimeMatch.group(1);
SimpleDateFormat format = new SimpleDateFormat("yy-MM-dd HH:mm:ss");
Date updateTime = format.parse(updateTimeStr);
Pattern picPatten = compile("<img src=\"([^>]+)\"\\s+onerror=\"this.src=");
Matcher picMather = picPatten.matcher(body);
if (picMather.find()) {
String picSrc = picMather.group(1);
Pattern descPatten = compile("class=\"review\">([^<]+)</p>");
Matcher descMatch = descPatten.matcher(body);
if (descMatch.find()) {
String desc = descMatch.group(1);
Book book = new Book();
book.setAuthor(author);
book.setCatid(catNum);
book.setBookDesc(desc);
book.setBookName(bookName);
book.setScore(score > 10 ? 8.0f : score);
book.setPicUrl(picSrc);
book.setBookStatus(status);
book.setUpdateTime(updateTime);
List<BookIndex> indexList = new ArrayList<>();
List<BookContent> contentList = new ArrayList<>();
//读取目录
Pattern indexPatten = compile("<a\\s+href=\"(/bqge\\d+/all\\.html)\">查看完整目录</a>");
Matcher indexMatch = indexPatten.matcher(body);
if (indexMatch.find()) {
String indexUrl = baseUrl + indexMatch.group(1);
String body2 = getByRestTemplate(indexUrl);
if (body2 != null) {
Pattern indexListPatten = compile("<a[^/]+style[^/]+href=\"(/bqge\\d+/\\d+\\.html)\">([^/]+)</a>");
Matcher indexListMatch = indexListPatten.matcher(body2);
boolean isFindIndex = indexListMatch.find();
int indexNum = 0;
//查询该书籍已存在目录号
List<Integer> hasIndexNum = bookService.queryIndexNumByBookNameAndAuthor(bookName, author);
//只更新已存在的书籍
if (hasIndexNum.size() > 0) {
while (isFindIndex) {
if (!hasIndexNum.contains(indexNum)) {
String contentUrl = baseUrl + indexListMatch.group(1);
String indexName = indexListMatch.group(2);
//查询章节内容
String body3 = getByRestTemplate(contentUrl);
if (body3 != null) {
String start = "『章节错误,点此举报』";
String end = "『加入书签,方便阅读』";
String content = body3.substring(body3.indexOf(start) + start.length(), body3.indexOf(end));
//TODO插入章节目录和章节内容
BookIndex bookIndex = new BookIndex();
bookIndex.setIndexName(indexName);
bookIndex.setIndexNum(indexNum);
indexList.add(bookIndex);
BookContent bookContent = new BookContent();
bookContent.setContent(content);
bookContent.setIndexNum(indexNum);
contentList.add(bookContent);
} else {
break;
}
}
indexNum++;
isFindIndex = indexListMatch.find();
}
if (indexList.size() == contentList.size() && indexList.size() > 0) {
ExcutorUtils.excuteFixedTask(() -> bookService.saveBookAndIndexAndContent(book, indexList, contentList));
}
}
}
}
}
}
}
}
}
}
}
} catch (Exception e) {
e.printStackTrace();
} finally {
bookMatcher.find();
isFind = bookMatcher.find();
scoreFind = scoreMatch.find();
isBookNameMatch = bookNameMatch.find();
}
}
}
private int getCatNum(String catName) {
int catNum;
switch (catName) {
case "武侠仙侠": {
catNum = 2;
break;
}
case "都市言情": {
catNum = 3;
break;
}
case "历史军事": {
catNum = 4;
break;
}
case "科幻灵异": {
catNum = 5;
break;
}
case "网游竞技": {
catNum = 6;
break;
}
case "女生频道": {
catNum = 7;
break;
}
default: {
catNum = 1;
break;
}
}
return catNum;
}
private String getByRestTemplate(String url) {
try {
ResponseEntity<String> forEntity = utf8RestTemplate.getForEntity(url, String.class);
if (forEntity.getStatusCode() == HttpStatus.OK) {
return forEntity.getBody();
} else {
return null;
}
} catch (Exception e) {
log.error(e.getMessage(), e);
return null;
}
}
}

View File

@ -0,0 +1,179 @@
package xyz.zinglizingli.common.utils;
/**
 * Static lookups between category identifiers and their display names.
 *
 * <p>Every lookup is null-safe: a {@code null} argument is treated like an
 * unknown value and yields the same default as an unrecognised one.
 *
 * @author 11797
 */
public class CatUtil {

    /** Name returned for a null or unrecognised category id. */
    private static final String UNKNOWN_CAT_NAME = "其他";

    /** Category number returned for a null or unrecognised category name. */
    private static final int DEFAULT_CAT_NUM = 1;

    /**
     * Maps a category name to its category number.
     *
     * @param catName category name, may be {@code null}
     * @return the matching number (2-7), or 1 for {@code null} and any other name
     */
    public static int getCatNum(String catName) {
        if (catName == null) {
            // Switching on a null String throws NullPointerException.
            return DEFAULT_CAT_NUM;
        }
        switch (catName) {
            case "武侠仙侠": {
                return 2;
            }
            case "都市言情": {
                return 3;
            }
            case "历史军事": {
                return 4;
            }
            case "科幻灵异": {
                return 5;
            }
            case "网游竞技": {
                return 6;
            }
            case "女生频道": {
                return 7;
            }
            default: {
                return DEFAULT_CAT_NUM;
            }
        }
    }

    /**
     * Looks up the light-novel category name.
     *
     * @param softCat light-novel category id, may be {@code null}
     * @return the category name, or "其他" for {@code null} and unknown ids
     */
    public static String getSoftCatNameById(Integer softCat) {
        if (softCat == null) {
            // Unboxing a null Integer in the switch would throw NullPointerException.
            return UNKNOWN_CAT_NAME;
        }
        switch (softCat) {
            case 21: {
                return "魔幻";
            }
            case 22: {
                return "玄幻";
            }
            case 23: {
                return "古风";
            }
            case 24: {
                return "科幻";
            }
            case 25: {
                return "校园";
            }
            case 26: {
                return "都市";
            }
            case 27: {
                return "游戏";
            }
            case 28: {
                return "同人";
            }
            case 29: {
                return "悬疑";
            }
            case 0: {
                return "动漫";
            }
            default: {
                return UNKNOWN_CAT_NAME;
            }
        }
    }

    /**
     * Looks up the comic category name.
     *
     * @param softCat comic category id, may be {@code null}
     * @return the category name, or "其他" for {@code null} and unknown ids
     */
    public static String getMhCatNameById(Integer softCat) {
        if (softCat == null) {
            return UNKNOWN_CAT_NAME;
        }
        switch (softCat) {
            case 3262: {
                return "少年漫";
            }
            case 3263: {
                return "少女漫";
            }
            default: {
                return UNKNOWN_CAT_NAME;
            }
        }
    }

    /**
     * Looks up the name of a top-level category.
     *
     * @param catid category id, may be {@code null}
     * @return the category name, or "其他" for {@code null} and unknown ids
     */
    public static String getCatNameById(Integer catid) {
        if (catid == null) {
            return UNKNOWN_CAT_NAME;
        }
        switch (catid) {
            case 1: {
                return "玄幻奇幻";
            }
            case 2: {
                return "武侠仙侠";
            }
            case 3: {
                return "都市言情";
            }
            case 4: {
                return "历史军事";
            }
            case 5: {
                return "科幻灵异";
            }
            case 6: {
                return "网游竞技";
            }
            case 7: {
                return "女生频道";
            }
            case 8: {
                return "轻小说";
            }
            case 9: {
                return "漫画";
            }
            default: {
                return UNKNOWN_CAT_NAME;
            }
        }
    }
}

View File

@ -85,4 +85,9 @@ public class Constants {
* 多本书籍ID分隔符
* */
public static final String BOOK_ID_SEPARATOR = "-";
/**
* 没有内容的描述
* */
public static final String NO_CONTENT_DESC = "正在手打中,请稍等片刻,内容更新后,需要重新刷新页面,才能获取最新更新";
}

View File

@ -1,5 +1,9 @@
package xyz.zinglizingli.common.utils;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.codec.Charsets;
import org.springframework.http.HttpStatus;
import org.springframework.http.ResponseEntity;
import org.springframework.http.client.HttpComponentsClientHttpRequestFactory;
import org.springframework.http.converter.HttpMessageConverter;
import org.springframework.http.converter.StringHttpMessageConverter;
@ -10,6 +14,10 @@ import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
* @author 11797
*/
@Slf4j
public class RestTemplateUtil {
private static Map<String,RestTemplate> restTemplateMap = new HashMap<>();
@ -35,4 +43,18 @@ public class RestTemplateUtil {
return restTemplate;
}
/**
 * Performs a GET request with the UTF-8 rest template and returns the response body.
 *
 * @param url address to fetch
 * @return the body when the response status is 200 OK; {@code null} for any other
 *         status or when the request throws (the exception is logged)
 */
public static String getBodyByUtf8(String url) {
    String body = null;
    try {
        ResponseEntity<String> response = getInstance(Charsets.UTF_8).getForEntity(url, String.class);
        if (HttpStatus.OK == response.getStatusCode()) {
            body = response.getBody();
        }
    } catch (Exception ex) {
        log.error(ex.getMessage(), ex);
    }
    return body;
}
}

View File

@ -0,0 +1,33 @@
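# Crawl source definitions. Each source (biquta, biqudao) is switched on or off with its own `enabled` flag; support for more sites is planned.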
biquta:
crawlsource:
enabled: true #是否开启此爬虫源
index-url: https://m.biquta.la
list-page-url: https://m.biquta.la/class/{0}/{1}.html
book-url-pattern: href="/(\d+_\d+)/"
score-pattern: <div\s+class="score">(\d+\.\d+)分</div>
book-name-pattern: <p class="title">([^/]+)</p>
author-pattern: 作者:([^/]+)<
status-pattern: 状态:([^/]+)</li>
cat-pattern: 类别:([^/]+)</li>
update-time-pattern: 更新:(\d+-\d+-\d+\s\d+:\d+:\d+)</a>
pic-pattern: <img src="([^>]+)"\s+onerror="this.src=
intro-pattern: class="review">([^<]+)</p>
catalog-url-pattern: <a\s+href="(/du/\d+_\d+/)">查看完整目录</a>
catalog-pattern: <a\s+style=""\s+href="(/\d+_\d+/\d+\.html)">([^/]+)</a>
biqudao:
crawlsource:
enabled: true #是否开启此爬虫源
index-url: https://m.biqudao.com
list-page-url: https://m.biqudao.com/bqgeclass/{0}/{1}.html
book-url-pattern: href="/(bqge\d+)/"
score-pattern: <div\s+class="score">(\d+\.\d+)分</div>
book-name-pattern: <p class="title">([^/]+)</p>
author-pattern: <li class="author">作者:([^/]+)</li>
status-pattern: 状态:([^/]+)</li>
cat-pattern: 类别:([^/]+)</li>
update-time-pattern: 更新:(\d+-\d+-\d+\s\d+:\d+:\d+)</a>
pic-pattern: <img src="([^>]+)"\s+onerror="this.src=
intro-pattern: class="review">([^<]+)</p>
catalog-url-pattern: <a\s+href="(/bqge\d+/all\.html)">查看完整目录</a>
catalog-pattern: <a[^/]+style[^/]+href="(/bqge\d+/\d+\.html)">([^/]+)</a>

View File

@ -4,8 +4,8 @@ server:
spring:
datasource:
url: jdbc:mysql://127.0.0.1:3306/books?useUnicode=true&characterEncoding=utf-8&useSSL=false&allowPublicKeyRetrieval=true&serverTimezone=Asia/Shanghai
username: books
password: books
username: root
password: test123456
# url: jdbc:mysql://127.0.0.1:3306/books?useUnicode=true&characterEncoding=utf8&useSSL=false&allowPublicKeyRetrieval=true&serverTimezone=Asia/Shanghai
# username: root
# password: test123456
@ -39,6 +39,8 @@ spring:
port: 465
class: javax.net.ssl.SSLSocketFactory
fallback: false
profiles:
include: crawl