From 64f6dc393e786d5e1b8e11ebddedafca5bac4a6b Mon Sep 17 00:00:00 2001 From: xiongxiaoyang <773861846@qq.com> Date: Thu, 12 Dec 2019 11:37:21 +0800 Subject: [PATCH] =?UTF-8?q?=E7=88=AC=E8=99=AB=E4=BB=A3=E7=A0=81=E9=87=8D?= =?UTF-8?q?=E6=9E=84=EF=BC=8C=E5=A2=9E=E5=8A=A0=E5=8F=AF=E7=BB=B4=E6=8A=A4?= =?UTF-8?q?=E6=80=A7?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../books/service/BookService.java | 134 +---- .../books/web/ApiBookController.java | 12 +- .../books/web/BookController.java | 15 +- .../common/config/CrawlBiqudaoConfig.java | 27 + .../common/config/CrawlBiqutaConfig.java | 29 ++ .../common/crawl/BaseCrawlSource.java | 20 + .../common/crawl/BaseHtmlCrawlSource.java | 81 +++ .../common/crawl/BaseJsonCrawlSource.java | 8 + .../common/crawl/BiquCrawlSource.java | 216 ++++++++ .../common/schedule/CrawlBooksSchedule.java | 480 +----------------- .../zinglizingli/common/utils/CatUtil.java | 179 +++++++ .../zinglizingli/common/utils/Constants.java | 5 + .../common/utils/RestTemplateUtil.java | 22 + .../src/main/resources/application-crawl.yml | 33 ++ .../src/main/resources/application.yml | 6 +- 15 files changed, 644 insertions(+), 623 deletions(-) create mode 100644 novel-front/src/main/java/xyz/zinglizingli/common/config/CrawlBiqudaoConfig.java create mode 100644 novel-front/src/main/java/xyz/zinglizingli/common/config/CrawlBiqutaConfig.java create mode 100644 novel-front/src/main/java/xyz/zinglizingli/common/crawl/BaseCrawlSource.java create mode 100644 novel-front/src/main/java/xyz/zinglizingli/common/crawl/BaseHtmlCrawlSource.java create mode 100644 novel-front/src/main/java/xyz/zinglizingli/common/crawl/BaseJsonCrawlSource.java create mode 100644 novel-front/src/main/java/xyz/zinglizingli/common/crawl/BiquCrawlSource.java create mode 100644 novel-front/src/main/java/xyz/zinglizingli/common/utils/CatUtil.java create mode 100644 novel-front/src/main/resources/application-crawl.yml diff --git a/novel-front/src/main/java/xyz/zinglizingli/books/service/BookService.java b/novel-front/src/main/java/xyz/zinglizingli/books/service/BookService.java index 586161c..64bcb46 100644 --- a/novel-front/src/main/java/xyz/zinglizingli/books/service/BookService.java +++ b/novel-front/src/main/java/xyz/zinglizingli/books/service/BookService.java @@ -19,6 +19,7 @@ import xyz.zinglizingli.common.constant.CacheKeyConstans; import xyz.zinglizingli.common.enums.PicSaveType; import xyz.zinglizingli.books.mapper.*; import xyz.zinglizingli.books.po.*; +import xyz.zinglizingli.common.utils.Constants; import xyz.zinglizingli.common.utils.UUIDUtils; import xyz.zinglizingli.common.cache.CommonCacheUtil; import xyz.zinglizingli.common.utils.RestTemplateUtil; @@ -96,7 +97,7 @@ public class BookService { List newContentList = new ArrayList<>(); for (int i = 0; i < bookIndex.size(); i++) { BookContent bookContentItem = bookContent.get(i); - if (!bookContentItem.getContent().contains("正在手打中,请稍等片刻,内容更新后,需要重新刷新页面,才能获取最新更新")) { + if (!bookContentItem.getContent().contains(Constants.NO_CONTENT_DESC)) { BookIndex bookIndexItem = bookIndex.get(i); bookIndexItem.setBookId(bookId); bookContentItem.setBookId(bookId); @@ -207,57 +208,7 @@ public class BookService { } - /** - * 获取分类名 - * */ - public String getCatNameById(Integer catid) { - String catName = "其他"; - switch (catid) { - case 1: { - catName = "玄幻奇幻"; - break; - } - case 2: { - catName = "武侠仙侠"; - break; - } - case 3: { - catName = "都市言情"; - break; - } - case 4: { - catName = "历史军事"; - break; - } - case 5: { - catName = "科幻灵异"; - break; - } - case 6: { - catName = "网游竞技"; - break; - } - case 7: { - catName = "女生频道"; - break; - } - case 8: { - catName = "轻小说"; - break; - } - case 9: { - catName = "漫画"; - break; - } - default: { - break; - } - - - } - return catName; - } /** * 查询书籍的基础数据 @@ -374,87 +325,6 @@ public class BookService { - /** - * 查询轻小说分类名 - * */ - public String getSoftCatNameById(Integer softCat) { - String catName = "其他"; - - switch (softCat) { - case 21: { - catName = "魔幻"; - break; - } - case 22: { - catName = "玄幻"; - break; - } - case 23: { - catName = "古风"; - break; - } - case 24: { - catName = "科幻"; - break; - } - case 25: { - catName = "校园"; - break; - } - case 26: { - catName = "都市"; - break; - } - case 27: { - catName = "游戏"; - break; - } - case 28: { - catName = "同人"; - break; - } - case 29: { - catName = "悬疑"; - break; - } - case 0: { - catName = "动漫"; - break; - } - default: { - break; - } - - - } - return catName; - - } - - /** - * 查询漫画分类名 - * */ - public String getMhCatNameById(Integer softCat) { - String catName = "其他"; - - switch (softCat) { - case 3262: { - catName = "少年漫"; - break; - } - case 3263: { - catName = "少女漫"; - break; - } - default: { - break; - } - - - } - return catName; - - } /** * 保存弹幕 diff --git a/novel-front/src/main/java/xyz/zinglizingli/books/web/ApiBookController.java b/novel-front/src/main/java/xyz/zinglizingli/books/web/ApiBookController.java index d25824c..2ed7664 100644 --- a/novel-front/src/main/java/xyz/zinglizingli/books/web/ApiBookController.java +++ b/novel-front/src/main/java/xyz/zinglizingli/books/web/ApiBookController.java @@ -15,6 +15,8 @@ import xyz.zinglizingli.books.po.BookIndex; import xyz.zinglizingli.books.service.BookService; import xyz.zinglizingli.books.vo.BookVO; import xyz.zinglizingli.common.cache.CommonCacheUtil; +import xyz.zinglizingli.common.utils.CatUtil; +import xyz.zinglizingli.common.utils.Constants; import java.util.*; @@ -71,7 +73,7 @@ public class ApiBookController { String userId = null; String titleType = "最近更新"; if (catId != null) { - titleType = bookService.getCatNameById(catId); + titleType = CatUtil.getCatNameById(catId); } else if (keyword != null) { titleType = "搜索"; } else if ("score".equals(sortBy)) { @@ -90,7 +92,7 @@ public class ApiBookController { for (Book book : books) { BookVO bookvo = new BookVO(); BeanUtils.copyProperties(book, bookvo); - bookvo.setCateName(bookService.getCatNameById(bookvo.getCatid())); + bookvo.setCateName(CatUtil.getCatNameById(bookvo.getCatid())); bookVOList.add(bookvo); } @@ -103,7 +105,7 @@ public class ApiBookController { int index = idsArr.indexOf(book.getId() + ""); BookVO bookvo = new BookVO(); BeanUtils.copyProperties(book, bookvo); - bookvo.setCateName(bookService.getCatNameById(bookvo.getCatid())); + bookvo.setCateName(CatUtil.getCatNameById(bookvo.getCatid())); bookVOArr[length - index - 1] = bookvo; } bookVOList = Arrays.asList(bookVOArr); @@ -139,7 +141,7 @@ public class ApiBookController { BookVO bookvo = new BookVO(); BeanUtils.copyProperties(book, bookvo); - bookvo.setCateName(bookService.getCatNameById(bookvo.getCatid())); + bookvo.setCateName(CatUtil.getCatNameById(bookvo.getCatid())); modelMap.put("bookId", bookId); modelMap.put("book", bookvo); modelMap.put("indexList", indexList); @@ -185,7 +187,7 @@ public class ApiBookController { bookContent.setId(-1L); bookContent.setBookId(bookId); bookContent.setIndexNum(indexNum); - bookContent.setContent("正在手打中,请稍等片刻,内容更新后,需要重新刷新页面,才能获取最新更新"); + bookContent.setContent(Constants.NO_CONTENT_DESC); indexName="?"; }else{ indexName = bookService.queryIndexNameByBookIdAndIndexNum(bookId, indexNum); diff --git a/novel-front/src/main/java/xyz/zinglizingli/books/web/BookController.java b/novel-front/src/main/java/xyz/zinglizingli/books/web/BookController.java index 2abdcb2..3233e20 100644 --- a/novel-front/src/main/java/xyz/zinglizingli/books/web/BookController.java +++ b/novel-front/src/main/java/xyz/zinglizingli/books/web/BookController.java @@ -20,6 +20,7 @@ import xyz.zinglizingli.books.service.BookService; import xyz.zinglizingli.books.service.UserService; import xyz.zinglizingli.books.vo.BookVO; import xyz.zinglizingli.common.cache.CommonCacheUtil; +import xyz.zinglizingli.common.utils.CatUtil; import xyz.zinglizingli.common.utils.Constants; import javax.servlet.http.HttpServletResponse; @@ -62,7 +63,7 @@ public class BookController { String userId = null; String titleType = "最近更新"; if (catId != null) { - titleType = bookService.getCatNameById(catId) + "分类频道"; + titleType = CatUtil.getCatNameById(catId) + "分类频道"; } else if (Constants.NOVEL_TOP_FIELD.equals(sortBy)) { titleType = "小说排行"; } else if (ids != null) { @@ -84,7 +85,7 @@ public class BookController { for (Book book : books) { BookVO bookvo = new BookVO(); BeanUtils.copyProperties(book, bookvo); - bookvo.setCateName(bookService.getCatNameById(bookvo.getCatid())); + bookvo.setCateName(CatUtil.getCatNameById(bookvo.getCatid())); bookVoList.add(bookvo); } @@ -97,7 +98,7 @@ public class BookController { int index = idsArr.indexOf(book.getId() + ""); BookVO bookvo = new BookVO(); BeanUtils.copyProperties(book, bookvo); - bookvo.setCateName(bookService.getCatNameById(bookvo.getCatid())); + bookvo.setCateName(CatUtil.getCatNameById(bookvo.getCatid())); bookVoArr[books.size() - index - 1] = bookvo; } bookVoList = Arrays.asList(bookVoArr); @@ -144,10 +145,10 @@ public class BookController { BeanUtils.copyProperties(book, bookvo); if(catId == Constants.SOFT_NOVEL_CAT) { //轻小说 - bookvo.setCateName(bookService.getSoftCatNameById(bookvo.getSoftCat())); + bookvo.setCateName(CatUtil.getSoftCatNameById(bookvo.getSoftCat())); }else if(catId == Constants.MH_NOVEL_CAT){ //漫画 - bookvo.setCateName(bookService.getMhCatNameById(bookvo.getSoftCat())); + bookvo.setCateName(CatUtil.getMhCatNameById(bookvo.getSoftCat())); } bookVoList.add(bookvo); } @@ -204,7 +205,7 @@ public class BookController { BookVO bookvo = new BookVO(); BeanUtils.copyProperties(book, bookvo); - bookvo.setCateName(bookService.getCatNameById(bookvo.getCatid())); + bookvo.setCateName(CatUtil.getCatNameById(bookvo.getCatid())); modelMap.put("bookId", bookId); modelMap.put("book", bookvo); @@ -243,7 +244,7 @@ public class BookController { bookContent.setId(-1L); bookContent.setBookId(bookId); bookContent.setIndexNum(indexNum); - bookContent.setContent("正在手打中,请稍等片刻,内容更新后,需要重新刷新页面,才能获取最新更新"); + bookContent.setContent(Constants.NO_CONTENT_DESC); indexName = "更新中。。。"; } else { indexName = bookService.queryIndexNameByBookIdAndIndexNum(bookId, indexNum); diff --git a/novel-front/src/main/java/xyz/zinglizingli/common/config/CrawlBiqudaoConfig.java b/novel-front/src/main/java/xyz/zinglizingli/common/config/CrawlBiqudaoConfig.java new file mode 100644 index 0000000..ec547ca --- /dev/null +++ b/novel-front/src/main/java/xyz/zinglizingli/common/config/CrawlBiqudaoConfig.java @@ -0,0 +1,27 @@ +package xyz.zinglizingli.common.config; + +import lombok.extern.slf4j.Slf4j; +import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty; +import org.springframework.boot.context.properties.ConfigurationProperties; +import org.springframework.context.annotation.Bean; +import org.springframework.context.annotation.Configuration; +import xyz.zinglizingli.common.crawl.BaseHtmlCrawlSource; +import xyz.zinglizingli.common.crawl.BiquCrawlSource; + +/** + * @author 11797 + */ +@Slf4j +@Configuration +public class CrawlBiqudaoConfig { + + + @Bean + @ConfigurationProperties(prefix = "biqudao.crawlsource") // prefix值必须是application.yml中对应属性的前缀 + @ConditionalOnProperty(prefix = "biqudao.crawlsource",name = "enabled",havingValue = "true") + public BaseHtmlCrawlSource BiqutaCrawlSource() { + return new BiquCrawlSource(); + } + + +} diff --git a/novel-front/src/main/java/xyz/zinglizingli/common/config/CrawlBiqutaConfig.java b/novel-front/src/main/java/xyz/zinglizingli/common/config/CrawlBiqutaConfig.java new file mode 100644 index 0000000..6ff5b61 --- /dev/null +++ b/novel-front/src/main/java/xyz/zinglizingli/common/config/CrawlBiqutaConfig.java @@ -0,0 +1,29 @@ +package xyz.zinglizingli.common.config; + +import lombok.extern.slf4j.Slf4j; +import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty; +import org.springframework.boot.context.properties.ConfigurationProperties; +import org.springframework.context.annotation.Bean; +import org.springframework.context.annotation.Configuration; +import org.springframework.context.annotation.Primary; +import xyz.zinglizingli.common.crawl.BaseHtmlCrawlSource; +import xyz.zinglizingli.common.crawl.BiquCrawlSource; + +/** + * @author 11797 + */ +@Slf4j +@Configuration +public class CrawlBiqutaConfig { + + + @Bean + @Primary //必须加此注解,不然报错,下一个类则不需要添加 + @ConfigurationProperties(prefix = "biquta.crawlsource") // prefix值必须是application.yml中对应属性的前缀 + @ConditionalOnProperty(prefix = "biquta.crawlsource",name = "enabled",havingValue = "true") + public BaseHtmlCrawlSource BiqutaCrawlSource() { + return new BiquCrawlSource(); + } + + +} diff --git a/novel-front/src/main/java/xyz/zinglizingli/common/crawl/BaseCrawlSource.java b/novel-front/src/main/java/xyz/zinglizingli/common/crawl/BaseCrawlSource.java new file mode 100644 index 0000000..ff923cb --- /dev/null +++ b/novel-front/src/main/java/xyz/zinglizingli/common/crawl/BaseCrawlSource.java @@ -0,0 +1,20 @@ +package xyz.zinglizingli.common.crawl; + +import lombok.Data; +import org.springframework.beans.factory.annotation.Value; + +/** + * 爬虫源 + * @author 11797 + */ +@Data +public abstract class BaseCrawlSource { + + @Value("${books.lowestScore}") + private Float lowestScore; + + /** + * 解析数据 + * */ + public abstract void parse(); +} diff --git a/novel-front/src/main/java/xyz/zinglizingli/common/crawl/BaseHtmlCrawlSource.java b/novel-front/src/main/java/xyz/zinglizingli/common/crawl/BaseHtmlCrawlSource.java new file mode 100644 index 0000000..2bd761a --- /dev/null +++ b/novel-front/src/main/java/xyz/zinglizingli/common/crawl/BaseHtmlCrawlSource.java @@ -0,0 +1,81 @@ +package xyz.zinglizingli.common.crawl; + +import lombok.Data; + +/** + * html爬虫源 + * @author 11797 + */ +@Data +public abstract class BaseHtmlCrawlSource extends BaseCrawlSource{ + + /** + * 首页url + * */ + private String indexUrl; + + /** + * 列表页url + * */ + private String listPageUrl; + + /** + * 书籍url Pattern + * */ + private String bookUrlPattern; + + /** + * 评分 Pattern + * */ + private String scorePattern; + + /** + * 书名 Pattern + * */ + private String bookNamePattern; + + /** + * 作者 Pattern + * */ + private String authorPattern; + + /** + * 状态 Pattern + * */ + private String statusPattern; + + /** + * 类别 Pattern + * */ + private String catPattern; + + + /** + * 更新时间 Pattern + * */ + private String updateTimePattern; + + + /** + * 封面 Pattern + * */ + private String picPattern; + + + /** + * 简介 Pattern + * */ + private String introPattern; + + /** + * 完整目录页url Pattern + * */ + private String catalogUrlPattern; + + /** + * 目录 Pattern + * */ + private String catalogPattern; + + +} diff --git a/novel-front/src/main/java/xyz/zinglizingli/common/crawl/BaseJsonCrawlSource.java b/novel-front/src/main/java/xyz/zinglizingli/common/crawl/BaseJsonCrawlSource.java new file mode 100644 index 0000000..2132325 --- /dev/null +++ b/novel-front/src/main/java/xyz/zinglizingli/common/crawl/BaseJsonCrawlSource.java @@ -0,0 +1,8 @@ +package xyz.zinglizingli.common.crawl; + +/** + * Json爬虫源 + * @author 11797 + */ +public abstract class BaseJsonCrawlSource extends BaseCrawlSource{ +} diff --git a/novel-front/src/main/java/xyz/zinglizingli/common/crawl/BiquCrawlSource.java b/novel-front/src/main/java/xyz/zinglizingli/common/crawl/BiquCrawlSource.java new file mode 100644 index 0000000..1c8a694 --- /dev/null +++ b/novel-front/src/main/java/xyz/zinglizingli/common/crawl/BiquCrawlSource.java @@ -0,0 +1,216 @@ +package xyz.zinglizingli.common.crawl; + +import lombok.extern.slf4j.Slf4j; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.beans.factory.annotation.Value; +import xyz.zinglizingli.books.po.Book; +import xyz.zinglizingli.books.po.BookContent; +import xyz.zinglizingli.books.po.BookIndex; +import xyz.zinglizingli.books.service.BookService; +import xyz.zinglizingli.common.utils.CatUtil; +import xyz.zinglizingli.common.utils.ExcutorUtils; +import xyz.zinglizingli.common.utils.RestTemplateUtil; + +import java.text.SimpleDateFormat; +import java.util.ArrayList; +import java.util.Date; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import static java.util.regex.Pattern.compile; + +/** + * @author 11797 + */ +@Slf4j +public class BiquCrawlSource extends BaseHtmlCrawlSource { + + + @Autowired + private BookService bookService; + + @Override + public void parse() { + + String catBookListUrl = getListPageUrl().replace("{0}", "0").replace("{1}", "1"); + String forObject = RestTemplateUtil.getBodyByUtf8(catBookListUrl); + if (forObject != null) { + //解析第一页书籍的数据 + Pattern bookPatten = compile(getBookUrlPattern()); + + Matcher bookMatcher = bookPatten.matcher(forObject); + + boolean isFind = bookMatcher.find(); + Pattern scorePatten = compile(getScorePattern()); + Matcher scoreMatch = scorePatten.matcher(forObject); + boolean scoreFind = scoreMatch.find(); + + Pattern bookNamePatten = compile(getBookNamePattern()); + + Matcher bookNameMatch = bookNamePatten.matcher(forObject); + + boolean isBookNameMatch = bookNameMatch.find(); + + while (isFind && scoreFind && isBookNameMatch) { + + try { + Float score = Float.parseFloat(scoreMatch.group(1)); + + if (score < getLowestScore()) { + continue; + } + + String bokNum = bookMatcher.group(1); + String bookUrl = getIndexUrl() + "/" + bokNum + "/"; + + String body = RestTemplateUtil.getBodyByUtf8(bookUrl); + if (body != null) { + + String bookName = bookNameMatch.group(1); + Pattern authorPatten = compile(getAuthorPattern()); + Matcher authoreMatch = authorPatten.matcher(body); + if (authoreMatch.find()) { + String author = authoreMatch.group(1); + + Pattern statusPatten = compile(getStatusPattern()); + Matcher statusMatch = statusPatten.matcher(body); + if (statusMatch.find()) { + String status = statusMatch.group(1); + + Pattern catPatten = compile(getCatPattern()); + Matcher catMatch = catPatten.matcher(body); + if (catMatch.find()) { + String catName = catMatch.group(1); + int catNum = CatUtil.getCatNum(catName); + + + Pattern updateTimePatten = compile(getUpdateTimePattern()); + Matcher updateTimeMatch = updateTimePatten.matcher(body); + if (updateTimeMatch.find()) { + String updateTimeStr = updateTimeMatch.group(1); + SimpleDateFormat format = new SimpleDateFormat("yy-MM-dd HH:mm:ss"); + Date updateTime = format.parse(updateTimeStr); + Pattern picPatten = compile(getPicPattern()); + Matcher picMather = picPatten.matcher(body); + if (picMather.find()) { + String picSrc = picMather.group(1); + + + Pattern descPatten = compile(getIntroPattern()); + Matcher descMatch = descPatten.matcher(body); + if (descMatch.find()) { + String desc = descMatch.group(1); + + + Book book = new Book(); + book.setAuthor(author); + book.setCatid(catNum); + book.setBookDesc(desc); + book.setBookName(bookName); + book.setScore(score > 10 ? 8.0f : score); + book.setPicUrl(picSrc); + book.setBookStatus(status); + book.setUpdateTime(updateTime); + + List indexList = new ArrayList<>(); + List contentList = new ArrayList<>(); + + //读取目录 + Pattern indexPatten = compile(getCatalogUrlPattern()); + Matcher indexMatch = indexPatten.matcher(body); + if (indexMatch.find()) { + String indexUrl = getIndexUrl() + indexMatch.group(1); + String body2 = RestTemplateUtil.getBodyByUtf8(indexUrl); + if (body2 != null) { + Pattern indexListPatten = compile(getCatalogPattern()); + Matcher indexListMatch = indexListPatten.matcher(body2); + + boolean isFindIndex = indexListMatch.find(); + + int indexNum = 0; + + //查询该书籍已存在目录号 + List hasIndexNum = bookService.queryIndexNumByBookNameAndAuthor(bookName, author); + //更新和插入分别开,插入只在凌晨做一次 + if (hasIndexNum.size() > 0) { + while (isFindIndex) { + if (!hasIndexNum.contains(indexNum)) { + + String contentUrl = getIndexUrl() + indexListMatch.group(1); + String indexName = indexListMatch.group(2); + + + //查询章节内容 + String body3 = RestTemplateUtil.getBodyByUtf8(contentUrl); + if (body3 != null) { + String start = "『章节错误,点此举报』"; + String end = "『加入书签,方便阅读』"; + String content = body3.substring(body3.indexOf(start) + start.length(), body3.indexOf(end)); + //TODO插入章节目录和章节内容 + BookIndex bookIndex = new BookIndex(); + bookIndex.setIndexName(indexName); + bookIndex.setIndexNum(indexNum); + indexList.add(bookIndex); + BookContent bookContent = new BookContent(); + bookContent.setContent(content); + bookContent.setIndexNum(indexNum); + contentList.add(bookContent); + + + } else { + break; + } + + + } + indexNum++; + isFindIndex = indexListMatch.find(); + } + + if (indexList.size() == contentList.size() && indexList.size() > 0) { + ExcutorUtils.excuteFixedTask(() -> + bookService.saveBookAndIndexAndContent(book, indexList, contentList) + ); + + } + } + } + + + } + + + } + + } + } + } + } + + + } + + } + + + } catch (Exception e) { + + e.printStackTrace(); + + } finally { + bookMatcher.find(); + isFind = bookMatcher.find(); + scoreFind = scoreMatch.find(); + isBookNameMatch = bookNameMatch.find(); + } + + + } + } + + } + + + +} diff --git a/novel-front/src/main/java/xyz/zinglizingli/common/schedule/CrawlBooksSchedule.java b/novel-front/src/main/java/xyz/zinglizingli/common/schedule/CrawlBooksSchedule.java index 1d377af..3c1dba5 100644 --- a/novel-front/src/main/java/xyz/zinglizingli/common/schedule/CrawlBooksSchedule.java +++ b/novel-front/src/main/java/xyz/zinglizingli/common/schedule/CrawlBooksSchedule.java @@ -3,28 +3,13 @@ package xyz.zinglizingli.common.schedule; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; import org.apache.commons.codec.Charsets; -import org.springframework.beans.factory.annotation.Value; -import org.springframework.http.HttpStatus; -import org.springframework.http.ResponseEntity; import org.springframework.scheduling.annotation.Scheduled; import org.springframework.stereotype.Service; import org.springframework.web.client.RestTemplate; -import xyz.zinglizingli.books.po.Book; -import xyz.zinglizingli.books.po.BookContent; -import xyz.zinglizingli.books.po.BookIndex; import xyz.zinglizingli.books.service.BookService; -import xyz.zinglizingli.common.utils.ExcutorUtils; +import xyz.zinglizingli.common.crawl.BaseCrawlSource; import xyz.zinglizingli.common.utils.RestTemplateUtil; -import java.text.SimpleDateFormat; -import java.util.ArrayList; -import java.util.Date; -import java.util.List; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -import static java.util.regex.Pattern.compile; - /** * 更新书籍章节内容定时任务 * @@ -35,19 +20,8 @@ import static java.util.regex.Pattern.compile; @Slf4j public class CrawlBooksSchedule { - private final BookService bookService; - private RestTemplate utf8RestTemplate = RestTemplateUtil.getInstance(Charsets.UTF_8); - - - @Value("${books.lowestScore}") - private Float lowestScore; - - @Value("${crawl.website.type}") - private Byte websiteType; - - @Value("${pic.save.path}") - private String picSavePath; + private final BaseCrawlSource crawlSource; /** @@ -58,459 +32,11 @@ public class CrawlBooksSchedule { log.debug("crawlBooksSchedule执行中。。。。。。。。。。。。"); - switch (websiteType) { - case 1: { - updateBiqudaoBooks(0); - break; - } - case 2: { - updateBiquTaBooks(0); - break; - } - default: { - break; - } - } + crawlSource.parse(); } - /** - * 从笔趣塔更新 - */ - private void updateBiquTaBooks(int bookClass) { - String baseUrl = "https://m.biquta.la"; - String catBookListUrlBase = baseUrl + "/class/"; - - String catBookListUrl = catBookListUrlBase + bookClass + "/" + 1 + ".html"; - String forObject = getByRestTemplate(catBookListUrl); - if (forObject != null) { - Pattern pattern = compile("value=\"(\\d+)/(\\d+)\""); - Matcher matcher = pattern.matcher(forObject); - boolean isFind = matcher.find(); - if (isFind) { - //解析第一页书籍的数据 - Pattern bookPatten = compile("href=\"/(\\d+_\\d+)/\""); - parseBiquTaBook(bookPatten, forObject, baseUrl); - } - } - } - - /** - * 解析笔趣塔数据 - */ - private void parseBiquTaBook(Pattern bookPatten, String forObject, String baseUrl) { - Matcher bookMatcher = bookPatten.matcher(forObject); - - boolean isFind = bookMatcher.find(); - Pattern scorePatten = compile("(\\d+\\.\\d+)分"); - Matcher scoreMatch = scorePatten.matcher(forObject); - boolean scoreFind = scoreMatch.find(); - - Pattern bookNamePatten = compile("

([^/]+)

"); - Matcher bookNameMatch = bookNamePatten.matcher(forObject); - boolean isBookNameMatch = bookNameMatch.find(); - - while (isFind && scoreFind && isBookNameMatch) { - - try { - Float score = Float.parseFloat(scoreMatch.group(1)); - - if (score < lowestScore) { - continue; - } - - String bokNum = bookMatcher.group(1); - String bookUrl = baseUrl + "/" + bokNum + "/"; - - String body = getByRestTemplate(bookUrl); - if (body != null) { - - String bookName = bookNameMatch.group(1); - Pattern authorPatten = compile(">作者:([^/]+)<"); - Matcher authoreMatch = authorPatten.matcher(body); - if (authoreMatch.find()) { - String author = authoreMatch.group(1); - - Pattern statusPatten = compile("状态:([^/]+)"); - Matcher statusMatch = statusPatten.matcher(body); - if (statusMatch.find()) { - String status = statusMatch.group(1); - - Pattern catPatten = compile("类别:([^/]+)"); - Matcher catMatch = catPatten.matcher(body); - if (catMatch.find()) { - String catName = catMatch.group(1); - int catNum = getCatNum(catName); - - - Pattern updateTimePatten = compile("更新:(\\d+-\\d+-\\d+\\s\\d+:\\d+:\\d+)"); - Matcher updateTimeMatch = updateTimePatten.matcher(body); - if (updateTimeMatch.find()) { - String updateTimeStr = updateTimeMatch.group(1); - SimpleDateFormat format = new SimpleDateFormat("yy-MM-dd HH:mm:ss"); - Date updateTime = format.parse(updateTimeStr); - Pattern picPatten = compile("]+)\"\\s+onerror=\"this.src="); - Matcher picMather = picPatten.matcher(body); - if (picMather.find()) { - String picSrc = picMather.group(1); - - - Pattern descPatten = compile("class=\"review\">([^<]+)

"); - Matcher descMatch = descPatten.matcher(body); - if (descMatch.find()) { - String desc = descMatch.group(1); - - - Book book = new Book(); - book.setAuthor(author); - book.setCatid(catNum); - book.setBookDesc(desc); - book.setBookName(bookName); - book.setScore(score > 10 ? 8.0f : score); - book.setPicUrl(picSrc); - book.setBookStatus(status); - book.setUpdateTime(updateTime); - - List indexList = new ArrayList<>(); - List contentList = new ArrayList<>(); - - //读取目录 - Pattern indexPatten = compile("查看完整目录"); - Matcher indexMatch = indexPatten.matcher(body); - if (indexMatch.find()) { - String indexUrl = baseUrl + indexMatch.group(1); - String body2 = getByRestTemplate(indexUrl); - if (body2 != null) { - Pattern indexListPatten = compile("([^/]+)"); - Matcher indexListMatch = indexListPatten.matcher(body2); - - boolean isFindIndex = indexListMatch.find(); - - int indexNum = 0; - - //查询该书籍已存在目录号 - List hasIndexNum = bookService.queryIndexNumByBookNameAndAuthor(bookName, author); - //更新和插入分别开,插入只在凌晨做一次 - if (hasIndexNum.size() > 0) { - while (isFindIndex) { - if (!hasIndexNum.contains(indexNum)) { - - String contentUrl = baseUrl + indexListMatch.group(1); - String indexName = indexListMatch.group(2); - - - //查询章节内容 - String body3 = getByRestTemplate(contentUrl); - if (body3 != null) { - String start = "『章节错误,点此举报』"; - String end = "『加入书签,方便阅读』"; - String content = body3.substring(body3.indexOf(start) + start.length(), body3.indexOf(end)); - //TODO插入章节目录和章节内容 - BookIndex bookIndex = new BookIndex(); - bookIndex.setIndexName(indexName); - bookIndex.setIndexNum(indexNum); - indexList.add(bookIndex); - BookContent bookContent = new BookContent(); - bookContent.setContent(content); - bookContent.setIndexNum(indexNum); - contentList.add(bookContent); - - - } else { - break; - } - - - } - indexNum++; - isFindIndex = indexListMatch.find(); - } - - if (indexList.size() == contentList.size() && indexList.size() > 0) { - ExcutorUtils.excuteFixedTask(() -> - bookService.saveBookAndIndexAndContent(book, indexList, contentList) - ); - - } - } - } - - - } - - - } - - } - } - } - } - - - } - - } - - - } catch (Exception e) { - - e.printStackTrace(); - - } finally { - bookMatcher.find(); - isFind = bookMatcher.find(); - scoreFind = scoreMatch.find(); - isBookNameMatch = bookNameMatch.find(); - } - - - } - } - - /** - * 从笔趣岛更新 - */ - private void updateBiqudaoBooks(int bookClass) { - String baseUrl = "https://m.biqudao.com"; - String catBookListUrlBase = baseUrl + "/bqgeclass/"; - - int page = 1; - String catBookListUrl = catBookListUrlBase + bookClass + "/" + page + ".html"; - String forObject = getByRestTemplate(catBookListUrl); - if (forObject != null) { - Pattern pattern = compile("value=\"(\\d+)/(\\d+)\""); - Matcher matcher = pattern.matcher(forObject); - boolean isFind = matcher.find(); - if (isFind) { - //解析第一页书籍的数据 - Pattern bookPatten = compile("href=\"/(bqge\\d+)/\""); - parseBiquDaoBook(bookPatten, forObject, baseUrl); - } - } - - - } - - - /** - * 解析笔趣岛数据 - */ - private void parseBiquDaoBook(Pattern bookPatten, String forObject, String baseUrl) { - - Matcher bookMatcher = bookPatten.matcher(forObject); - boolean isFind = bookMatcher.find(); - Pattern scorePatten = compile("(\\d+\\.\\d+)分"); - Matcher scoreMatch = scorePatten.matcher(forObject); - boolean scoreFind = scoreMatch.find(); - - Pattern bookNamePatten = compile("

([^/]+)

"); - Matcher bookNameMatch = bookNamePatten.matcher(forObject); - boolean isBookNameMatch = bookNameMatch.find(); - - while (isFind && scoreFind && isBookNameMatch) { - - try { - Float score = Float.parseFloat(scoreMatch.group(1)); - - if (score < lowestScore) { - continue; - } - - String bokNum = bookMatcher.group(1); - String bookUrl = baseUrl + "/" + bokNum + "/"; - - String body = getByRestTemplate(bookUrl); - if (body != null) { - - String bookName = bookNameMatch.group(1); - Pattern authorPatten = compile("
  • 作者:([^/]+)
  • "); - Matcher authoreMatch = authorPatten.matcher(body); - if (authoreMatch.find()) { - String author = authoreMatch.group(1); - - Pattern statusPatten = compile("状态:([^/]+)"); - Matcher statusMatch = statusPatten.matcher(body); - if (statusMatch.find()) { - String status = statusMatch.group(1); - - Pattern catPatten = compile("类别:([^/]+)"); - Matcher catMatch = catPatten.matcher(body); - if (catMatch.find()) { - String catName = catMatch.group(1); - int catNum = getCatNum(catName); - Pattern updateTimePatten = compile("更新:(\\d+-\\d+-\\d+\\s\\d+:\\d+:\\d+)"); - Matcher updateTimeMatch = updateTimePatten.matcher(body); - if (updateTimeMatch.find()) { - String updateTimeStr = updateTimeMatch.group(1); - SimpleDateFormat format = new SimpleDateFormat("yy-MM-dd HH:mm:ss"); - Date updateTime = format.parse(updateTimeStr); - Pattern picPatten = compile("]+)\"\\s+onerror=\"this.src="); - Matcher picMather = picPatten.matcher(body); - if (picMather.find()) { - String picSrc = picMather.group(1); - - - Pattern descPatten = compile("class=\"review\">([^<]+)

    "); - Matcher descMatch = descPatten.matcher(body); - if (descMatch.find()) { - String desc = descMatch.group(1); - - - Book book = new Book(); - book.setAuthor(author); - book.setCatid(catNum); - book.setBookDesc(desc); - book.setBookName(bookName); - book.setScore(score > 10 ? 8.0f : score); - book.setPicUrl(picSrc); - book.setBookStatus(status); - book.setUpdateTime(updateTime); - - List indexList = new ArrayList<>(); - List contentList = new ArrayList<>(); - - //读取目录 - Pattern indexPatten = compile("查看完整目录"); - Matcher indexMatch = indexPatten.matcher(body); - if (indexMatch.find()) { - String indexUrl = baseUrl + indexMatch.group(1); - String body2 = getByRestTemplate(indexUrl); - if (body2 != null) { - Pattern indexListPatten = compile("([^/]+)"); - Matcher indexListMatch = indexListPatten.matcher(body2); - - boolean isFindIndex = indexListMatch.find(); - - int indexNum = 0; - - //查询该书籍已存在目录号 - List hasIndexNum = bookService.queryIndexNumByBookNameAndAuthor(bookName, author); - //只更新已存在的书籍 - if (hasIndexNum.size() > 0) { - while (isFindIndex) { - if (!hasIndexNum.contains(indexNum)) { - - String contentUrl = baseUrl + indexListMatch.group(1); - String indexName = indexListMatch.group(2); - - - //查询章节内容 - String body3 = getByRestTemplate(contentUrl); - if (body3 != null) { - String start = "『章节错误,点此举报』"; - String end = "『加入书签,方便阅读』"; - String content = body3.substring(body3.indexOf(start) + start.length(), body3.indexOf(end)); - //TODO插入章节目录和章节内容 - BookIndex bookIndex = new BookIndex(); - bookIndex.setIndexName(indexName); - bookIndex.setIndexNum(indexNum); - indexList.add(bookIndex); - BookContent bookContent = new BookContent(); - bookContent.setContent(content); - bookContent.setIndexNum(indexNum); - contentList.add(bookContent); - - - } else { - break; - } - - - } - indexNum++; - isFindIndex = indexListMatch.find(); - } - - if (indexList.size() == contentList.size() && indexList.size() > 0) { - ExcutorUtils.excuteFixedTask(() -> bookService.saveBookAndIndexAndContent(book, indexList, contentList)); - - } - } - } - - - } - - - } - - - } - } - } - } - - - } - - } - - - } catch (Exception e) { - - e.printStackTrace(); - - } finally { - bookMatcher.find(); - isFind = bookMatcher.find(); - scoreFind = scoreMatch.find(); - isBookNameMatch = bookNameMatch.find(); - } - - - } - - } - - private int getCatNum(String catName) { - int catNum; - switch (catName) { - case "武侠仙侠": { - catNum = 2; - break; - } - case "都市言情": { - catNum = 3; - break; - } - case "历史军事": { - catNum = 4; - break; - } - case "科幻灵异": { - catNum = 5; - break; - } - case "网游竞技": { - catNum = 6; - break; - } - case "女生频道": { - catNum = 7; - break; - } - default: { - catNum = 1; - break; - } - } - return catNum; - } - - private String getByRestTemplate(String url) { - try { - ResponseEntity forEntity = utf8RestTemplate.getForEntity(url, String.class); - if (forEntity.getStatusCode() == HttpStatus.OK) { - return forEntity.getBody(); - } else { - return null; - } - } catch (Exception e) { - log.error(e.getMessage(), e); - return null; - } - } } diff --git a/novel-front/src/main/java/xyz/zinglizingli/common/utils/CatUtil.java b/novel-front/src/main/java/xyz/zinglizingli/common/utils/CatUtil.java new file mode 100644 index 0000000..bc0cb1d --- /dev/null +++ b/novel-front/src/main/java/xyz/zinglizingli/common/utils/CatUtil.java @@ -0,0 +1,179 @@ +package xyz.zinglizingli.common.utils; + +/** + * @author 11797 + */ +public class CatUtil { + + public static int getCatNum(String catName) { + int catNum; + switch (catName) { + case "武侠仙侠": { + catNum = 2; + break; + } + case "都市言情": { + catNum = 3; + break; + } + case "历史军事": { + catNum = 4; + break; + } + case "科幻灵异": { + catNum = 5; + break; + } + case "网游竞技": { + catNum = 6; + break; + } + case "女生频道": { + catNum = 7; + break; + } + default: { + catNum = 1; + break; + } + } + return catNum; + } + + + + /** + * 查询轻小说分类名 + * */ + public static String getSoftCatNameById(Integer softCat) { + String catName = "其他"; + + switch (softCat) { + case 21: { + catName = "魔幻"; + break; + } + case 22: { + catName = "玄幻"; + break; + } + case 23: { + catName = "古风"; + break; + } + case 24: { + catName = "科幻"; + break; + } + case 25: { + catName = "校园"; + break; + } + case 26: { + catName = "都市"; + break; + } + case 27: { + catName = "游戏"; + break; + } + case 28: { + catName = "同人"; + break; + } + case 29: { + catName = "悬疑"; + break; + } + case 0: { + catName = "动漫"; + break; + } + default: { + break; + } + + + } + return catName; + + } + + /** + * 查询漫画分类名 + * */ + public static String getMhCatNameById(Integer softCat) { + String catName = "其他"; + + switch (softCat) { + case 3262: { + catName = "少年漫"; + break; + } + case 3263: { + catName = "少女漫"; + break; + } + default: { + break; + } + + + } + return catName; + + } + + + /** + * 获取分类名 + * */ + public static String getCatNameById(Integer catid) { + String catName = "其他"; + + switch (catid) { + case 1: { + catName = "玄幻奇幻"; + break; + } + case 2: { + catName = "武侠仙侠"; + break; + } + case 3: { + catName = "都市言情"; + break; + } + case 4: { + catName = "历史军事"; + break; + } + case 5: { + catName = "科幻灵异"; + break; + } + case 6: { + catName = "网游竞技"; + break; + } + case 7: { + catName = "女生频道"; + break; + } + case 8: { + catName = "轻小说"; + break; + } + case 9: { + catName = "漫画"; + break; + } + default: { + break; + } + + + } + return catName; + } +} diff --git a/novel-front/src/main/java/xyz/zinglizingli/common/utils/Constants.java b/novel-front/src/main/java/xyz/zinglizingli/common/utils/Constants.java index ac4ed0a..69b2d10 100644 --- a/novel-front/src/main/java/xyz/zinglizingli/common/utils/Constants.java +++ b/novel-front/src/main/java/xyz/zinglizingli/common/utils/Constants.java @@ -85,4 +85,9 @@ public class Constants { * 多本书籍ID分隔符 * */ public static final String BOOK_ID_SEPARATOR = "-"; + + /** + * 没有内容的描述 + * */ + public static final String NO_CONTENT_DESC = "正在手打中,请稍等片刻,内容更新后,需要重新刷新页面,才能获取最新更新"; } diff --git a/novel-front/src/main/java/xyz/zinglizingli/common/utils/RestTemplateUtil.java b/novel-front/src/main/java/xyz/zinglizingli/common/utils/RestTemplateUtil.java index b956290..19046c4 100644 --- a/novel-front/src/main/java/xyz/zinglizingli/common/utils/RestTemplateUtil.java +++ b/novel-front/src/main/java/xyz/zinglizingli/common/utils/RestTemplateUtil.java @@ -1,5 +1,9 @@ package xyz.zinglizingli.common.utils; +import lombok.extern.slf4j.Slf4j; +import org.apache.commons.codec.Charsets; +import org.springframework.http.HttpStatus; +import org.springframework.http.ResponseEntity; import org.springframework.http.client.HttpComponentsClientHttpRequestFactory; import org.springframework.http.converter.HttpMessageConverter; import org.springframework.http.converter.StringHttpMessageConverter; @@ -10,6 +14,10 @@ import java.util.HashMap; import java.util.List; import java.util.Map; +/** + * @author 11797 + */ +@Slf4j public class RestTemplateUtil { private static Map restTemplateMap = new HashMap<>(); @@ -35,4 +43,18 @@ public class RestTemplateUtil { return restTemplate; } + public static String getBodyByUtf8(String url) { + try { + ResponseEntity forEntity = getInstance(Charsets.UTF_8).getForEntity(url, String.class); + if (forEntity.getStatusCode() == HttpStatus.OK) { + return forEntity.getBody(); + } else { + return null; + } + } catch (Exception e) { + log.error(e.getMessage(), e); + return null; + } + } + } diff --git a/novel-front/src/main/resources/application-crawl.yml b/novel-front/src/main/resources/application-crawl.yml new file mode 100644 index 0000000..438ef8b --- /dev/null +++ b/novel-front/src/main/resources/application-crawl.yml @@ -0,0 +1,33 @@ +#爬取的网站名称类型 1:笔趣岛 ,2:笔趣塔 更多网站解析中,敬请期待 +biquta: + crawlsource: + enabled: true #是否开启此爬虫源 + index-url: https://m.biquta.la + list-page-url: https://m.biquta.la/class/{0}/{1}.html + book-url-pattern: href="/(\d+_\d+)/" + score-pattern: (\d+\.\d+)分 + book-name-pattern:

    ([^/]+)

    + author-pattern: 作者:([^/]+)< + status-pattern: 状态:([^/]+) + cat-pattern: 类别:([^/]+) + update-time-pattern: 更新:(\d+-\d+-\d+\s\d+:\d+:\d+) + pic-pattern: ([^<]+)

    + catalog-url-pattern: 查看完整目录 + catalog-pattern: ([^/]+) +biqudao: + crawlsource: + enabled: true #是否开启此爬虫源 + index-url: https://m.biqudao.com + list-page-url: https://m.biqudao.com/bqgeclass/{0}/{1}.html + book-url-pattern: href="/(bqge\d+)/" + score-pattern: (\d+\.\d+)分 + book-name-pattern:

    ([^/]+)

    + author-pattern:
  • 作者:([^/]+)
  • + status-pattern: 状态:([^/]+) + cat-pattern: 类别:([^/]+) + update-time-pattern: 更新:(\d+-\d+-\d+\s\d+:\d+:\d+) + pic-pattern: ([^<]+)

    + catalog-url-pattern: 查看完整目录 + catalog-pattern: ([^/]+) \ No newline at end of file diff --git a/novel-front/src/main/resources/application.yml b/novel-front/src/main/resources/application.yml index dd5fe56..3298845 100644 --- a/novel-front/src/main/resources/application.yml +++ b/novel-front/src/main/resources/application.yml @@ -4,8 +4,8 @@ server: spring: datasource: url: jdbc:mysql://127.0.0.1:3306/books?useUnicode=true&characterEncoding=utf-8&useSSL=false&allowPublicKeyRetrieval=true&serverTimezone=Asia/Shanghai - username: books - password: books + username: root + password: test123456 # url: jdbc:mysql://127.0.0.1:3306/books?useUnicode=true&characterEncoding=utf8&useSSL=false&allowPublicKeyRetrieval=true&serverTimezone=Asia/Shanghai # username: root # password: test123456 @@ -39,6 +39,8 @@ spring: port: 465 class: javax.net.ssl.SSLSocketFactory fallback: false + profiles: + include: crawl