From c1bca5442bf327f912c536ee57a0e486e446a584 Mon Sep 17 00:00:00 2001
From: xxy <1179705413@qq.com>
Date: Thu, 14 Nov 2019 08:01:39 +0800
Subject: [PATCH] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E5=A4=9A=E7=88=AC=E8=99=AB?=
=?UTF-8?q?=E6=BA=90=E7=9A=84=E9=85=8D=E7=BD=AE?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
.../common/schedule/CrawlBooksSchedule.java | 543 ++++++++----------
src/main/resources/application.yml | 6 +-
2 files changed, 242 insertions(+), 307 deletions(-)
diff --git a/src/main/java/xyz/zinglizingli/common/schedule/CrawlBooksSchedule.java b/src/main/java/xyz/zinglizingli/common/schedule/CrawlBooksSchedule.java
index fffd6a5..51cce0c 100644
--- a/src/main/java/xyz/zinglizingli/common/schedule/CrawlBooksSchedule.java
+++ b/src/main/java/xyz/zinglizingli/common/schedule/CrawlBooksSchedule.java
@@ -37,328 +37,259 @@ public class CrawlBooksSchedule {
@Value("${books.lowestScore}")
private Float lowestScore;
+ @Value("${crawl.website.type}")
+ private Byte websiteType;
+
private boolean isExcuting = false;
- public static void main(String[] args) {//
- String forObject = "";
-
- Pattern pattern = Pattern.compile("
-//
-//
-//
-//
-//
- //第一周期全部书拉取完后,可进行第二周期,只拉取前面几页的数据,拉取时间间隔变小
- for (int i = 1; i <= 7; i++) {
-
- int finalI = i;
- new Thread(
- () -> {
-
- try {
- //拼接分类URL
- int page = 1;//起始页码
- int totalPage = page;
- String catBookListUrl = baseUrl + "/class/" + finalI + "/" + page + ".html";
- ResponseEntity forEntity = restTemplate.getForEntity(catBookListUrl, String.class);
- if (forEntity.getStatusCode() == HttpStatus.OK) {
- String forObject = forEntity.getBody();
- //匹配分页数
- Pattern pattern = Pattern.compile("value=\"(\\d+)/(\\d+)\"");
- Matcher matcher = pattern.matcher(forObject);
- boolean isFind = matcher.find();
- System.out.println("匹配分页数" + isFind);
- if (isFind) {
- int currentPage = Integer.parseInt(matcher.group(1));
- totalPage = Integer.parseInt(matcher.group(2));
- //解析第一页书籍的数据
- Pattern bookPatten = Pattern.compile("href=\"/(\\d+_\\d+)/\"");
- parseBook(bookPatten, forObject, restTemplate, finalI, baseUrl);
- /*while (currentPage < totalPage) {
- catBookListUrl = baseUrl + "/class/" + finalI + "/" + (currentPage + 1) + ".html";
- forEntity = restTemplate.getForEntity(catBookListUrl, String.class);
- if (forEntity.getStatusCode() == HttpStatus.OK) {
- forObject = forEntity.getBody();
- //匹配分页数
- matcher = pattern.matcher(forObject);
- isFind = matcher.find();
-
- if (isFind) {
- currentPage = Integer.parseInt(matcher.group(1));
- totalPage = Integer.parseInt(matcher.group(2));
- parseBook(bookPatten, forObject, restTemplate, finalI, baseUrl);
- }
- }else{
- currentPage++;
- }
- }*/
- }
- }
- } catch (Exception e) {
- e.printStackTrace();
- }
-
- }
- ).start();
-
-
- }
-
-
- System.out.println("结束时间" + new Date());
-
- }
-
-
- // @Scheduled(fixedRate = 1000 * 60 * 30)
- //@Scheduled(fixedRate = 1000 * 60 * 60 * 3)//本机更新,否则服务器压力过大,等书籍多了之后,再去服务器更新,这样更新数量不会很大
- //暂定2小说,只爬分类前3本书,一共3*7=21本书,爬等以后书籍多了之后,会适当缩短更新间隔
- public void crawBqugeTaBooks() throws Exception {
- if (!isExcuting) {
- isExcuting = true;
- final String baseUrl = "https://m.biquta.com";
- log.debug("crawlBooksSchedule执行中。。。。。。。。。。。。");
-
-
-//①爬分类列表的书籍url和总页数
-// https:
-////m.biquta.com/class/1/1.html
-// https:
-////m.biquta.com/class/2/1.html
-// https:
-////m.biquta.com/class/2/2.html
-//
-//
-// https:
-////m.biquta.com/class/2/2.html
-//
-//
-//
-//
-//
-//
- //第一周期全部书拉取完后,可进行第二周期,只拉取前面几页的数据,拉取时间间隔变小
- RestTemplate restTemplate = RestTemplateUtil.getInstance("utf-8");
- while (true) {
- log.debug("crawlBooksSchedule循环执行中。。。。。。。。。。。。");
- Set classIdSet = new HashSet<>();
- for (int i = 1; i <= 7; i++) {
-
- log.debug("crawlBooksSchedule分类" + i + "执行中。。。。。。。。。。。。");
-
- // int finalI = i;
- /* new Thread(
- () -> {*/
-
- try {
- //先随机更新分类
- Random random = new Random();
- int finalI = random.nextInt(7) + 1;
- if (classIdSet.contains(finalI)) {
- finalI = random.nextInt(7) + 1;
- }
- classIdSet.add(finalI);
- //拼接分类URL
- int page = 1;//起始页码
- int totalPage = page;
- String catBookListUrl = baseUrl + "/class/" + finalI + "/" + page + ".html";
- String forObject = getByHttpClient(catBookListUrl);
- if (forObject != null) {
- //匹配分页数
- Pattern pattern = Pattern.compile("value=\"(\\d+)/(\\d+)\"");
- Matcher matcher = pattern.matcher(forObject);
- boolean isFind = matcher.find();
- System.out.println("匹配分页数" + isFind);
- if (isFind) {
- int currentPage = Integer.parseInt(matcher.group(1));
- totalPage = Integer.parseInt(matcher.group(2));
- //解析第一页书籍的数据
- Pattern bookPatten = Pattern.compile("href=\"/(\\d+_\\d+)/\"");
- parseBook(bookPatten, forObject, restTemplate, finalI, baseUrl);
- /* while (currentPage < totalPage) {
- catBookListUrl = baseUrl + "/bqgeclass/" + finalI + "/" + (currentPage + 1) + ".html";
- forObject = getByHttpClient(catBookListUrl);
- if (forObject != null) {
- //匹配分页数
- matcher = pattern.matcher(forObject);
- isFind = matcher.find();
-
- if (isFind) {
- currentPage = Integer.parseInt(matcher.group(1));
- totalPage = Integer.parseInt(matcher.group(2));
- parseBiquge11Book(bookPatten, forObject, finalI, baseUrl);
- }
- } else {
- currentPage++;
- }
- }*/
- }
- }
- Thread.sleep(1000 * 60 * 10);
- } catch (Exception e) {
- e.printStackTrace();
- }
-
- /* }
- ).start();*/
-
-
- }
- }
-
- }
-
-
- }
-
-
- //@Scheduled(fixedRate = 1000 * 60 * 35)
@Scheduled(fixedRate = 1000 * 60 * 60 * 3)
- //暂定2小说,只爬分类前3本书,一共3*7=21本书,爬等以后书籍多了之后,会适当缩短更新间隔
public void crawBquge11BooksAtDay() throws Exception {
if (!isExcuting) {
isExcuting = true;
- final String baseUrl = "https://m.biqudao.com";
log.debug("crawlBooksSchedule执行中。。。。。。。。。。。。");
-
-//①爬分类列表的书籍url和总页数
-// https:
-////m.biquta.com/class/1/1.html
-// https:
-////m.biquta.com/class/2/1.html
-// https:
-////m.biquta.com/class/2/2.html
-//
-//
-// https:
-////m.biquta.com/class/2/2.html
-//
-//
-//
-//
-//
-//
- //第一周期全部书拉取完后,可进行第二周期,只拉取前面几页的数据,拉取时间间隔变小
while (true) {
- log.debug("crawlBooksSchedule循环执行中。。。。。。。。。。。。");
- //List classIdList = new ArrayList<>(Arrays.asList(new Integer[]{1,2,3,4,5,6,7}));
- // for (int i = 1; i <= 7; i++) {
-
- // log.debug("crawlBooksSchedule分类"+i+"执行中。。。。。。。。。。。。");
-
- // int finalI = i;
- /* new Thread(
- () -> {*/
try {
- //先随机更新分类
- //Random random = new Random();
- //int finalI = classIdList.get(new Random().nextInt(classIdList.size()));
- //classIdList.remove(finalI);
- int finalI = 0;
- //拼接分类URL
- int page = 1;//起始页码
- int totalPage = page;
- String catBookListUrl = baseUrl + "/bqgeclass/" + finalI + "/" + page + ".html";
- String forObject = getByHttpClient(catBookListUrl);
- if (forObject != null) {
- //匹配分页数
- Pattern pattern = Pattern.compile("value=\"(\\d+)/(\\d+)\"");
- Matcher matcher = pattern.matcher(forObject);
- boolean isFind = matcher.find();
- System.out.println("匹配分页数" + isFind);
- if (isFind) {
- int currentPage = Integer.parseInt(matcher.group(1));
- totalPage = Integer.parseInt(matcher.group(2));
- //解析第一页书籍的数据
- Pattern bookPatten = Pattern.compile("href=\"/(bqge\\d+)/\"");
- //白天更新
- parseBiquge11Book(bookPatten, forObject, finalI, baseUrl, true);
- /* while (currentPage < totalPage) {
- catBookListUrl = baseUrl + "/bqgeclass/" + finalI + "/" + (currentPage + 1) + ".html";
- forObject = getByHttpClient(catBookListUrl);
- if (forObject != null) {
- //匹配分页数
- matcher = pattern.matcher(forObject);
- isFind = matcher.find();
-
- if (isFind) {
- currentPage = Integer.parseInt(matcher.group(1));
- totalPage = Integer.parseInt(matcher.group(2));
- parseBiquge11Book(bookPatten, forObject, finalI, baseUrl);
- }
- } else {
- currentPage++;
- }
- }*/
+ switch (websiteType) { // choose the crawler for the source site configured via crawl.website.type
+ case 1: { // 1 = biqudao
+ updateBiqudaoBooks(0); // 0 is used as the category segment of the listing URL
+ break;
+ }
+ case 2: { // 2 = biquta
+ updateBiquTaBooks(0); // 0 is used as the category segment of the listing URL
+ break;
}
}
Thread.sleep(1000 * 60 * 5);
} catch (Exception e) {
e.printStackTrace();
}
-
- /* }
- ).start();*/
-
-
- // }
}
+
+ } // end of if (!isExcuting); NOTE(review): nothing in this method sets isExcuting back to false
+
+
+ } // end of crawBquge11BooksAtDay
+
+ private void updateBiquTaBooks(int finalI) { // downloads page 1 of category finalI from m.biquta.com and passes the HTML to parseBiquTaBook in update mode
+ String baseUrl = "https://m.biquta.com";
+ String catBookListUrlBase = baseUrl + "/class/";
+
+ int page = 1;// starting page number
+ int totalPage = page;
+ String catBookListUrl = catBookListUrlBase + finalI + "/" + page + ".html";
+ String forObject = getByHttpClient(catBookListUrl);
+ if (forObject != null) { // a null body means there is nothing to parse
+ // match the pagination marker of the form value="current/total"
+ Pattern pattern = Pattern.compile("value=\"(\\d+)/(\\d+)\"");
+ Matcher matcher = pattern.matcher(forObject);
+ boolean isFind = matcher.find();
+ System.out.println("匹配分页数" + isFind);
+ if (isFind) {
+ int currentPage = Integer.parseInt(matcher.group(1)); // parsed but not read again in this method
+ totalPage = Integer.parseInt(matcher.group(2)); // parsed but not read again in this method
+ // parse the book data listed on the first page
+ Pattern bookPatten = Pattern.compile("href=\"/(\\d+_\\d+)/\"");
+ parseBiquTaBook(bookPatten, forObject, finalI, baseUrl, true); // true is the isUpdate flag
+ }
+ }
+ }
+
+ private void parseBiquTaBook(Pattern bookPatten, String forObject, int catNum, String baseUrl, boolean isUpdate) { // scans the listing HTML in forObject; for each book at or above lowestScore it fetches the detail, index and chapter pages and submits a save task
+ Matcher matcher2 = bookPatten.matcher(forObject);
+ boolean isFind = matcher2.find();
+ Pattern scorePatten = Pattern.compile("(\\d+\\.\\d+)分
");
+ Matcher scoreMatch = scorePatten.matcher(forObject);
+ boolean scoreFind = scoreMatch.find();
+
+ Pattern bookNamePatten = Pattern.compile("([^/]+)
");
+ Matcher bookNameMatch = bookNamePatten.matcher(forObject);
+ boolean isBookNameMatch = bookNameMatch.find();
+
+
+ System.out.println("匹配书籍url" + isFind);
+
+ System.out.println("匹配分数" + scoreFind);
+
+ while (isFind && scoreFind && isBookNameMatch) { // the three matchers are advanced together in the finally block below
+
+ try {
+ Float score = Float.parseFloat(scoreMatch.group(1));
+
+ if (score < lowestScore) {// database space is limited: skip books below the configured lowestScore (the original note said 8.0)
+ continue; // the finally block still runs, so the matchers move on to the next entry
+ }
+
+ String bokNum = matcher2.group(1);
+ String bookUrl = baseUrl + "/" + bokNum + "/";
+
+ String body = getByHttpClient(bookUrl);
+ if (body != null) {
+
+ String bookName = bookNameMatch.group(1);
+ Pattern authorPatten = Pattern.compile(">作者:([^/]+)<");
+ Matcher authoreMatch = authorPatten.matcher(body);
+ if (authoreMatch.find()) {
+ String author = authoreMatch.group(1);
+
+ Pattern statusPatten = Pattern.compile("状态:([^/]+)");
+ Matcher statusMatch = statusPatten.matcher(body);
+ if (statusMatch.find()) {
+ String status = statusMatch.group(1);
+
+ Pattern updateTimePatten = Pattern.compile("更新:(\\d+-\\d+-\\d+\\s\\d+:\\d+:\\d+)");
+ Matcher updateTimeMatch = updateTimePatten.matcher(body);
+ if (updateTimeMatch.find()) {
+ String updateTimeStr = updateTimeMatch.group(1);
+ SimpleDateFormat format = new SimpleDateFormat("yy-MM-dd HH:mm:ss"); // NOTE(review): the regex above accepts any digit count for the year; confirm "yy" matches the page's format
+ Date updateTime = format.parse(updateTimeStr);
+ Pattern picPatten = Pattern.compile("
]+)\"\\s+onerror=\"this.src=");
+ Matcher picMather = picPatten.matcher(body);
+ if (picMather.find()) {
+ String picSrc = picMather.group(1);
+
+ Pattern descPatten = Pattern.compile("class=\"review\">([^<]+)");
+ Matcher descMatch = descPatten.matcher(body);
+ if (descMatch.find()) {
+ String desc = descMatch.group(1);
+
+
+ Book book = new Book();
+ book.setAuthor(author);
+ book.setCatid(catNum);
+ book.setBookDesc(desc);
+ book.setBookName(bookName);
+ book.setScore(score > 10 ? 8.0f : score); // scores above 10 are stored as 8.0
+ book.setPicUrl(picSrc);
+ book.setBookStatus(status);
+ book.setUpdateTime(updateTime);
+
+ List indexList = new ArrayList<>();
+ List contentList = new ArrayList<>();
+
+ // read the table of contents
+ Pattern indexPatten = Pattern.compile("查看完整目录");
+ Matcher indexMatch = indexPatten.matcher(body);
+ if (indexMatch.find()) {
+ String indexUrl = baseUrl + indexMatch.group(1);
+ String body2 = getByHttpClient(indexUrl);
+ if (body2 != null) {
+ Pattern indexListPatten = Pattern.compile("([^/]+)");
+ Matcher indexListMatch = indexListPatten.matcher(body2);
+
+ boolean isFindIndex = indexListMatch.find();
+
+ int indexNum = 0;
+
+ // look up the index numbers already stored for this book
+ List hasIndexNum = bookService.queryIndexCountByBookNameAndBAuthor(bookName, author);
+ // updates and inserts are handled separately; inserts run only once, in the early morning
+ if ((isUpdate && hasIndexNum.size() > 0) || (!isUpdate && hasIndexNum.size() == 0)) { // update mode needs an already-stored book, insert mode needs a new one
+ while (isFindIndex) {
+ if (!hasIndexNum.contains(indexNum)) { // only chapters whose index number is not stored yet
+
+ String contentUrl = baseUrl + indexListMatch.group(1);
+ String indexName = indexListMatch.group(2);
+
+
+ // fetch the chapter content
+ String body3 = getByHttpClient(contentUrl);
+ if (body3 != null) {
+ Pattern contentPattten = Pattern.compile("章节错误,点此举报(.*)加入书签,方便阅读"); // NOTE(review): compiled but not referenced again in this method
+ String start = "『章节错误,点此举报』";
+ String end = "『加入书签,方便阅读』";
+ String content = body3.substring(body3.indexOf(start) + start.length(), body3.indexOf(end)); // text between the two markers; NOTE(review): an indexOf result of -1 is not checked
+ //TODO insert the chapter index and chapter content
+ BookIndex bookIndex = new BookIndex();
+ bookIndex.setIndexName(indexName);
+ bookIndex.setIndexNum(indexNum);
+ indexList.add(bookIndex);
+ BookContent bookContent = new BookContent();
+ bookContent.setContent(content);
+ bookContent.setIndexNum(indexNum);
+ contentList.add(bookContent);
+
+
+ } else {
+ break; // a chapter page could not be fetched: stop collecting further chapters
+ }
+
+
+ }
+ indexNum++;
+ isFindIndex = indexListMatch.find();
+ }
+
+ if (indexList.size() == contentList.size() && indexList.size() > 0) { // save only when at least one chapter was collected
+ ExcutorUtils.excuteFixedTask(new Runnable() {
+ @Override
+ public void run() {
+ bookService.saveBookAndIndexAndContent(book, indexList, contentList);
+ }
+ });
+
+ }
+ }
+ }
+
+
+ }
+
+
+ }
+
+
+ }
+ }
+ }
+
+
+ }
+
+ }
+
+
+ } catch (Exception e) {
+
+ e.printStackTrace();
+
+ } finally {
+ matcher2.find();
+ isFind = matcher2.find();// find() is called twice because each book's path is matched twice in the listing
+ scoreFind = scoreMatch.find();
+ isBookNameMatch = bookNameMatch.find();
+ }
+
+
+ }
+ }
+
+ private void updateBiqudaoBooks(int finalI) {
+ String baseUrl = "https://m.biqudao.com";
+ String catBookListUrlBase = baseUrl + "/bqgeclass/";
+
+ int page = 1;//起始页码
+ int totalPage = page;
+ String catBookListUrl = catBookListUrlBase + finalI + "/" + page + ".html";
+ String forObject = getByHttpClient(catBookListUrl);
+ if (forObject != null) {
+ //匹配分页数
+ Pattern pattern = Pattern.compile("value=\"(\\d+)/(\\d+)\"");
+ Matcher matcher = pattern.matcher(forObject);
+ boolean isFind = matcher.find();
+ System.out.println("匹配分页数" + isFind);
+ if (isFind) {
+ int currentPage = Integer.parseInt(matcher.group(1));
+ totalPage = Integer.parseInt(matcher.group(2));
+ //解析第一页书籍的数据
+ Pattern bookPatten = Pattern.compile("href=\"/(bqge\\d+)/\"");
+ //白天更新
+ parseBiquDaoBook(bookPatten, forObject, finalI, baseUrl, true);
+ }
}
@@ -431,7 +362,7 @@ public class CrawlBooksSchedule {
//解析第一页书籍的数据
Pattern bookPatten = Pattern.compile("href=\"/(bqge\\d+)/\"");
//晚上插入
- parseBiquge11Book(bookPatten, forObject, finalI, baseUrl, false);
+ parseBiquDaoBook(bookPatten, forObject, finalI, baseUrl, false);
while (currentPage < totalPage) {
if (new Date().getHours() > 5) {
break;
@@ -446,7 +377,7 @@ public class CrawlBooksSchedule {
if (isFind) {
currentPage = Integer.parseInt(matcher.group(1));
totalPage = Integer.parseInt(matcher.group(2));
- parseBiquge11Book(bookPatten, forObject, finalI, baseUrl, false);
+ parseBiquDaoBook(bookPatten, forObject, finalI, baseUrl, false);
}
} else {
currentPage++;
@@ -467,7 +398,7 @@ public class CrawlBooksSchedule {
}
- private void parseBiquge11Book(Pattern bookPatten, String forObject, int catNum, String baseUrl, boolean isUpdate) {
+ private void parseBiquDaoBook(Pattern bookPatten, String forObject, int catNum, String baseUrl, boolean isUpdate) {
Matcher matcher2 = bookPatten.matcher(forObject);
boolean isFind = matcher2.find();
@@ -570,7 +501,7 @@ public class CrawlBooksSchedule {
book.setCatid(catNum);
book.setBookDesc(desc);
book.setBookName(bookName);
- book.setScore(score>10?8.0f:score);
+ book.setScore(score > 10 ? 8.0f : score);
book.setPicUrl(picSrc);
book.setBookStatus(status);
book.setUpdateTime(updateTime);
@@ -769,7 +700,7 @@ public class CrawlBooksSchedule {
book.setCatid(catNum);
book.setBookDesc(desc);
book.setBookName(bookName);
- book.setScore(score>10?8.0f:score);
+ book.setScore(score > 10 ? 8.0f : score);
book.setPicUrl(picSrc);
book.setBookStatus(status);
book.setUpdateTime(updateTime);
diff --git a/src/main/resources/application.yml b/src/main/resources/application.yml
index 848e56c..9a41764 100644
--- a/src/main/resources/application.yml
+++ b/src/main/resources/application.yml
@@ -64,8 +64,12 @@ mysql:
#爬取小说数据的最低评分
books:
- lowestScore: 8.0
+ lowestScore: 6.0
+#爬取的网站名称类型 1:笔趣岛 ,2:笔趣塔 更多网站解析中,敬请期待
+crawl:
+ website:
+ type: 1
search:
schedule: