From c1bca5442bf327f912c536ee57a0e486e446a584 Mon Sep 17 00:00:00 2001 From: xxy <1179705413@qq.com> Date: Thu, 14 Nov 2019 08:01:39 +0800 Subject: [PATCH] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E5=A4=9A=E7=88=AC=E8=99=AB?= =?UTF-8?q?=E6=BA=90=E7=9A=84=E9=85=8D=E7=BD=AE?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../common/schedule/CrawlBooksSchedule.java | 543 ++++++++---------- src/main/resources/application.yml | 6 +- 2 files changed, 242 insertions(+), 307 deletions(-) diff --git a/src/main/java/xyz/zinglizingli/common/schedule/CrawlBooksSchedule.java b/src/main/java/xyz/zinglizingli/common/schedule/CrawlBooksSchedule.java index fffd6a5..51cce0c 100644 --- a/src/main/java/xyz/zinglizingli/common/schedule/CrawlBooksSchedule.java +++ b/src/main/java/xyz/zinglizingli/common/schedule/CrawlBooksSchedule.java @@ -37,328 +37,259 @@ public class CrawlBooksSchedule { @Value("${books.lowestScore}") private Float lowestScore; + @Value("${crawl.website.type}") + private Byte websiteType; + private boolean isExcuting = false; - public static void main(String[] args) {// - String forObject = ""; - - Pattern pattern = Pattern.compile(" -// -// -//
-// -//
-//

苍穹九变

-//

作者:风起闲云

-//
-//
7.5 分
-//
-//
-// -// - //第一周期全部书拉取完后,可进行第二周期,只拉取前面几页的数据,拉取时间间隔变小 - for (int i = 1; i <= 7; i++) { - - int finalI = i; - new Thread( - () -> { - - try { - //拼接分类URL - int page = 1;//起始页码 - int totalPage = page; - String catBookListUrl = baseUrl + "/class/" + finalI + "/" + page + ".html"; - ResponseEntity forEntity = restTemplate.getForEntity(catBookListUrl, String.class); - if (forEntity.getStatusCode() == HttpStatus.OK) { - String forObject = forEntity.getBody(); - //匹配分页数 - Pattern pattern = Pattern.compile("value=\"(\\d+)/(\\d+)\""); - Matcher matcher = pattern.matcher(forObject); - boolean isFind = matcher.find(); - System.out.println("匹配分页数" + isFind); - if (isFind) { - int currentPage = Integer.parseInt(matcher.group(1)); - totalPage = Integer.parseInt(matcher.group(2)); - //解析第一页书籍的数据 - Pattern bookPatten = Pattern.compile("href=\"/(\\d+_\\d+)/\""); - parseBook(bookPatten, forObject, restTemplate, finalI, baseUrl); - /*while (currentPage < totalPage) { - catBookListUrl = baseUrl + "/class/" + finalI + "/" + (currentPage + 1) + ".html"; - forEntity = restTemplate.getForEntity(catBookListUrl, String.class); - if (forEntity.getStatusCode() == HttpStatus.OK) { - forObject = forEntity.getBody(); - //匹配分页数 - matcher = pattern.matcher(forObject); - isFind = matcher.find(); - - if (isFind) { - currentPage = Integer.parseInt(matcher.group(1)); - totalPage = Integer.parseInt(matcher.group(2)); - parseBook(bookPatten, forObject, restTemplate, finalI, baseUrl); - } - }else{ - currentPage++; - } - }*/ - } - } - } catch (Exception e) { - e.printStackTrace(); - } - - } - ).start(); - - - } - - - System.out.println("结束时间" + new Date()); - - } - - - // @Scheduled(fixedRate = 1000 * 60 * 30) - //@Scheduled(fixedRate = 1000 * 60 * 60 * 3)//本机更新,否则服务器压力过大,等书籍多了之后,再去服务器更新,这样更新数量不会很大 - //暂定2小说,只爬分类前3本书,一共3*7=21本书,爬等以后书籍多了之后,会适当缩短更新间隔 - public void crawBqugeTaBooks() throws Exception { - if (!isExcuting) { - isExcuting = true; - final String baseUrl = "https://m.biquta.com"; - log.debug("crawlBooksSchedule执行中。。。。。。。。。。。。"); - - -//①爬分类列表的书籍url和总页数 -// https: -////m.biquta.com/class/1/1.html -// https: -////m.biquta.com/class/2/1.html -// https: -////m.biquta.com/class/2/2.html -// -// -// https: -////m.biquta.com/class/2/2.html -// -// -// -//
-// -//
-//

苍穹九变

-//

作者:风起闲云

-//
-//
7.5 分
-//
-//
-// -// - //第一周期全部书拉取完后,可进行第二周期,只拉取前面几页的数据,拉取时间间隔变小 - RestTemplate restTemplate = RestTemplateUtil.getInstance("utf-8"); - while (true) { - log.debug("crawlBooksSchedule循环执行中。。。。。。。。。。。。"); - Set classIdSet = new HashSet<>(); - for (int i = 1; i <= 7; i++) { - - log.debug("crawlBooksSchedule分类" + i + "执行中。。。。。。。。。。。。"); - - // int finalI = i; - /* new Thread( - () -> {*/ - - try { - //先随机更新分类 - Random random = new Random(); - int finalI = random.nextInt(7) + 1; - if (classIdSet.contains(finalI)) { - finalI = random.nextInt(7) + 1; - } - classIdSet.add(finalI); - //拼接分类URL - int page = 1;//起始页码 - int totalPage = page; - String catBookListUrl = baseUrl + "/class/" + finalI + "/" + page + ".html"; - String forObject = getByHttpClient(catBookListUrl); - if (forObject != null) { - //匹配分页数 - Pattern pattern = Pattern.compile("value=\"(\\d+)/(\\d+)\""); - Matcher matcher = pattern.matcher(forObject); - boolean isFind = matcher.find(); - System.out.println("匹配分页数" + isFind); - if (isFind) { - int currentPage = Integer.parseInt(matcher.group(1)); - totalPage = Integer.parseInt(matcher.group(2)); - //解析第一页书籍的数据 - Pattern bookPatten = Pattern.compile("href=\"/(\\d+_\\d+)/\""); - parseBook(bookPatten, forObject, restTemplate, finalI, baseUrl); - /* while (currentPage < totalPage) { - catBookListUrl = baseUrl + "/bqgeclass/" + finalI + "/" + (currentPage + 1) + ".html"; - forObject = getByHttpClient(catBookListUrl); - if (forObject != null) { - //匹配分页数 - matcher = pattern.matcher(forObject); - isFind = matcher.find(); - - if (isFind) { - currentPage = Integer.parseInt(matcher.group(1)); - totalPage = Integer.parseInt(matcher.group(2)); - parseBiquge11Book(bookPatten, forObject, finalI, baseUrl); - } - } else { - currentPage++; - } - }*/ - } - } - Thread.sleep(1000 * 60 * 10); - } catch (Exception e) { - e.printStackTrace(); - } - - /* } - ).start();*/ - - - } - } - - } - - - } - - - //@Scheduled(fixedRate = 1000 * 60 * 35) @Scheduled(fixedRate = 1000 * 60 * 60 * 3) - //暂定2小说,只爬分类前3本书,一共3*7=21本书,爬等以后书籍多了之后,会适当缩短更新间隔 public void crawBquge11BooksAtDay() throws Exception { if (!isExcuting) { isExcuting = true; - final String baseUrl = "https://m.biqudao.com"; log.debug("crawlBooksSchedule执行中。。。。。。。。。。。。"); - -//①爬分类列表的书籍url和总页数 -// https: -////m.biquta.com/class/1/1.html -// https: -////m.biquta.com/class/2/1.html -// https: -////m.biquta.com/class/2/2.html -// -// -// https: -////m.biquta.com/class/2/2.html -// -// -// -// -// -// - //第一周期全部书拉取完后,可进行第二周期,只拉取前面几页的数据,拉取时间间隔变小 while (true) { - log.debug("crawlBooksSchedule循环执行中。。。。。。。。。。。。"); - //List classIdList = new ArrayList<>(Arrays.asList(new Integer[]{1,2,3,4,5,6,7})); - // for (int i = 1; i <= 7; i++) { - - // log.debug("crawlBooksSchedule分类"+i+"执行中。。。。。。。。。。。。"); - - // int finalI = i; - /* new Thread( - () -> {*/ try { - //先随机更新分类 - //Random random = new Random(); - //int finalI = classIdList.get(new Random().nextInt(classIdList.size())); - //classIdList.remove(finalI); - int finalI = 0; - //拼接分类URL - int page = 1;//起始页码 - int totalPage = page; - String catBookListUrl = baseUrl + "/bqgeclass/" + finalI + "/" + page + ".html"; - String forObject = getByHttpClient(catBookListUrl); - if (forObject != null) { - //匹配分页数 - Pattern pattern = Pattern.compile("value=\"(\\d+)/(\\d+)\""); - Matcher matcher = pattern.matcher(forObject); - boolean isFind = matcher.find(); - System.out.println("匹配分页数" + isFind); - if (isFind) { - int currentPage = Integer.parseInt(matcher.group(1)); - totalPage = Integer.parseInt(matcher.group(2)); - //解析第一页书籍的数据 - Pattern bookPatten = Pattern.compile("href=\"/(bqge\\d+)/\""); - //白天更新 - parseBiquge11Book(bookPatten, forObject, finalI, baseUrl, true); - /* while (currentPage < totalPage) { - catBookListUrl = baseUrl + "/bqgeclass/" + finalI + "/" + (currentPage + 1) + ".html"; - forObject = getByHttpClient(catBookListUrl); - if (forObject != null) { - //匹配分页数 - matcher = pattern.matcher(forObject); - isFind = matcher.find(); - - if (isFind) { - currentPage = Integer.parseInt(matcher.group(1)); - totalPage = Integer.parseInt(matcher.group(2)); - parseBiquge11Book(bookPatten, forObject, finalI, baseUrl); - } - } else { - currentPage++; - } - }*/ + switch (websiteType) { + case 1: { + updateBiqudaoBooks(0); + break; + } + case 2: { + updateBiquTaBooks(0); + break; } } Thread.sleep(1000 * 60 * 5); } catch (Exception e) { e.printStackTrace(); } - - /* } - ).start();*/ - - - // } } + + } + + + } + + private void updateBiquTaBooks(int finalI) { + String baseUrl = "https://m.biquta.com"; + String catBookListUrlBase = baseUrl + "/class/"; + + int page = 1;//起始页码 + int totalPage = page; + String catBookListUrl = catBookListUrlBase + finalI + "/" + page + ".html"; + String forObject = getByHttpClient(catBookListUrl); + if (forObject != null) { + //匹配分页数 + Pattern pattern = Pattern.compile("value=\"(\\d+)/(\\d+)\""); + Matcher matcher = pattern.matcher(forObject); + boolean isFind = matcher.find(); + System.out.println("匹配分页数" + isFind); + if (isFind) { + int currentPage = Integer.parseInt(matcher.group(1)); + totalPage = Integer.parseInt(matcher.group(2)); + //解析第一页书籍的数据 + Pattern bookPatten = Pattern.compile("href=\"/(\\d+_\\d+)/\""); + parseBiquTaBook(bookPatten, forObject, finalI, baseUrl, true); + } + } + } + + private void parseBiquTaBook(Pattern bookPatten, String forObject, int catNum, String baseUrl, boolean isUpdate) { + Matcher matcher2 = bookPatten.matcher(forObject); + boolean isFind = matcher2.find(); + Pattern scorePatten = Pattern.compile("(\\d+\\.\\d+)分"); + Matcher scoreMatch = scorePatten.matcher(forObject); + boolean scoreFind = scoreMatch.find(); + + Pattern bookNamePatten = Pattern.compile("

([^/]+)

"); + Matcher bookNameMatch = bookNamePatten.matcher(forObject); + boolean isBookNameMatch = bookNameMatch.find(); + + + System.out.println("匹配书籍url" + isFind); + + System.out.println("匹配分数" + scoreFind); + + while (isFind && scoreFind && isBookNameMatch) { + + try { + Float score = Float.parseFloat(scoreMatch.group(1)); + + if (score < lowestScore) {//数据库空间有限,暂时爬取8.0分以上的小说 + continue; + } + + String bokNum = matcher2.group(1); + String bookUrl = baseUrl + "/" + bokNum + "/"; + + String body = getByHttpClient(bookUrl); + if (body != null) { + + String bookName = bookNameMatch.group(1); + Pattern authorPatten = Pattern.compile(">作者:([^/]+)<"); + Matcher authoreMatch = authorPatten.matcher(body); + if (authoreMatch.find()) { + String author = authoreMatch.group(1); + + Pattern statusPatten = Pattern.compile("状态:([^/]+)"); + Matcher statusMatch = statusPatten.matcher(body); + if (statusMatch.find()) { + String status = statusMatch.group(1); + + Pattern updateTimePatten = Pattern.compile("更新:(\\d+-\\d+-\\d+\\s\\d+:\\d+:\\d+)"); + Matcher updateTimeMatch = updateTimePatten.matcher(body); + if (updateTimeMatch.find()) { + String updateTimeStr = updateTimeMatch.group(1); + SimpleDateFormat format = new SimpleDateFormat("yy-MM-dd HH:mm:ss"); + Date updateTime = format.parse(updateTimeStr); + Pattern picPatten = Pattern.compile("]+)\"\\s+onerror=\"this.src="); + Matcher picMather = picPatten.matcher(body); + if (picMather.find()) { + String picSrc = picMather.group(1); + + Pattern descPatten = Pattern.compile("class=\"review\">([^<]+)

"); + Matcher descMatch = descPatten.matcher(body); + if (descMatch.find()) { + String desc = descMatch.group(1); + + + Book book = new Book(); + book.setAuthor(author); + book.setCatid(catNum); + book.setBookDesc(desc); + book.setBookName(bookName); + book.setScore(score > 10 ? 8.0f : score); + book.setPicUrl(picSrc); + book.setBookStatus(status); + book.setUpdateTime(updateTime); + + List indexList = new ArrayList<>(); + List contentList = new ArrayList<>(); + + //读取目录 + Pattern indexPatten = Pattern.compile("查看完整目录"); + Matcher indexMatch = indexPatten.matcher(body); + if (indexMatch.find()) { + String indexUrl = baseUrl + indexMatch.group(1); + String body2 = getByHttpClient(indexUrl); + if (body2 != null) { + Pattern indexListPatten = Pattern.compile("([^/]+)"); + Matcher indexListMatch = indexListPatten.matcher(body2); + + boolean isFindIndex = indexListMatch.find(); + + int indexNum = 0; + + //查询该书籍已存在目录号 + List hasIndexNum = bookService.queryIndexCountByBookNameAndBAuthor(bookName, author); + //更新和插入分别开,插入只在凌晨做一次 + if ((isUpdate && hasIndexNum.size() > 0) || (!isUpdate && hasIndexNum.size() == 0)) { + while (isFindIndex) { + if (!hasIndexNum.contains(indexNum)) { + + String contentUrl = baseUrl + indexListMatch.group(1); + String indexName = indexListMatch.group(2); + + + //查询章节内容 + String body3 = getByHttpClient(contentUrl); + if (body3 != null) { + Pattern contentPattten = Pattern.compile("章节错误,点此举报(.*)加入书签,方便阅读"); + String start = "『章节错误,点此举报』"; + String end = "『加入书签,方便阅读』"; + String content = body3.substring(body3.indexOf(start) + start.length(), body3.indexOf(end)); + //TODO插入章节目录和章节内容 + BookIndex bookIndex = new BookIndex(); + bookIndex.setIndexName(indexName); + bookIndex.setIndexNum(indexNum); + indexList.add(bookIndex); + BookContent bookContent = new BookContent(); + bookContent.setContent(content); + bookContent.setIndexNum(indexNum); + contentList.add(bookContent); + + + } else { + break; + } + + + } + indexNum++; + isFindIndex = indexListMatch.find(); + } + + if (indexList.size() == contentList.size() && indexList.size() > 0) { + ExcutorUtils.excuteFixedTask(new Runnable() { + @Override + public void run() { + bookService.saveBookAndIndexAndContent(book, indexList, contentList); + } + }); + + } + } + } + + + } + + + } + + + } + } + } + + + } + + } + + + } catch (Exception e) { + + e.printStackTrace(); + + } finally { + matcher2.find(); + isFind = matcher2.find();//需要找两次,应为有两个一样的路径匹配 + scoreFind = scoreMatch.find(); + isBookNameMatch = bookNameMatch.find(); + } + + + } + } + + private void updateBiqudaoBooks(int finalI) { + String baseUrl = "https://m.biqudao.com"; + String catBookListUrlBase = baseUrl + "/bqgeclass/"; + + int page = 1;//起始页码 + int totalPage = page; + String catBookListUrl = catBookListUrlBase + finalI + "/" + page + ".html"; + String forObject = getByHttpClient(catBookListUrl); + if (forObject != null) { + //匹配分页数 + Pattern pattern = Pattern.compile("value=\"(\\d+)/(\\d+)\""); + Matcher matcher = pattern.matcher(forObject); + boolean isFind = matcher.find(); + System.out.println("匹配分页数" + isFind); + if (isFind) { + int currentPage = Integer.parseInt(matcher.group(1)); + totalPage = Integer.parseInt(matcher.group(2)); + //解析第一页书籍的数据 + Pattern bookPatten = Pattern.compile("href=\"/(bqge\\d+)/\""); + //白天更新 + parseBiquDaoBook(bookPatten, forObject, finalI, baseUrl, true); + } } @@ -431,7 +362,7 @@ public class CrawlBooksSchedule { //解析第一页书籍的数据 Pattern bookPatten = Pattern.compile("href=\"/(bqge\\d+)/\""); //晚上插入 - parseBiquge11Book(bookPatten, forObject, finalI, baseUrl, false); + parseBiquDaoBook(bookPatten, forObject, finalI, baseUrl, false); while (currentPage < totalPage) { if (new Date().getHours() > 5) { break; @@ -446,7 +377,7 @@ public class CrawlBooksSchedule { if (isFind) { currentPage = Integer.parseInt(matcher.group(1)); totalPage = Integer.parseInt(matcher.group(2)); - parseBiquge11Book(bookPatten, forObject, finalI, baseUrl, false); + parseBiquDaoBook(bookPatten, forObject, finalI, baseUrl, false); } } else { currentPage++; @@ -467,7 +398,7 @@ public class CrawlBooksSchedule { } - private void parseBiquge11Book(Pattern bookPatten, String forObject, int catNum, String baseUrl, boolean isUpdate) { + private void parseBiquDaoBook(Pattern bookPatten, String forObject, int catNum, String baseUrl, boolean isUpdate) { Matcher matcher2 = bookPatten.matcher(forObject); boolean isFind = matcher2.find(); @@ -570,7 +501,7 @@ public class CrawlBooksSchedule { book.setCatid(catNum); book.setBookDesc(desc); book.setBookName(bookName); - book.setScore(score>10?8.0f:score); + book.setScore(score > 10 ? 8.0f : score); book.setPicUrl(picSrc); book.setBookStatus(status); book.setUpdateTime(updateTime); @@ -769,7 +700,7 @@ public class CrawlBooksSchedule { book.setCatid(catNum); book.setBookDesc(desc); book.setBookName(bookName); - book.setScore(score>10?8.0f:score); + book.setScore(score > 10 ? 8.0f : score); book.setPicUrl(picSrc); book.setBookStatus(status); book.setUpdateTime(updateTime); diff --git a/src/main/resources/application.yml b/src/main/resources/application.yml index 848e56c..9a41764 100644 --- a/src/main/resources/application.yml +++ b/src/main/resources/application.yml @@ -64,8 +64,12 @@ mysql: #爬取小说数据的最低评分 books: - lowestScore: 8.0 + lowestScore: 6.0 +#爬取的网站名称类型 1:笔趣岛 ,2:笔趣塔 更多网站解析中,敬请期待 +crawl: + website: + type: 1 search: schedule: