package xyz.zinglizingli.common.schedule; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.beans.factory.annotation.Value; import org.springframework.http.HttpStatus; import org.springframework.http.ResponseEntity; import org.springframework.scheduling.annotation.Scheduled; import org.springframework.stereotype.Service; import org.springframework.web.client.RestTemplate; import xyz.zinglizingli.books.po.Book; import xyz.zinglizingli.books.po.BookContent; import xyz.zinglizingli.books.po.BookIndex; import xyz.zinglizingli.books.service.BookService; import xyz.zinglizingli.books.util.ExcutorUtils; import xyz.zinglizingli.common.utils.RestTemplateUtil; import java.text.ParseException; import java.text.SimpleDateFormat; import java.util.*; import java.util.regex.Matcher; import java.util.regex.Pattern; /** Spring service that crawls novel listings, book details, chapter indexes and chapter text from m.biqudao.com or m.biquta.com (selected by the crawl.website.type property) using a shared RestTemplate and regular expressions, then hands the results to BookService. NOTE(review): in this copy of the file the original line breaks appear to be lost, so every double-slash comment runs on into the statements that follow it, and several regex literals and generic type arguments look truncated (a pattern beginning with ]+), raw List and ResponseEntity declarations) - compare against version control before relying on this text. */ @Service public class CrawlBooksSchedule { private Logger log = LoggerFactory.getLogger(CrawlBooksSchedule.class); @Autowired private BookService bookService; RestTemplate restTemplate = RestTemplateUtil.getInstance("utf-8"); @Value("${books.lowestScore}") private Float lowestScore; @Value("${crawl.website.type}") private Byte websiteType; /** Guard flag: set to true on the first run of crawBquge11BooksAtDay(); no code in this class resets it. */ private boolean isExcuting = false; /** Scheduled entry point (fixed rate of 3 hours). The first call sets isExcuting and then loops without an exit condition: each pass dispatches on websiteType (1 -> updateBiqudaoBooks(0), 2 -> updateBiquTaBooks(0); other values do nothing) and then sleeps 5 minutes. Any exception, including one from Thread.sleep, is printed and the loop continues; because the sleep is inside the try, an exception thrown before it skips the pause. Calls made while isExcuting is true return immediately. NOTE(review): isExcuting is a plain non-volatile field with an unsynchronized check-then-set - confirm only one scheduler thread can reach it. */ @Scheduled(fixedRate = 1000 * 60 * 60 * 3) public void crawBquge11BooksAtDay() throws Exception { if (!isExcuting) { isExcuting = true; log.debug("crawlBooksSchedule执行中。。。。。。。。。。。。"); while (true) { try { switch (websiteType) { case 1: { updateBiqudaoBooks(0); break; } case 2: { updateBiquTaBooks(0); break; } } Thread.sleep(1000 * 60 * 5); } catch (Exception e) { e.printStackTrace(); } } } } /** Fetches the first listing page of m.biquta.com (path /class/ + finalI + /1.html) and, if the body contains the value=current/total pagination marker, passes it to parseBiquTaBook with isUpdate = true. Only this first page is processed; currentPage and totalPage are parsed but not used afterwards. @param finalI category number placed in the listing URL and forwarded as catNum */ private void updateBiquTaBooks(int finalI) { String baseUrl = "https://m.biquta.com"; String catBookListUrlBase = baseUrl + "/class/"; int page = 1;//starting page number int totalPage = page; String catBookListUrl = catBookListUrlBase + finalI + "/" + page + ".html"; String forObject = getByHttpClient(catBookListUrl); if (forObject 
!= null) { //match the page count Pattern pattern = Pattern.compile("value=\"(\\d+)/(\\d+)\""); Matcher matcher = pattern.matcher(forObject); boolean isFind = matcher.find(); System.out.println("匹配分页数" + isFind); if (isFind) { int currentPage = Integer.parseInt(matcher.group(1)); totalPage = Integer.parseInt(matcher.group(2)); //parse the book data on the first page Pattern bookPatten = Pattern.compile("href=\"/(\\d+_\\d+)/\""); parseBiquTaBook(bookPatten, forObject, finalI, baseUrl, true); } } } /** Walks the listing page with three matchers in step (book link, score, book name). For every book whose score is not below lowestScore it fetches the detail page, extracts author, status, update time, cover URL and description, follows the full-index link, and - when isUpdate is true and the book already has stored chapters, or isUpdate is false and it has none - downloads each chapter whose index number is not in the list returned by bookService.queryIndexCountByBookNameAndBAuthor, stopping at the first chapter page that cannot be fetched. If at least one chapter was collected, the save is submitted through ExcutorUtils.excuteFixedTask. Exceptions are printed and the finally block always advances the matchers (the link matcher twice), so a skipped or failed book does not stall the loop. A score above 10 is stored as 8.0. NOTE(review): chapter text is cut out with substring between two marker strings; if a marker is missing, indexOf returns -1 and the call can throw or return the wrong span, which drops the whole book. contentPattten is compiled but never used. @param bookPatten pattern whose group 1 is the book path segment @param forObject listing page HTML @param catNum category id stored on the book @param baseUrl site root prepended to relative links @param isUpdate true to add chapters to already-stored books only, false to insert not-yet-stored books only */ private void parseBiquTaBook(Pattern bookPatten, String forObject, int catNum, String baseUrl, boolean isUpdate) { Matcher matcher2 = bookPatten.matcher(forObject); boolean isFind = matcher2.find(); Pattern scorePatten = Pattern.compile("(\\d+\\.\\d+)分"); Matcher scoreMatch = scorePatten.matcher(forObject); boolean scoreFind = scoreMatch.find(); Pattern bookNamePatten = Pattern.compile("

([^/]+)

"); Matcher bookNameMatch = bookNamePatten.matcher(forObject); boolean isBookNameMatch = bookNameMatch.find(); System.out.println("匹配书籍url" + isFind); System.out.println("匹配分数" + scoreFind); while (isFind && scoreFind && isBookNameMatch) { try { Float score = Float.parseFloat(scoreMatch.group(1)); if (score < lowestScore) {//数据库空间有限,暂时爬取8.0分以上的小说 continue; } String bokNum = matcher2.group(1); String bookUrl = baseUrl + "/" + bokNum + "/"; String body = getByHttpClient(bookUrl); if (body != null) { String bookName = bookNameMatch.group(1); Pattern authorPatten = Pattern.compile(">作者:([^/]+)<"); Matcher authoreMatch = authorPatten.matcher(body); if (authoreMatch.find()) { String author = authoreMatch.group(1); Pattern statusPatten = Pattern.compile("状态:([^/]+)"); Matcher statusMatch = statusPatten.matcher(body); if (statusMatch.find()) { String status = statusMatch.group(1); Pattern updateTimePatten = Pattern.compile("更新:(\\d+-\\d+-\\d+\\s\\d+:\\d+:\\d+)"); Matcher updateTimeMatch = updateTimePatten.matcher(body); if (updateTimeMatch.find()) { String updateTimeStr = updateTimeMatch.group(1); SimpleDateFormat format = new SimpleDateFormat("yy-MM-dd HH:mm:ss"); Date updateTime = format.parse(updateTimeStr); Pattern picPatten = Pattern.compile("]+)\"\\s+onerror=\"this.src="); Matcher picMather = picPatten.matcher(body); if (picMather.find()) { String picSrc = picMather.group(1); Pattern descPatten = Pattern.compile("class=\"review\">([^<]+)

"); Matcher descMatch = descPatten.matcher(body); if (descMatch.find()) { String desc = descMatch.group(1); Book book = new Book(); book.setAuthor(author); book.setCatid(catNum); book.setBookDesc(desc); book.setBookName(bookName); book.setScore(score > 10 ? 8.0f : score); book.setPicUrl(picSrc); book.setBookStatus(status); book.setUpdateTime(updateTime); List indexList = new ArrayList<>(); List contentList = new ArrayList<>(); //读取目录 Pattern indexPatten = Pattern.compile("查看完整目录"); Matcher indexMatch = indexPatten.matcher(body); if (indexMatch.find()) { String indexUrl = baseUrl + indexMatch.group(1); String body2 = getByHttpClient(indexUrl); if (body2 != null) { Pattern indexListPatten = Pattern.compile("([^/]+)"); Matcher indexListMatch = indexListPatten.matcher(body2); boolean isFindIndex = indexListMatch.find(); int indexNum = 0; //查询该书籍已存在目录号 List hasIndexNum = bookService.queryIndexCountByBookNameAndBAuthor(bookName, author); //更新和插入分别开,插入只在凌晨做一次 if ((isUpdate && hasIndexNum.size() > 0) || (!isUpdate && hasIndexNum.size() == 0)) { while (isFindIndex) { if (!hasIndexNum.contains(indexNum)) { String contentUrl = baseUrl + indexListMatch.group(1); String indexName = indexListMatch.group(2); //查询章节内容 String body3 = getByHttpClient(contentUrl); if (body3 != null) { Pattern contentPattten = Pattern.compile("章节错误,点此举报(.*)加入书签,方便阅读"); String start = "『章节错误,点此举报』"; String end = "『加入书签,方便阅读』"; String content = body3.substring(body3.indexOf(start) + start.length(), body3.indexOf(end)); //TODO插入章节目录和章节内容 BookIndex bookIndex = new BookIndex(); bookIndex.setIndexName(indexName); bookIndex.setIndexNum(indexNum); indexList.add(bookIndex); BookContent bookContent = new BookContent(); bookContent.setContent(content); bookContent.setIndexNum(indexNum); contentList.add(bookContent); } else { break; } } indexNum++; isFindIndex = indexListMatch.find(); } if (indexList.size() == contentList.size() && indexList.size() > 0) { ExcutorUtils.excuteFixedTask(new Runnable() { @Override 
public void run() { bookService.saveBookAndIndexAndContent(book, indexList, contentList); } }); } } } } } } } } } } } catch (Exception e) { e.printStackTrace(); } finally { matcher2.find(); isFind = matcher2.find();//find twice, because the same path matches twice scoreFind = scoreMatch.find(); isBookNameMatch = bookNameMatch.find(); } } } /** Same as updateBiquTaBooks but for m.biqudao.com: fetches /bqgeclass/ + finalI + /1.html and, if the pagination marker is present, passes the page to parseBiquDaoBook with isUpdate = true. Only the first page is processed. @param finalI category number placed in the listing URL and forwarded as catNum */ private void updateBiqudaoBooks(int finalI) { String baseUrl = "https://m.biqudao.com"; String catBookListUrlBase = baseUrl + "/bqgeclass/"; int page = 1;//starting page number int totalPage = page; String catBookListUrl = catBookListUrlBase + finalI + "/" + page + ".html"; String forObject = getByHttpClient(catBookListUrl); if (forObject != null) { //match the page count Pattern pattern = Pattern.compile("value=\"(\\d+)/(\\d+)\""); Matcher matcher = pattern.matcher(forObject); boolean isFind = matcher.find(); System.out.println("匹配分页数" + isFind); if (isFind) { int currentPage = Integer.parseInt(matcher.group(1)); totalPage = Integer.parseInt(matcher.group(2)); //parse the book data on the first page Pattern bookPatten = Pattern.compile("href=\"/(bqge\\d+)/\""); //daytime: update parseBiquDaoBook(bookPatten, forObject, finalI, baseUrl, true); } } } //@Scheduled(cron = "0 0 2 * * ?") disk space is insufficient, so new books are not crawled for now //tentatively 2 [sic]; crawl only the first 3 books of each category, 3*7=21 books in total; once there are more books the update interval will be shortened accordingly /** Insert-mode crawl of m.biqudao.com with category number 0: parses page 1 via parseBiquDaoBook(isUpdate = false), then requests page currentPage + 1 while currentPage < totalPage, stopping once the local hour is greater than 5. The scheduling annotation before this method is commented out and no call to it appears in this class. NOTE(review): if a page is fetched but the pagination pattern does not match, currentPage is never advanced and the same URL is requested again until the hour check breaks the loop. Date.getHours() is deprecated. */ public void crawBquge11BooksAtNight() throws Exception { final String baseUrl = "https://m.biqudao.com"; log.debug("crawlBooksSchedule执行中。。。。。。。。。。。。"); //(1) crawl the book URLs and the total page count from the category listing // https: ////m.biquta.com/class/1/1.html // https: ////m.biquta.com/class/2/1.html // https: ////m.biquta.com/class/2/2.html // // // https: ////m.biquta.com/class/2/2.html // // // // // // //once the first cycle has pulled all books, a second cycle can run that pulls only the first few pages, with a shorter interval between pulls log.debug("crawlBooksSchedule循环执行中。。。。。。。。。。。。"); //List classIdList = new ArrayList<>(Arrays.asList(new Integer[]{1,2,3,4,5,6,7})); // for (int i = 1; i <= 7; i++) { // log.debug("crawlBooksSchedule分类"+i+"执行中。。。。。。。。。。。。"); // int finalI = i; /* new Thread( () -> {*/ try { //first update a random category //Random random = new Random(); //int finalI = classIdList.get(new Random().nextInt(classIdList.size())); 
//classIdList.remove(finalI); int finalI = 0; //build the category URL int page = 1;//starting page number int totalPage = page; String catBookListUrl = baseUrl + "/bqgeclass/" + finalI + "/" + page + ".html"; String forObject = getByHttpClient(catBookListUrl); if (forObject != null) { //match the page count Pattern pattern = Pattern.compile("value=\"(\\d+)/(\\d+)\""); Matcher matcher = pattern.matcher(forObject); boolean isFind = matcher.find(); System.out.println("匹配分页数" + isFind); if (isFind) { int currentPage = Integer.parseInt(matcher.group(1)); totalPage = Integer.parseInt(matcher.group(2)); //parse the book data on the first page Pattern bookPatten = Pattern.compile("href=\"/(bqge\\d+)/\""); //night: insert parseBiquDaoBook(bookPatten, forObject, finalI, baseUrl, false); while (currentPage < totalPage) { if (new Date().getHours() > 5) { break; } catBookListUrl = baseUrl + "/bqgeclass/" + finalI + "/" + (currentPage + 1) + ".html"; forObject = getByHttpClient(catBookListUrl); if (forObject != null) { //match the page count matcher = pattern.matcher(forObject); isFind = matcher.find(); if (isFind) { currentPage = Integer.parseInt(matcher.group(1)); totalPage = Integer.parseInt(matcher.group(2)); parseBiquDaoBook(bookPatten, forObject, finalI, baseUrl, false); } } else { currentPage++; } } } } } catch (Exception e) { e.printStackTrace(); } /* } ).start();*/ // } } /** Counterpart of parseBiquTaBook for m.biqudao.com pages. Same flow (listing matchers advanced in step, score filter against lowestScore, detail page, index page, per-chapter download of index numbers not yet stored, save submitted through ExcutorUtils.excuteFixedTask), plus one extra step: the category name found on the detail page is mapped to a number 1-7 (unrecognised names map to 1) and overwrites the catNum argument. NOTE(review): chapter text is extracted with substring and unchecked indexOf results, which can throw or cut the wrong span when a marker string is absent; contentPattten is compiled but never used. @param bookPatten pattern whose group 1 is the book path segment @param forObject listing page HTML @param catNum initial category id, replaced once a category name is matched @param baseUrl site root prepended to relative links @param isUpdate true to add chapters to already-stored books only, false to insert not-yet-stored books only */ private void parseBiquDaoBook(Pattern bookPatten, String forObject, int catNum, String baseUrl, boolean isUpdate) { Matcher matcher2 = bookPatten.matcher(forObject); boolean isFind = matcher2.find(); Pattern scorePatten = Pattern.compile("(\\d+\\.\\d+)分"); Matcher scoreMatch = scorePatten.matcher(forObject); boolean scoreFind = scoreMatch.find(); Pattern bookNamePatten = Pattern.compile("

([^/]+)

"); Matcher bookNameMatch = bookNamePatten.matcher(forObject); boolean isBookNameMatch = bookNameMatch.find(); System.out.println("匹配书籍url" + isFind); System.out.println("匹配分数" + scoreFind); while (isFind && scoreFind && isBookNameMatch) { try { Float score = Float.parseFloat(scoreMatch.group(1)); if (score < lowestScore) {//数据库空间有限,暂时爬取8.0分以上的小说 continue; } String bokNum = matcher2.group(1); String bookUrl = baseUrl + "/" + bokNum + "/"; String body = getByHttpClient(bookUrl); if (body != null) { String bookName = bookNameMatch.group(1); Pattern authorPatten = Pattern.compile("
  • 作者:([^/]+)
  • "); Matcher authoreMatch = authorPatten.matcher(body); if (authoreMatch.find()) { String author = authoreMatch.group(1); Pattern statusPatten = Pattern.compile("状态:([^/]+)"); Matcher statusMatch = statusPatten.matcher(body); if (statusMatch.find()) { String status = statusMatch.group(1); Pattern catPatten = Pattern.compile("类别:([^/]+)"); Matcher catMatch = catPatten.matcher(body); if (catMatch.find()) { String catName = catMatch.group(1); switch (catName) { case "玄幻奇幻": { catNum = 1; break; } case "武侠仙侠": { catNum = 2; break; } case "都市言情": { catNum = 3; break; } case "历史军事": { catNum = 4; break; } case "科幻灵异": { catNum = 5; break; } case "网游竞技": { catNum = 6; break; } case "女生频道": { catNum = 7; break; } default: { catNum = 1; break; } } Pattern updateTimePatten = Pattern.compile("更新:(\\d+-\\d+-\\d+\\s\\d+:\\d+:\\d+)"); Matcher updateTimeMatch = updateTimePatten.matcher(body); if (updateTimeMatch.find()) { String updateTimeStr = updateTimeMatch.group(1); SimpleDateFormat format = new SimpleDateFormat("yy-MM-dd HH:mm:ss"); Date updateTime = format.parse(updateTimeStr); Pattern picPatten = Pattern.compile("]+)\"\\s+onerror=\"this.src="); Matcher picMather = picPatten.matcher(body); if (picMather.find()) { String picSrc = picMather.group(1); Pattern descPatten = Pattern.compile("class=\"review\">([^<]+)

    "); Matcher descMatch = descPatten.matcher(body); if (descMatch.find()) { String desc = descMatch.group(1); Book book = new Book(); book.setAuthor(author); book.setCatid(catNum); book.setBookDesc(desc); book.setBookName(bookName); book.setScore(score > 10 ? 8.0f : score); book.setPicUrl(picSrc); book.setBookStatus(status); book.setUpdateTime(updateTime); List indexList = new ArrayList<>(); List contentList = new ArrayList<>(); //读取目录 Pattern indexPatten = Pattern.compile("查看完整目录"); Matcher indexMatch = indexPatten.matcher(body); if (indexMatch.find()) { String indexUrl = baseUrl + indexMatch.group(1); String body2 = getByHttpClient(indexUrl); if (body2 != null) { Pattern indexListPatten = Pattern.compile("([^/]+)"); Matcher indexListMatch = indexListPatten.matcher(body2); boolean isFindIndex = indexListMatch.find(); int indexNum = 0; //查询该书籍已存在目录号 List hasIndexNum = bookService.queryIndexCountByBookNameAndBAuthor(bookName, author); //更新和插入分别开,插入只在凌晨做一次 if ((isUpdate && hasIndexNum.size() > 0) || (!isUpdate && hasIndexNum.size() == 0)) { while (isFindIndex) { if (!hasIndexNum.contains(indexNum)) { String contentUrl = baseUrl + indexListMatch.group(1); String indexName = indexListMatch.group(2); //查询章节内容 String body3 = getByHttpClient(contentUrl); if (body3 != null) { Pattern contentPattten = Pattern.compile("章节错误,点此举报(.*)加入书签,方便阅读"); String start = "『章节错误,点此举报』"; String end = "『加入书签,方便阅读』"; String content = body3.substring(body3.indexOf(start) + start.length(), body3.indexOf(end)); //TODO插入章节目录和章节内容 BookIndex bookIndex = new BookIndex(); bookIndex.setIndexName(indexName); bookIndex.setIndexNum(indexNum); indexList.add(bookIndex); BookContent bookContent = new BookContent(); bookContent.setContent(content); bookContent.setIndexNum(indexNum); contentList.add(bookContent); } else { break; } } indexNum++; isFindIndex = indexListMatch.find(); } if (indexList.size() == contentList.size() && indexList.size() > 0) { ExcutorUtils.excuteFixedTask(new Runnable() { 
@Override public void run() { bookService.saveBookAndIndexAndContent(book, indexList, contentList); } }); } } } } } } } } } } } } catch (Exception e) { e.printStackTrace(); } finally { matcher2.find(); isFind = matcher2.find();//find twice, because the same path matches twice scoreFind = scoreMatch.find(); isBookNameMatch = bookNameMatch.find(); } } } /** Issues a GET for the given URL through the shared restTemplate field. @param catBookListUrl URL to fetch; despite the name it is also used for detail, index and chapter pages @return the response body when the status is 200 OK, otherwise null; any exception is printed and also yields null */ private String getByHttpClient(String catBookListUrl) { try { /* HttpClient httpClient = new DefaultHttpClient(); HttpGet getReq = new HttpGet(catBookListUrl); getReq.setHeader("user-agent", "Mozilla/5.0 (iPad; CPU OS 11_0 like Mac OS X) AppleWebKit/604.1.34 (KHTML, like Gecko) Version/11.0 Mobile/15A5341f Safari/604.1"); HttpResponse execute = httpClient.execute(getReq); if (execute.getStatusLine().getStatusCode() == HttpStatus.OK.value()) { HttpEntity entity = execute.getEntity(); return EntityUtils.toString(entity, "utf-8"); } else { return null; }*/ //in testing restTemplate was many times faster than httpClient, so restTemplate was chosen ResponseEntity forEntity = restTemplate.getForEntity(catBookListUrl, String.class); if (forEntity.getStatusCode() == HttpStatus.OK) { return forEntity.getBody(); } else { return null; } } catch (Exception e) { e.printStackTrace(); return null; } } /*** * Parses the book detail page and the pages that follow it. Variant of parseBiquTaBook that takes the RestTemplate as a parameter, also reads the author from the listing page, has no isUpdate gate, and calls bookService.saveBookAndIndexAndContent directly; no call to this method appears in this class. */ private void parseBook(Pattern bookPatten, String forObject, RestTemplate restTemplate, int catNum, String baseUrl) throws ParseException { Matcher matcher2 = bookPatten.matcher(forObject); boolean isFind = matcher2.find(); Pattern scorePatten = Pattern.compile("(\\d+\\.\\d+)分"); Matcher scoreMatch = scorePatten.matcher(forObject); boolean scoreFind = scoreMatch.find(); Pattern bookNamePatten = Pattern.compile("

    ([^/]+)

    "); Matcher bookNameMatch = bookNamePatten.matcher(forObject); boolean isBookNameMatch = bookNameMatch.find(); Pattern authorPatten = Pattern.compile(">作者:([^/]+)<"); Matcher authoreMatch = authorPatten.matcher(forObject); boolean isFindAuthor = authoreMatch.find(); System.out.println("匹配书籍url" + isFind); System.out.println("匹配分数" + scoreFind); while (isFind && scoreFind && isBookNameMatch && isFindAuthor) { try { Float score = Float.parseFloat(scoreMatch.group(1)); if (score < lowestScore) {//数据库空间有限,暂时爬取8.0分以上的小说 continue; } String bookName = bookNameMatch.group(1); String author = authoreMatch.group(1); String bokNum = matcher2.group(1); String bookUrl = baseUrl + "/" + bokNum + "/"; ResponseEntity forEntity = restTemplate.getForEntity(bookUrl, String.class); if (forEntity.getStatusCode() == HttpStatus.OK) { String body = forEntity.getBody(); Pattern statusPatten = Pattern.compile("状态:([^/]+)"); Matcher statusMatch = statusPatten.matcher(body); if (statusMatch.find()) { String status = statusMatch.group(1); Pattern updateTimePatten = Pattern.compile("更新:(\\d+-\\d+-\\d+\\s\\d+:\\d+:\\d+)"); Matcher updateTimeMatch = updateTimePatten.matcher(body); if (updateTimeMatch.find()) { String updateTimeStr = updateTimeMatch.group(1); SimpleDateFormat format = new SimpleDateFormat("yy-MM-dd HH:mm:ss"); Date updateTime = format.parse(updateTimeStr); Pattern picPatten = Pattern.compile("]+)\"\\s+onerror=\"this.src="); Matcher picMather = picPatten.matcher(body); if (picMather.find()) { String picSrc = picMather.group(1); Pattern descPatten = Pattern.compile("class=\"review\">([^<]+)

    "); Matcher descMatch = descPatten.matcher(body); if (descMatch.find()) { String desc = descMatch.group(1); Book book = new Book(); book.setAuthor(author); book.setCatid(catNum); book.setBookDesc(desc); book.setBookName(bookName); book.setScore(score > 10 ? 8.0f : score); book.setPicUrl(picSrc); book.setBookStatus(status); book.setUpdateTime(updateTime); List indexList = new ArrayList<>(); List contentList = new ArrayList<>(); //读取目录 Pattern indexPatten = Pattern.compile("查看完整目录"); Matcher indexMatch = indexPatten.matcher(body); if (indexMatch.find()) { String indexUrl = baseUrl + indexMatch.group(1); ResponseEntity forEntity1 = restTemplate.getForEntity(indexUrl, String.class); if (forEntity1.getStatusCode() == HttpStatus.OK) { String body2 = forEntity1.getBody(); Pattern indexListPatten = Pattern.compile("([^/]+)"); Matcher indexListMatch = indexListPatten.matcher(body2); boolean isFindIndex = indexListMatch.find(); int indexNum = 0; //查询该书籍已存在目录号 List hasIndexNum = bookService.queryIndexCountByBookNameAndBAuthor(bookName, author); while (isFindIndex) { if (!hasIndexNum.contains(indexNum)) { String contentUrl = baseUrl + indexListMatch.group(1); String indexName = indexListMatch.group(2); //查询章节内容 ResponseEntity forEntity2 = restTemplate.getForEntity(contentUrl, String.class); if (forEntity2.getStatusCode() == HttpStatus.OK) { String body3 = forEntity2.getBody(); Pattern contentPattten = Pattern.compile("章节错误,点此举报(.*)加入书签,方便阅读"); String start = "『章节错误,点此举报』"; String end = "『加入书签,方便阅读』"; String content = body3.substring(body3.indexOf(start) + start.length(), body3.indexOf(end)); //TODO插入章节目录和章节内容 BookIndex bookIndex = new BookIndex(); bookIndex.setIndexName(indexName); bookIndex.setIndexNum(indexNum); indexList.add(bookIndex); BookContent bookContent = new BookContent(); bookContent.setContent(content); bookContent.setIndexNum(indexNum); contentList.add(bookContent); } else { break; } } indexNum++; isFindIndex = indexListMatch.find(); } if (indexList.size() == 
contentList.size() && indexList.size() > 0) { bookService.saveBookAndIndexAndContent(book, indexList, contentList); } } } } } } } } } catch (Exception e) { e.printStackTrace(); } finally { matcher2.find(); isFind = matcher2.find();//find twice, because the same path matches twice scoreFind = scoreMatch.find(); isBookNameMatch = bookNameMatch.find(); isFindAuthor = authoreMatch.find(); } } } }