package xyz.zinglizingli.common.schedule;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.http.HttpStatus;
import org.springframework.http.ResponseEntity;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Service;
import org.springframework.web.client.RestTemplate;
import xyz.zinglizingli.books.po.Book;
import xyz.zinglizingli.books.po.BookContent;
import xyz.zinglizingli.books.po.BookIndex;
import xyz.zinglizingli.books.service.BookService;
import xyz.zinglizingli.books.util.ExcutorUtils;
import xyz.zinglizingli.common.utils.RestTemplateUtil;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
@Service
public class CrawlBooksSchedule {
private Logger log = LoggerFactory.getLogger(CrawlBooksSchedule.class);
@Autowired
private BookService bookService;
RestTemplate restTemplate = RestTemplateUtil.getInstance("utf-8");
@Value("${books.lowestScore}")
private Float lowestScore;
@Value("${crawl.website.type}")
private Byte websiteType;
private boolean isExcuting = false;
@Scheduled(fixedRate = 1000 * 60 * 60 * 3)
public void crawBquge11BooksAtDay() throws Exception {
if (!isExcuting) {
isExcuting = true;
log.debug("crawlBooksSchedule执行中。。。。。。。。。。。。");
while (true) {
try {
switch (websiteType) {
case 1: {
updateBiqudaoBooks(0);
break;
}
case 2: {
updateBiquTaBooks(0);
break;
}
}
Thread.sleep(1000 * 60 * 5);
} catch (Exception e) {
e.printStackTrace();
}
}
}
}
private void updateBiquTaBooks(int finalI) {
String baseUrl = "https://m.biquta.com";
String catBookListUrlBase = baseUrl + "/class/";
int page = 1;//起始页码
int totalPage = page;
String catBookListUrl = catBookListUrlBase + finalI + "/" + page + ".html";
String forObject = getByHttpClient(catBookListUrl);
if (forObject != null) {
//匹配分页数
Pattern pattern = Pattern.compile("value=\"(\\d+)/(\\d+)\"");
Matcher matcher = pattern.matcher(forObject);
boolean isFind = matcher.find();
System.out.println("匹配分页数" + isFind);
if (isFind) {
int currentPage = Integer.parseInt(matcher.group(1));
totalPage = Integer.parseInt(matcher.group(2));
//解析第一页书籍的数据
Pattern bookPatten = Pattern.compile("href=\"/(\\d+_\\d+)/\"");
parseBiquTaBook(bookPatten, forObject, finalI, baseUrl, true);
}
}
}
private void parseBiquTaBook(Pattern bookPatten, String forObject, int catNum, String baseUrl, boolean isUpdate) {
Matcher matcher2 = bookPatten.matcher(forObject);
boolean isFind = matcher2.find();
Pattern scorePatten = Pattern.compile("
(\\d+\\.\\d+)分
");
Matcher scoreMatch = scorePatten.matcher(forObject);
boolean scoreFind = scoreMatch.find();
Pattern bookNamePatten = Pattern.compile("([^/]+)
");
Matcher bookNameMatch = bookNamePatten.matcher(forObject);
boolean isBookNameMatch = bookNameMatch.find();
System.out.println("匹配书籍url" + isFind);
System.out.println("匹配分数" + scoreFind);
while (isFind && scoreFind && isBookNameMatch) {
try {
Float score = Float.parseFloat(scoreMatch.group(1));
if (score < lowestScore) {//数据库空间有限,暂时爬取8.0分以上的小说
continue;
}
String bokNum = matcher2.group(1);
String bookUrl = baseUrl + "/" + bokNum + "/";
String body = getByHttpClient(bookUrl);
if (body != null) {
String bookName = bookNameMatch.group(1);
Pattern authorPatten = Pattern.compile(">作者:([^/]+)<");
Matcher authoreMatch = authorPatten.matcher(body);
if (authoreMatch.find()) {
String author = authoreMatch.group(1);
Pattern statusPatten = Pattern.compile("状态:([^/]+)");
Matcher statusMatch = statusPatten.matcher(body);
if (statusMatch.find()) {
String status = statusMatch.group(1);
Pattern updateTimePatten = Pattern.compile("更新:(\\d+-\\d+-\\d+\\s\\d+:\\d+:\\d+)");
Matcher updateTimeMatch = updateTimePatten.matcher(body);
if (updateTimeMatch.find()) {
String updateTimeStr = updateTimeMatch.group(1);
SimpleDateFormat format = new SimpleDateFormat("yy-MM-dd HH:mm:ss");
Date updateTime = format.parse(updateTimeStr);
Pattern picPatten = Pattern.compile("
]+)\"\\s+onerror=\"this.src=");
Matcher picMather = picPatten.matcher(body);
if (picMather.find()) {
String picSrc = picMather.group(1);
Pattern descPatten = Pattern.compile("class=\"review\">([^<]+)");
Matcher descMatch = descPatten.matcher(body);
if (descMatch.find()) {
String desc = descMatch.group(1);
Book book = new Book();
book.setAuthor(author);
book.setCatid(catNum);
book.setBookDesc(desc);
book.setBookName(bookName);
book.setScore(score > 10 ? 8.0f : score);
book.setPicUrl(picSrc);
book.setBookStatus(status);
book.setUpdateTime(updateTime);
List indexList = new ArrayList<>();
List contentList = new ArrayList<>();
//读取目录
Pattern indexPatten = Pattern.compile("查看完整目录");
Matcher indexMatch = indexPatten.matcher(body);
if (indexMatch.find()) {
String indexUrl = baseUrl + indexMatch.group(1);
String body2 = getByHttpClient(indexUrl);
if (body2 != null) {
Pattern indexListPatten = Pattern.compile("([^/]+)");
Matcher indexListMatch = indexListPatten.matcher(body2);
boolean isFindIndex = indexListMatch.find();
int indexNum = 0;
//查询该书籍已存在目录号
List hasIndexNum = bookService.queryIndexCountByBookNameAndBAuthor(bookName, author);
//更新和插入分别开,插入只在凌晨做一次
if ((isUpdate && hasIndexNum.size() > 0) || (!isUpdate && hasIndexNum.size() == 0)) {
while (isFindIndex) {
if (!hasIndexNum.contains(indexNum)) {
String contentUrl = baseUrl + indexListMatch.group(1);
String indexName = indexListMatch.group(2);
//查询章节内容
String body3 = getByHttpClient(contentUrl);
if (body3 != null) {
Pattern contentPattten = Pattern.compile("章节错误,点此举报(.*)加入书签,方便阅读");
String start = "『章节错误,点此举报』";
String end = "『加入书签,方便阅读』";
String content = body3.substring(body3.indexOf(start) + start.length(), body3.indexOf(end));
//TODO插入章节目录和章节内容
BookIndex bookIndex = new BookIndex();
bookIndex.setIndexName(indexName);
bookIndex.setIndexNum(indexNum);
indexList.add(bookIndex);
BookContent bookContent = new BookContent();
bookContent.setContent(content);
bookContent.setIndexNum(indexNum);
contentList.add(bookContent);
} else {
break;
}
}
indexNum++;
isFindIndex = indexListMatch.find();
}
if (indexList.size() == contentList.size() && indexList.size() > 0) {
ExcutorUtils.excuteFixedTask(new Runnable() {
@Override
public void run() {
bookService.saveBookAndIndexAndContent(book, indexList, contentList);
}
});
}
}
}
}
}
}
}
}
}
}
} catch (Exception e) {
e.printStackTrace();
} finally {
matcher2.find();
isFind = matcher2.find();//需要找两次,应为有两个一样的路径匹配
scoreFind = scoreMatch.find();
isBookNameMatch = bookNameMatch.find();
}
}
}
private void updateBiqudaoBooks(int finalI) {
String baseUrl = "https://m.biqudao.com";
String catBookListUrlBase = baseUrl + "/bqgeclass/";
int page = 1;//起始页码
int totalPage = page;
String catBookListUrl = catBookListUrlBase + finalI + "/" + page + ".html";
String forObject = getByHttpClient(catBookListUrl);
if (forObject != null) {
//匹配分页数
Pattern pattern = Pattern.compile("value=\"(\\d+)/(\\d+)\"");
Matcher matcher = pattern.matcher(forObject);
boolean isFind = matcher.find();
System.out.println("匹配分页数" + isFind);
if (isFind) {
int currentPage = Integer.parseInt(matcher.group(1));
totalPage = Integer.parseInt(matcher.group(2));
//解析第一页书籍的数据
Pattern bookPatten = Pattern.compile("href=\"/(bqge\\d+)/\"");
//白天更新
parseBiquDaoBook(bookPatten, forObject, finalI, baseUrl, true);
}
}
}
//@Scheduled(cron = "0 0 2 * * ?")磁盘空间不足,暂时不抓新书
//暂定2小说,只爬分类前3本书,一共3*7=21本书,爬等以后书籍多了之后,会适当缩短更新间隔
public void crawBquge11BooksAtNight() throws Exception {
final String baseUrl = "https://m.biqudao.com";
log.debug("crawlBooksSchedule执行中。。。。。。。。。。。。");
//①爬分类列表的书籍url和总页数
// https:
////m.biquta.com/class/1/1.html
// https:
////m.biquta.com/class/2/1.html
// https:
////m.biquta.com/class/2/2.html
//
//
// https:
////m.biquta.com/class/2/2.html
//
//
//
//
//
//
//第一周期全部书拉取完后,可进行第二周期,只拉取前面几页的数据,拉取时间间隔变小
log.debug("crawlBooksSchedule循环执行中。。。。。。。。。。。。");
//List classIdList = new ArrayList<>(Arrays.asList(new Integer[]{1,2,3,4,5,6,7}));
// for (int i = 1; i <= 7; i++) {
// log.debug("crawlBooksSchedule分类"+i+"执行中。。。。。。。。。。。。");
// int finalI = i;
/* new Thread(
() -> {*/
try {
//先随机更新分类
//Random random = new Random();
//int finalI = classIdList.get(new Random().nextInt(classIdList.size()));
//classIdList.remove(finalI);
int finalI = 0;
//拼接分类URL
int page = 1;//起始页码
int totalPage = page;
String catBookListUrl = baseUrl + "/bqgeclass/" + finalI + "/" + page + ".html";
String forObject = getByHttpClient(catBookListUrl);
if (forObject != null) {
//匹配分页数
Pattern pattern = Pattern.compile("value=\"(\\d+)/(\\d+)\"");
Matcher matcher = pattern.matcher(forObject);
boolean isFind = matcher.find();
System.out.println("匹配分页数" + isFind);
if (isFind) {
int currentPage = Integer.parseInt(matcher.group(1));
totalPage = Integer.parseInt(matcher.group(2));
//解析第一页书籍的数据
Pattern bookPatten = Pattern.compile("href=\"/(bqge\\d+)/\"");
//晚上插入
parseBiquDaoBook(bookPatten, forObject, finalI, baseUrl, false);
while (currentPage < totalPage) {
if (new Date().getHours() > 5) {
break;
}
catBookListUrl = baseUrl + "/bqgeclass/" + finalI + "/" + (currentPage + 1) + ".html";
forObject = getByHttpClient(catBookListUrl);
if (forObject != null) {
//匹配分页数
matcher = pattern.matcher(forObject);
isFind = matcher.find();
if (isFind) {
currentPage = Integer.parseInt(matcher.group(1));
totalPage = Integer.parseInt(matcher.group(2));
parseBiquDaoBook(bookPatten, forObject, finalI, baseUrl, false);
}
} else {
currentPage++;
}
}
}
}
} catch (Exception e) {
e.printStackTrace();
}
/* }
).start();*/
// }
}
private void parseBiquDaoBook(Pattern bookPatten, String forObject, int catNum, String baseUrl, boolean isUpdate) {
Matcher matcher2 = bookPatten.matcher(forObject);
boolean isFind = matcher2.find();
Pattern scorePatten = Pattern.compile("(\\d+\\.\\d+)分
");
Matcher scoreMatch = scorePatten.matcher(forObject);
boolean scoreFind = scoreMatch.find();
Pattern bookNamePatten = Pattern.compile("([^/]+)
");
Matcher bookNameMatch = bookNamePatten.matcher(forObject);
boolean isBookNameMatch = bookNameMatch.find();
System.out.println("匹配书籍url" + isFind);
System.out.println("匹配分数" + scoreFind);
while (isFind && scoreFind && isBookNameMatch) {
try {
Float score = Float.parseFloat(scoreMatch.group(1));
if (score < lowestScore) {//数据库空间有限,暂时爬取8.0分以上的小说
continue;
}
String bokNum = matcher2.group(1);
String bookUrl = baseUrl + "/" + bokNum + "/";
String body = getByHttpClient(bookUrl);
if (body != null) {
String bookName = bookNameMatch.group(1);
Pattern authorPatten = Pattern.compile("作者:([^/]+)");
Matcher authoreMatch = authorPatten.matcher(body);
if (authoreMatch.find()) {
String author = authoreMatch.group(1);
Pattern statusPatten = Pattern.compile("状态:([^/]+)");
Matcher statusMatch = statusPatten.matcher(body);
if (statusMatch.find()) {
String status = statusMatch.group(1);
Pattern catPatten = Pattern.compile("类别:([^/]+)");
Matcher catMatch = catPatten.matcher(body);
if (catMatch.find()) {
String catName = catMatch.group(1);
switch (catName) {
case "玄幻奇幻": {
catNum = 1;
break;
}
case "武侠仙侠": {
catNum = 2;
break;
}
case "都市言情": {
catNum = 3;
break;
}
case "历史军事": {
catNum = 4;
break;
}
case "科幻灵异": {
catNum = 5;
break;
}
case "网游竞技": {
catNum = 6;
break;
}
case "女生频道": {
catNum = 7;
break;
}
default: {
catNum = 1;
break;
}
}
Pattern updateTimePatten = Pattern.compile("更新:(\\d+-\\d+-\\d+\\s\\d+:\\d+:\\d+)");
Matcher updateTimeMatch = updateTimePatten.matcher(body);
if (updateTimeMatch.find()) {
String updateTimeStr = updateTimeMatch.group(1);
SimpleDateFormat format = new SimpleDateFormat("yy-MM-dd HH:mm:ss");
Date updateTime = format.parse(updateTimeStr);
Pattern picPatten = Pattern.compile("
]+)\"\\s+onerror=\"this.src=");
Matcher picMather = picPatten.matcher(body);
if (picMather.find()) {
String picSrc = picMather.group(1);
Pattern descPatten = Pattern.compile("class=\"review\">([^<]+)");
Matcher descMatch = descPatten.matcher(body);
if (descMatch.find()) {
String desc = descMatch.group(1);
Book book = new Book();
book.setAuthor(author);
book.setCatid(catNum);
book.setBookDesc(desc);
book.setBookName(bookName);
book.setScore(score > 10 ? 8.0f : score);
book.setPicUrl(picSrc);
book.setBookStatus(status);
book.setUpdateTime(updateTime);
List indexList = new ArrayList<>();
List contentList = new ArrayList<>();
//读取目录
Pattern indexPatten = Pattern.compile("查看完整目录");
Matcher indexMatch = indexPatten.matcher(body);
if (indexMatch.find()) {
String indexUrl = baseUrl + indexMatch.group(1);
String body2 = getByHttpClient(indexUrl);
if (body2 != null) {
Pattern indexListPatten = Pattern.compile("([^/]+)");
Matcher indexListMatch = indexListPatten.matcher(body2);
boolean isFindIndex = indexListMatch.find();
int indexNum = 0;
//查询该书籍已存在目录号
List hasIndexNum = bookService.queryIndexCountByBookNameAndBAuthor(bookName, author);
//更新和插入分别开,插入只在凌晨做一次
if ((isUpdate && hasIndexNum.size() > 0) || (!isUpdate && hasIndexNum.size() == 0)) {
while (isFindIndex) {
if (!hasIndexNum.contains(indexNum)) {
String contentUrl = baseUrl + indexListMatch.group(1);
String indexName = indexListMatch.group(2);
//查询章节内容
String body3 = getByHttpClient(contentUrl);
if (body3 != null) {
Pattern contentPattten = Pattern.compile("章节错误,点此举报(.*)加入书签,方便阅读");
String start = "『章节错误,点此举报』";
String end = "『加入书签,方便阅读』";
String content = body3.substring(body3.indexOf(start) + start.length(), body3.indexOf(end));
//TODO插入章节目录和章节内容
BookIndex bookIndex = new BookIndex();
bookIndex.setIndexName(indexName);
bookIndex.setIndexNum(indexNum);
indexList.add(bookIndex);
BookContent bookContent = new BookContent();
bookContent.setContent(content);
bookContent.setIndexNum(indexNum);
contentList.add(bookContent);
} else {
break;
}
}
indexNum++;
isFindIndex = indexListMatch.find();
}
if (indexList.size() == contentList.size() && indexList.size() > 0) {
ExcutorUtils.excuteFixedTask(new Runnable() {
@Override
public void run() {
bookService.saveBookAndIndexAndContent(book, indexList, contentList);
}
});
}
}
}
}
}
}
}
}
}
}
}
} catch (Exception e) {
e.printStackTrace();
} finally {
matcher2.find();
isFind = matcher2.find();//需要找两次,应为有两个一样的路径匹配
scoreFind = scoreMatch.find();
isBookNameMatch = bookNameMatch.find();
}
}
}
private String getByHttpClient(String catBookListUrl) {
try {
/* HttpClient httpClient = new DefaultHttpClient();
HttpGet getReq = new HttpGet(catBookListUrl);
getReq.setHeader("user-agent", "Mozilla/5.0 (iPad; CPU OS 11_0 like Mac OS X) AppleWebKit/604.1.34 (KHTML, like Gecko) Version/11.0 Mobile/15A5341f Safari/604.1");
HttpResponse execute = httpClient.execute(getReq);
if (execute.getStatusLine().getStatusCode() == HttpStatus.OK.value()) {
HttpEntity entity = execute.getEntity();
return EntityUtils.toString(entity, "utf-8");
} else {
return null;
}*/
//经测试restTemplate比httpClient效率高出很多倍,所有选择restTemplate
ResponseEntity forEntity = restTemplate.getForEntity(catBookListUrl, String.class);
if (forEntity.getStatusCode() == HttpStatus.OK) {
return forEntity.getBody();
} else {
return null;
}
} catch (Exception e) {
e.printStackTrace();
return null;
}
}
/***
* 解析书籍详情之后的页面
*/
private void parseBook(Pattern bookPatten, String forObject, RestTemplate restTemplate, int catNum, String baseUrl) throws ParseException {
Matcher matcher2 = bookPatten.matcher(forObject);
boolean isFind = matcher2.find();
Pattern scorePatten = Pattern.compile("(\\d+\\.\\d+)分
");
Matcher scoreMatch = scorePatten.matcher(forObject);
boolean scoreFind = scoreMatch.find();
Pattern bookNamePatten = Pattern.compile("([^/]+)
");
Matcher bookNameMatch = bookNamePatten.matcher(forObject);
boolean isBookNameMatch = bookNameMatch.find();
Pattern authorPatten = Pattern.compile(">作者:([^/]+)<");
Matcher authoreMatch = authorPatten.matcher(forObject);
boolean isFindAuthor = authoreMatch.find();
System.out.println("匹配书籍url" + isFind);
System.out.println("匹配分数" + scoreFind);
while (isFind && scoreFind && isBookNameMatch && isFindAuthor) {
try {
Float score = Float.parseFloat(scoreMatch.group(1));
if (score < lowestScore) {//数据库空间有限,暂时爬取8.0分以上的小说
continue;
}
String bookName = bookNameMatch.group(1);
String author = authoreMatch.group(1);
String bokNum = matcher2.group(1);
String bookUrl = baseUrl + "/" + bokNum + "/";
ResponseEntity forEntity = restTemplate.getForEntity(bookUrl, String.class);
if (forEntity.getStatusCode() == HttpStatus.OK) {
String body = forEntity.getBody();
Pattern statusPatten = Pattern.compile("状态:([^/]+)");
Matcher statusMatch = statusPatten.matcher(body);
if (statusMatch.find()) {
String status = statusMatch.group(1);
Pattern updateTimePatten = Pattern.compile("更新:(\\d+-\\d+-\\d+\\s\\d+:\\d+:\\d+)");
Matcher updateTimeMatch = updateTimePatten.matcher(body);
if (updateTimeMatch.find()) {
String updateTimeStr = updateTimeMatch.group(1);
SimpleDateFormat format = new SimpleDateFormat("yy-MM-dd HH:mm:ss");
Date updateTime = format.parse(updateTimeStr);
Pattern picPatten = Pattern.compile("
]+)\"\\s+onerror=\"this.src=");
Matcher picMather = picPatten.matcher(body);
if (picMather.find()) {
String picSrc = picMather.group(1);
Pattern descPatten = Pattern.compile("class=\"review\">([^<]+)");
Matcher descMatch = descPatten.matcher(body);
if (descMatch.find()) {
String desc = descMatch.group(1);
Book book = new Book();
book.setAuthor(author);
book.setCatid(catNum);
book.setBookDesc(desc);
book.setBookName(bookName);
book.setScore(score > 10 ? 8.0f : score);
book.setPicUrl(picSrc);
book.setBookStatus(status);
book.setUpdateTime(updateTime);
List indexList = new ArrayList<>();
List contentList = new ArrayList<>();
//读取目录
Pattern indexPatten = Pattern.compile("查看完整目录");
Matcher indexMatch = indexPatten.matcher(body);
if (indexMatch.find()) {
String indexUrl = baseUrl + indexMatch.group(1);
ResponseEntity forEntity1 = restTemplate.getForEntity(indexUrl, String.class);
if (forEntity1.getStatusCode() == HttpStatus.OK) {
String body2 = forEntity1.getBody();
Pattern indexListPatten = Pattern.compile("([^/]+)");
Matcher indexListMatch = indexListPatten.matcher(body2);
boolean isFindIndex = indexListMatch.find();
int indexNum = 0;
//查询该书籍已存在目录号
List hasIndexNum = bookService.queryIndexCountByBookNameAndBAuthor(bookName, author);
while (isFindIndex) {
if (!hasIndexNum.contains(indexNum)) {
String contentUrl = baseUrl + indexListMatch.group(1);
String indexName = indexListMatch.group(2);
//查询章节内容
ResponseEntity forEntity2 = restTemplate.getForEntity(contentUrl, String.class);
if (forEntity2.getStatusCode() == HttpStatus.OK) {
String body3 = forEntity2.getBody();
Pattern contentPattten = Pattern.compile("章节错误,点此举报(.*)加入书签,方便阅读");
String start = "『章节错误,点此举报』";
String end = "『加入书签,方便阅读』";
String content = body3.substring(body3.indexOf(start) + start.length(), body3.indexOf(end));
//TODO插入章节目录和章节内容
BookIndex bookIndex = new BookIndex();
bookIndex.setIndexName(indexName);
bookIndex.setIndexNum(indexNum);
indexList.add(bookIndex);
BookContent bookContent = new BookContent();
bookContent.setContent(content);
bookContent.setIndexNum(indexNum);
contentList.add(bookContent);
} else {
break;
}
}
indexNum++;
isFindIndex = indexListMatch.find();
}
if (indexList.size() == contentList.size() && indexList.size() > 0) {
bookService.saveBookAndIndexAndContent(book, indexList, contentList);
}
}
}
}
}
}
}
}
} catch (Exception e) {
e.printStackTrace();
} finally {
matcher2.find();
isFind = matcher2.find();//需要找两次,应为有两个一样的路径匹配
scoreFind = scoreMatch.find();
isBookNameMatch = bookNameMatch.find();
isFindAuthor = authoreMatch.find();
}
}
}
}