增加单本采集任务

This commit is contained in:
xiongxiaoyang
2020-06-15 15:08:15 +08:00
parent 3cbb6bf3fb
commit c9c714e71e
17 changed files with 738 additions and 381 deletions

View File

@ -3,11 +3,12 @@ package com.java2nb.novel.controller;
import com.github.pagehelper.PageInfo;
import com.java2nb.novel.core.bean.ResultBean;
import com.java2nb.novel.core.utils.BeanUtil;
import com.java2nb.novel.entity.CrawlSingleTask;
import com.java2nb.novel.entity.CrawlSource;
import com.java2nb.novel.service.CrawlService;
import com.java2nb.novel.vo.CrawlSingleTaskVO;
import com.java2nb.novel.vo.CrawlSourceVO;
import lombok.RequiredArgsConstructor;
import org.springframework.stereotype.Controller;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RequestParam;
@ -56,6 +57,38 @@ public class CrawlController {
return ResultBean.ok();
}
/**
 * Creates a new single-book crawl task.
 *
 * @param singleTask task data bound from the request parameters
 * @return an empty success result
 */
@PostMapping("addCrawlSingleTask")
public ResultBean addCrawlSingleTask(final CrawlSingleTask singleTask) {
    // All validation and persistence is delegated to the service layer.
    crawlService.addCrawlSingleTask(singleTask);

    return ResultBean.ok();
}
/**
 * Returns one page of single-book crawl tasks.
 *
 * @param page     page number, read from request parameter {@code curr} (defaults to 1)
 * @param pageSize page size, read from request parameter {@code limit} (defaults to 10)
 * @return a success result wrapping a {@link PageInfo} of {@link CrawlSingleTaskVO}
 */
@PostMapping("listCrawlSingleTaskByPage")
public ResultBean listCrawlSingleTaskByPage(
        @RequestParam(value = "curr", defaultValue = "1") int page,
        @RequestParam(value = "limit", defaultValue = "10") int pageSize) {
    // NOTE(review): the PageInfo is built from the copied VO list, not from the list
    // returned by the service. If BeanUtil.copyList returns a plain list, the paging
    // totals may only reflect the current page - confirm.
    PageInfo<?> taskPage = new PageInfo<>(
            BeanUtil.copyList(crawlService.listCrawlSingleTaskByPage(page, pageSize), CrawlSingleTaskVO.class));
    return ResultBean.ok(taskPage);
}
/**
 * Deletes a single-book crawl task.
 *
 * @param id primary key of the task to delete
 * @return an empty success result
 */
@PostMapping("delCrawlSingleTask")
public ResultBean delCrawlSingleTask(final Long id) {
    // Removal itself is handled by the service layer.
    crawlService.delCrawlSingleTask(id);

    return ResultBean.ok();
}

View File

@ -3,10 +3,7 @@ package com.java2nb.novel.core.listener;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.java2nb.novel.core.crawl.CrawlParser;
import com.java2nb.novel.core.crawl.RuleBean;
import com.java2nb.novel.entity.Book;
import com.java2nb.novel.entity.BookContent;
import com.java2nb.novel.entity.BookIndex;
import com.java2nb.novel.entity.CrawlSource;
import com.java2nb.novel.entity.*;
import com.java2nb.novel.service.BookService;
import com.java2nb.novel.service.CrawlService;
import com.java2nb.novel.utils.Constants;
@ -40,15 +37,15 @@ public class StarterListener implements ServletContextListener {
@Override
public void contextInitialized(ServletContextEvent sce) {
log.info("程序启动,开始执行自动更新线程。。。");
for(int i = 0 ; i<updateThreadCount; i++) {
for (int i = 0; i < updateThreadCount; i++) {
new Thread(() -> {
log.info("程序启动,开始执行自动更新线程。。。");
while (true) {
try {
//1.查询最新目录更新时间在一个月之内的前100条需要更新的数据
Date currentDate = new Date();
Date startDate = DateUtils.addDays(currentDate, -30);
List<Book> bookList ;
List<Book> bookList;
synchronized (this) {
bookList = bookService.queryNeedUpdateBook(startDate, 100);
}
@ -61,7 +58,7 @@ public class StarterListener implements ServletContextListener {
Book book = CrawlParser.parseBook(ruleBean, needUpdateBook.getCrawlBookId());
//这里只做老书更新
book.setId(needUpdateBook.getId());
if(needUpdateBook.getPicUrl()!=null && needUpdateBook.getPicUrl().contains(Constants.LOCAL_PIC_PREFIX)) {
if (needUpdateBook.getPicUrl() != null && needUpdateBook.getPicUrl().contains(Constants.LOCAL_PIC_PREFIX)) {
//本地图片则不更新
book.setPicUrl(null);
}
@ -83,6 +80,42 @@ public class StarterListener implements ServletContextListener {
}
}).start();
}
// Background worker that processes single-book crawl tasks one at a time.
// Each round: fetch a task, crawl it, persist the outcome, then pause for a minute.
new Thread(() -> {
    log.info("程序启动,开始执行单本采集任务线程。。。");
    // The mapper is only used by this thread and can be reused across rounds.
    ObjectMapper ruleMapper = new ObjectMapper();
    while (true) {
        CrawlSingleTask task = null;
        // 0 = crawl failed (default), 1 = crawl succeeded
        byte crawlStatus = 0;
        try {
            // Fetch the next crawl task, if any
            task = crawlService.getCrawlSingleTask();
            if (task != null) {
                // Load the crawl rule of the task's source
                CrawlSource source = crawlService.queryCrawlSource(task.getSourceId());
                RuleBean ruleBean = ruleMapper.readValue(source.getCrawlRule(), RuleBean.class);
                if (crawlService.parseBookAndSave(task.getCatId(), ruleBean, task.getSourceId(), task.getSourceBookId())) {
                    // Crawl succeeded
                    crawlStatus = 1;
                }
            }
        } catch (Exception e) {
            log.error(e.getMessage(), e);
        }
        if (task != null) {
            // Persist the outcome right away (before sleeping) and never let a failure
            // here escape the loop, otherwise the worker thread would die for good.
            try {
                crawlService.updateCrawlSingleTask(task, crawlStatus);
            } catch (Exception e) {
                log.error(e.getMessage(), e);
            }
        }
        // Always pause between rounds, including after an error, so a persistent
        // failure (e.g. the task lookup throwing) cannot turn into a busy loop.
        try {
            Thread.sleep(1000 * 60);
        } catch (InterruptedException e) {
            // Restore the interrupt flag and stop the worker.
            Thread.currentThread().interrupt();
            return;
        }
    }
}).start();
}
}

View File

@ -1,6 +1,7 @@
package com.java2nb.novel.service;
import com.java2nb.novel.core.crawl.RuleBean;
import com.java2nb.novel.entity.CrawlSingleTask;
import com.java2nb.novel.entity.CrawlSource;
import java.util.List;
@ -39,6 +40,16 @@ public interface CrawlService {
* */
void updateCrawlSourceStatus(Integer sourceId, Byte sourceStatus);
/**
 * Crawls a novel and saves it.
 *
 * @param catId    category ID
 * @param ruleBean crawl rule
 * @param sourceId crawl source ID
 * @param bookId   novel ID
 * @return {@code true} on success, {@code false} on failure
 */
boolean parseBookAndSave(int catId, RuleBean ruleBean, Integer sourceId, String bookId);
/**
* 根据爬虫状态查询爬虫源集合
* @param sourceStatus 状态0关闭1开启
@ -61,4 +72,37 @@ public interface CrawlService {
* @return 源信息
* */
CrawlSource queryCrawlSource(Integer sourceId);
/**
 * Adds a single-book crawl task.
 *
 * @param singleTask task information object
 */
void addCrawlSingleTask(CrawlSingleTask singleTask);
/**
 * Queries single-book crawl tasks page by page.
 *
 * @param page     current page number
 * @param pageSize page size
 * @return list of single-book crawl tasks
 */
List<CrawlSingleTask> listCrawlSingleTaskByPage(int page, int pageSize);
/**
 * Deletes a crawl task.
 *
 * @param id task ID
 */
void delCrawlSingleTask(Long id);
/**
 * Fetches a crawl task to process.
 *
 * @return the crawl task; callers check the result for {@code null} before using it
 */
CrawlSingleTask getCrawlSingleTask();
/**
 * Updates a single-book crawl task after a crawl attempt.
 *
 * @param task   the crawl task
 * @param status crawl status of the attempt
 */
void updateCrawlSingleTask(CrawlSingleTask task, Byte status);
}

View File

@ -6,12 +6,12 @@ import com.java2nb.novel.core.cache.CacheKey;
import com.java2nb.novel.core.cache.CacheService;
import com.java2nb.novel.core.crawl.CrawlParser;
import com.java2nb.novel.core.crawl.RuleBean;
import com.java2nb.novel.core.enums.ResponseStatus;
import com.java2nb.novel.core.exception.BusinessException;
import com.java2nb.novel.core.utils.IdWorker;
import com.java2nb.novel.core.utils.SpringUtil;
import com.java2nb.novel.core.utils.ThreadUtil;
import com.java2nb.novel.entity.Book;
import com.java2nb.novel.entity.BookContent;
import com.java2nb.novel.entity.BookIndex;
import com.java2nb.novel.entity.*;
import com.java2nb.novel.entity.CrawlSource;
import com.java2nb.novel.mapper.*;
import com.java2nb.novel.service.BookService;
@ -33,8 +33,7 @@ import static com.java2nb.novel.core.utils.HttpUtil.getByHttpClient;
import static com.java2nb.novel.mapper.BookDynamicSqlSupport.crawlBookId;
import static com.java2nb.novel.mapper.BookDynamicSqlSupport.crawlSourceId;
import static com.java2nb.novel.mapper.CrawlSourceDynamicSqlSupport.*;
import static org.mybatis.dynamic.sql.SqlBuilder.isEqualTo;
import static org.mybatis.dynamic.sql.SqlBuilder.update;
import static org.mybatis.dynamic.sql.SqlBuilder.*;
import static org.mybatis.dynamic.sql.select.SelectDSL.select;
/**
@ -48,6 +47,8 @@ public class CrawlServiceImpl implements CrawlService {
private final CrawlSourceMapper crawlSourceMapper;
private final CrawlSingleTaskMapper crawlSingleTaskMapper;
private final BookService bookService;
@ -140,6 +141,62 @@ public class CrawlServiceImpl implements CrawlService {
return crawlSourceMapper.selectMany(render).get(0);
}
/**
 * Stores a new single-book crawl task, stamped with the current time.
 *
 * @param singleTask task to store; its book name and author name are used for the
 *                   existence check
 * @throws BusinessException with {@code ResponseStatus.BOOK_EXISTS} when a book with
 *                           the same name and author already exists
 */
@Override
public void addCrawlSingleTask(CrawlSingleTask singleTask) {
    boolean bookAlreadyExists = bookService.queryIsExistByBookNameAndAuthorName(
            singleTask.getBookName(), singleTask.getAuthorName());
    if (bookAlreadyExists) {
        throw new BusinessException(ResponseStatus.BOOK_EXISTS);
    }

    Date now = new Date();
    singleTask.setCreateTime(now);
    crawlSingleTaskMapper.insertSelective(singleTask);
}
/**
 * Lists single-book crawl tasks, newest first, limited to the requested page.
 *
 * @param page     page number handed to PageHelper
 * @param pageSize page size handed to PageHelper
 * @return the tasks selected for that page
 */
@Override
public List<CrawlSingleTask> listCrawlSingleTaskByPage(int page, int pageSize) {
    // Must be called before the query is executed so that it gets paginated.
    PageHelper.startPage(page, pageSize);
    return crawlSingleTaskMapper.selectMany(
            select(CrawlSingleTaskDynamicSqlSupport.crawlSingleTask.allColumns())
                    .from(CrawlSingleTaskDynamicSqlSupport.crawlSingleTask)
                    .orderBy(CrawlSingleTaskDynamicSqlSupport.createTime.descending())
                    .build()
                    .render(RenderingStrategies.MYBATIS3));
}
/**
 * Deletes the single-book crawl task with the given primary key.
 *
 * @param id primary key of the task
 */
@Override
public void delCrawlSingleTask(final Long id) {
    // Plain delete by primary key; nothing else is touched.
    crawlSingleTaskMapper.deleteByPrimaryKey(id);
}
/**
 * Picks the next single-book crawl task to process: the task with status 2 that has
 * the earliest creation time.
 *
 * @return that task, or {@code null} when no task has status 2
 */
@Override
public CrawlSingleTask getCrawlSingleTask() {
    // NOTE(review): status 2 presumably means "waiting to be crawled" - confirm.
    SelectStatementProvider oldestOpenTaskQuery =
            select(CrawlSingleTaskDynamicSqlSupport.crawlSingleTask.allColumns())
                    .from(CrawlSingleTaskDynamicSqlSupport.crawlSingleTask)
                    .where(CrawlSingleTaskDynamicSqlSupport.taskStatus, isEqualTo((byte) 2))
                    .orderBy(CrawlSingleTaskDynamicSqlSupport.createTime)
                    .limit(1)
                    .build()
                    .render(RenderingStrategies.MYBATIS3);
    List<CrawlSingleTask> candidates = crawlSingleTaskMapper.selectMany(oldestOpenTaskQuery);
    if (candidates.isEmpty()) {
        return null;
    }
    return candidates.get(0);
}
/**
 * Records one crawl attempt for the task and, when appropriate, finalises its status.
 *
 * <p>The attempt counter is always incremented. The task status is only overwritten
 * with {@code status} when the crawl succeeded ({@code status == 1}) or the attempt
 * limit has been reached; otherwise the status is left untouched so that the task is
 * picked up again.
 *
 * @param task   the task that was just attempted; its attempt counter (and possibly
 *               its status) is modified and then persisted
 * @param status outcome of the attempt, 1 meaning success
 */
@Override
public void updateCrawlSingleTask(CrawlSingleTask task, Byte status) {
    // Maximum number of attempts before the task is given up on.
    final int maxAttempts = 5;

    // Treat a missing counter as "no attempts yet" instead of failing on unboxing.
    Byte previousCount = task.getExcCount();
    byte excCount = (byte) ((previousCount == null ? 0 : previousCount) + 1);
    task.setExcCount(excCount);

    // ">=" rather than "==": if the counter ever ends up beyond the limit the task
    // must still reach a final status instead of being retried indefinitely.
    if (status == 1 || excCount >= maxAttempts) {
        // Crawl succeeded or the attempt limit is reached: store the final status,
        // which stops further crawling of this task.
        task.setTaskStatus(status);
    }
    crawlSingleTaskMapper.updateByPrimaryKeySelective(task);
}
/**
* 解析分类列表
*/
@ -173,35 +230,7 @@ public class CrawlServiceImpl implements CrawlService {
String bookId = bookIdMatcher.group(1);
Book book = CrawlParser.parseBook(ruleBean, bookId);
//这里只做新书入库,查询是否存在这本书
Book existBook = bookService.queryBookByBookNameAndAuthorName(book.getBookName(), book.getAuthorName());
//如果该小说不存在则可以解析入库但是标记该小说正在入库30分钟之后才允许再次入库
if (existBook == null) {
//没有该书,可以入库
book.setCatId(catId);
//根据分类ID查询分类
book.setCatName(bookService.queryCatNameByCatId(catId));
if (catId == 7) {
//女频
book.setWorkDirection((byte) 1);
} else {
//男频
book.setWorkDirection((byte) 0);
}
book.setCrawlBookId(bookId);
book.setCrawlSourceId(sourceId);
book.setCrawlLastTime(new Date());
book.setId(new IdWorker().nextId());
//解析章节目录
Map<Integer, List> indexAndContentList = CrawlParser.parseBookIndexAndContent(bookId, book, ruleBean, new HashMap<>(0));
bookService.saveBookAndIndexAndContent(book, (List<BookIndex>) indexAndContentList.get(CrawlParser.BOOK_INDEX_LIST_KEY), (List<BookContent>) indexAndContentList.get(CrawlParser.BOOK_CONTENT_LIST_KEY));
} else {
//只更新书籍的爬虫相关字段
bookService.updateCrawlProperties(existBook.getId(), sourceId, bookId);
}
parseBookAndSave(catId, ruleBean, sourceId, bookId);
} catch (Exception e) {
log.error(e.getMessage(), e);
}
@ -232,6 +261,43 @@ public class CrawlServiceImpl implements CrawlService {
}
/**
 * Parses one book from a crawl source and stores it.
 *
 * <p>If no book with the parsed name and author exists yet, the book is stored together
 * with its chapter index and contents. If it already exists, only its crawl-related
 * properties are updated.
 *
 * @param catId    category ID assigned to a newly stored book
 * @param ruleBean crawl rule used for parsing
 * @param sourceId ID of the crawl source
 * @param bookId   ID of the book on the crawl source
 * @return {@code false} when the parsed book has no name or no author, {@code true} otherwise
 */
@Override
public boolean parseBookAndSave(int catId, RuleBean ruleBean, Integer sourceId, String bookId) {
    Book book = CrawlParser.parseBook(ruleBean, bookId);
    if (book.getBookName() == null) {
        return false;
    }
    if (book.getAuthorName() == null) {
        return false;
    }

    // Only new books are stored in full, so look for an existing one first.
    Book existBook = bookService.queryBookByBookNameAndAuthorName(book.getBookName(), book.getAuthorName());
    if (existBook != null) {
        // Already stored: only refresh the crawl-related fields of the book.
        bookService.updateCrawlProperties(existBook.getId(), sourceId, bookId);
        return true;
    }

    // Not stored yet: fill in category and crawl metadata.
    book.setCatId(catId);
    book.setCatName(bookService.queryCatNameByCatId(catId));
    // Category 7 is the female channel (1); every other category is the male channel (0).
    book.setWorkDirection(catId == 7 ? (byte) 1 : (byte) 0);
    book.setCrawlBookId(bookId);
    book.setCrawlSourceId(sourceId);
    book.setCrawlLastTime(new Date());
    book.setId(new IdWorker().nextId());

    // Parse the chapter index and chapter contents, then persist everything together.
    Map<Integer, List> parsed = CrawlParser.parseBookIndexAndContent(bookId, book, ruleBean, new HashMap<>(0));
    List<BookIndex> indexList = (List<BookIndex>) parsed.get(CrawlParser.BOOK_INDEX_LIST_KEY);
    List<BookContent> contentList = (List<BookContent>) parsed.get(CrawlParser.BOOK_CONTENT_LIST_KEY);
    bookService.saveBookAndIndexAndContent(book, indexList, contentList);
    return true;
}
@Override
public void updateCrawlSourceStatus(Integer sourceId, Byte sourceStatus) {
CrawlSource source = new CrawlSource();

View File

@ -0,0 +1,26 @@
package com.java2nb.novel.vo;
import com.fasterxml.jackson.annotation.JsonFormat;
import com.java2nb.novel.entity.CrawlSingleTask;
import com.java2nb.novel.entity.CrawlSource;
import lombok.Data;
import java.util.Date;
/**
 * View object for a single-book crawl task, used when returning tasks as JSON.
 *
 * @author Administrator
 */
@Data
public class CrawlSingleTaskVO extends CrawlSingleTask {

    /**
     * Creation time, serialised as {@code yyyy-MM-dd HH:mm} in the GMT+8 time zone.
     * NOTE(review): presumably re-declared here only to attach the JSON format - confirm.
     */
    @JsonFormat(pattern = "yyyy-MM-dd HH:mm", timezone = "GMT+8")
    private Date createTime;

    /**
     * Uses the parent's string representation.
     */
    @Override
    public String toString() {
        return super.toString();
    }
}