Upload code

xxy
2020-05-02 15:05:21 +08:00
parent c8c80fa719
commit ed34c67d08
733 changed files with 61899 additions and 0 deletions

View File

@@ -0,0 +1,25 @@
package com.java2nb.novel;
import org.mybatis.spring.annotation.MapperScan;
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.boot.web.servlet.ServletComponentScan;
import org.springframework.cache.annotation.EnableCaching;
import org.springframework.scheduling.annotation.EnableScheduling;
/**
* @author Administrator
*/
@SpringBootApplication
@EnableCaching
@EnableScheduling
@ServletComponentScan
@MapperScan(basePackages = {"com.java2nb.novel.mapper"})
public class CrawlNovelApplication {
public static void main(String[] args) {
SpringApplication.run(CrawlNovelApplication.class, args);
}
}

View File

@@ -0,0 +1,62 @@
package com.java2nb.novel.controller;
import com.github.pagehelper.PageInfo;
import com.java2nb.novel.core.bean.ResultBean;
import com.java2nb.novel.core.utils.BeanUtil;
import com.java2nb.novel.entity.CrawlSource;
import com.java2nb.novel.service.CrawlService;
import com.java2nb.novel.vo.CrawlSourceVO;
import lombok.RequiredArgsConstructor;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.bind.annotation.RestController;
/**
* @author Administrator
*/
@RestController
@RequestMapping("crawl")
@RequiredArgsConstructor
public class CrawlController {
private final CrawlService crawlService;
/**
* Create a new crawl source
* */
@PostMapping("addCrawlSource")
public ResultBean addCrawlSource(CrawlSource source) {
crawlService.addCrawlSource(source);
return ResultBean.ok();
}
/**
* Paged list query of crawl sources
* */
@PostMapping("listCrawlByPage")
public ResultBean listCrawlByPage(@RequestParam(value = "curr", defaultValue = "1") int page, @RequestParam(value = "limit", defaultValue = "10") int pageSize){
return ResultBean.ok(new PageInfo<>(BeanUtil.copyList(crawlService.listCrawlByPage(page,pageSize), CrawlSourceVO.class)
));
}
/**
* Start or stop a crawler
* */
@PostMapping("openOrCloseCrawl")
public ResultBean openOrCloseCrawl(Integer sourceId, Byte sourceStatus) {
crawlService.openOrCloseCrawl(sourceId,sourceStatus);
return ResultBean.ok();
}
}

View File

@@ -0,0 +1,50 @@
package com.java2nb.novel.controller;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Controller;
import org.springframework.web.bind.annotation.PathVariable;
import org.springframework.web.bind.annotation.RequestMapping;
/**
* @author 11797
*/
@Slf4j
@RequiredArgsConstructor
@Controller
public class PageController {
@RequestMapping("{url}.html")
public String module(@PathVariable("url") String url) {
return url;
}
@RequestMapping("{module}/{url}.html")
public String module2(@PathVariable("module") String module, @PathVariable("url") String url) {
return module + "/" + url;
}
@RequestMapping("{module}/{classify}/{url}.html")
public String module3(@PathVariable("module") String module, @PathVariable("classify") String classify, @PathVariable("url") String url) {
return module + "/" + classify + "/" + url;
}
/**
* Home page
* */
@RequestMapping(path = {"/", "/index", "/index.html"})
public String index() {
return "crawl/crawlSource_list";
}
}
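
For illustration, the three handlers above map pretty URLs directly onto view names, one path segment per level (the example paths below are hypothetical):

// GET /about.html           -> view "about"           (module)
// GET /user/login.html      -> view "user/login"      (module2)
// GET /book/cat/1.html      -> view "book/cat/1"      (module3)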

View File

@@ -0,0 +1,64 @@
package com.java2nb.novel.core.config;
import lombok.RequiredArgsConstructor;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import org.springframework.security.config.annotation.authentication.builders.AuthenticationManagerBuilder;
import org.springframework.security.config.annotation.web.builders.HttpSecurity;
import org.springframework.security.config.annotation.web.configuration.EnableWebSecurity;
import org.springframework.security.config.annotation.web.configuration.WebSecurityConfigurerAdapter;
import org.springframework.security.core.userdetails.User;
import org.springframework.security.crypto.bcrypt.BCryptPasswordEncoder;
import org.springframework.security.crypto.password.PasswordEncoder;
/**
* Spring Security configuration
* @author Administrator
*/
@Configuration
@EnableWebSecurity
@RequiredArgsConstructor
public class SecurityConfiguration extends WebSecurityConfigurerAdapter {
@Value("${admin.username}")
private String username;
@Value("${admin.password}")
private String password;
@Bean
public PasswordEncoder passwordEncoder() {
return new BCryptPasswordEncoder();
}
@Override
public void configure(AuthenticationManagerBuilder auth) throws Exception {
User.UserBuilder builder = User.builder().passwordEncoder(passwordEncoder()::encode);
auth.inMemoryAuthentication().withUser(builder.username(username).password(password).roles("ADMIN").build());
}
@Override
protected void configure(HttpSecurity http) throws Exception {
http.csrf().disable()//disable CSRF protection
.authorizeRequests()//restrict the requests below to authenticated users
.antMatchers("/**").hasRole("ADMIN")
.anyRequest().permitAll()//allow any request not matched above (note: "/**" already matches every request)
.and().anonymous()//allow anonymous access where no rule applies
.and().formLogin()//use Spring Security's default login page
.and().httpBasic();//enable HTTP Basic authentication
}
}
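
A minimal sketch of what the in-memory authentication above builds at runtime; the literal credentials stand in for the ${admin.username} and ${admin.password} properties and are assumptions:

import org.springframework.security.core.userdetails.User;
import org.springframework.security.core.userdetails.UserDetails;
import org.springframework.security.crypto.bcrypt.BCryptPasswordEncoder;
import org.springframework.security.crypto.password.PasswordEncoder;
class InMemoryUserSketch {
public static void main(String[] args) {
PasswordEncoder encoder = new BCryptPasswordEncoder();
UserDetails admin = User.builder()
.passwordEncoder(encoder::encode) //hash the raw password before storing it
.username("admin") //placeholder for ${admin.username}
.password("secret") //placeholder for ${admin.password}
.roles("ADMIN")
.build();
System.out.println(encoder.matches("secret", admin.getPassword())); //true
}
}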

View File

@@ -0,0 +1,77 @@
package com.java2nb.novel.core.listener;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.java2nb.novel.core.crawl.CrawlParser;
import com.java2nb.novel.core.crawl.RuleBean;
import com.java2nb.novel.entity.Book;
import com.java2nb.novel.entity.BookContent;
import com.java2nb.novel.entity.BookIndex;
import com.java2nb.novel.entity.CrawlSource;
import com.java2nb.novel.service.BookService;
import com.java2nb.novel.service.CrawlService;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.time.DateUtils;
import javax.servlet.ServletContextEvent;
import javax.servlet.ServletContextListener;
import javax.servlet.annotation.WebListener;
import java.util.Date;
import java.util.List;
import java.util.Map;
/**
* @author Administrator
*/
@WebListener
@Slf4j
@RequiredArgsConstructor
public class StarterListener implements ServletContextListener {
private final BookService bookService;
private final CrawlService crawlService;
@Override
public void contextInitialized(ServletContextEvent sce) {
log.info("Application started; launching the auto-update thread...");
new Thread(() -> {
while (true) {
try {
//1. Query the first 100 books needing an update whose latest catalog update falls within the past month
Date currentDate = new Date();
Date startDate = DateUtils.addDays(currentDate, -30);
List<Book> bookList = bookService.queryNeedUpdateBook(startDate, 100);
for (Book needUpdateBook : bookList) {
try {
//Query the crawl source rules
CrawlSource source = crawlService.queryCrawlSource(needUpdateBook.getCrawlSourceId());
RuleBean ruleBean = new ObjectMapper().readValue(source.getCrawlRule(), RuleBean.class);
//Parse the book's basic metadata
Book book = CrawlParser.parseBook(ruleBean, needUpdateBook.getCrawlBookId());
//Only existing books are updated here
book.setCrawlLastTime(currentDate);
book.setId(needUpdateBook.getId());
//Query the chapters that already exist
Map<Integer, BookIndex> existBookIndexMap = bookService.queryExistBookIndexMap(needUpdateBook.getId());
//Parse the chapter catalog
Map<Integer, List> indexAndContentList = CrawlParser.parseBookIndexAndContent(needUpdateBook.getCrawlBookId(),book, ruleBean, existBookIndexMap);
bookService.updateBookAndIndexAndContent(book, (List<BookIndex>) indexAndContentList.get(CrawlParser.BOOK_INDEX_LIST_KEY), (List<BookContent>) indexAndContentList.get(CrawlParser.BOOK_CONTENT_LIST_KEY));
} catch (Exception e) {
log.error(e.getMessage(), e);
//Parsing was interrupted by an exception; still update the book's last crawl time
bookService.updateCrawlLastTime(needUpdateBook.getId());
}
}
Thread.sleep(1000 * 60 * 10);
} catch (Exception e) {
log.error(e.getMessage(), e);
}
}
}).start();
}
}

View File

@@ -0,0 +1,63 @@
package com.java2nb.novel.core.schedule;
import com.java2nb.novel.core.cache.CacheKey;
import com.java2nb.novel.core.cache.CacheService;
import com.java2nb.novel.core.utils.ThreadUtil;
import com.java2nb.novel.entity.CrawlSource;
import com.java2nb.novel.service.CrawlService;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Service;
import java.util.List;
import java.util.Set;
/**
* Crawler thread monitor: watches for crawl sources whose threads have finished and updates their status
*
* @author Administrator
*/
@Service
@RequiredArgsConstructor
@Slf4j
public class CrawlThreadMonitor {
private final CacheService cacheService;
private final CrawlService crawlService;
@Scheduled(fixedRate = 1000 * 60 * 5)
public void monitor() {
//Query the running crawl sources that need monitoring
List<CrawlSource> sources = crawlService.queryCrawlSourceByStatus((byte) 1);
for (CrawlSource source : sources) {
Set<Long> runningCrawlThreadIds = (Set<Long>) cacheService.getObject(CacheKey.RUNNING_CRAWL_THREAD_KEY_PREFIX + source.getId());
boolean sourceStop = true;
if (runningCrawlThreadIds != null) {
for (Long threadId : runningCrawlThreadIds) {
Thread thread = ThreadUtil.findThread(threadId);
if (thread != null && thread.isAlive()) {
//An alive thread means this crawl source is still running and the database status is correct; no change needed
sourceStop = false;
}
}
}
if (sourceStop) {
crawlService.updateCrawlSourceStatus(source.getId(), (byte) 0);
}
}
}
}
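
ThreadUtil.findThread is a project utility whose source is not part of this commit; a plausible equivalent, assuming it looks a live thread up by ID, is:

//Assumed sketch of com.java2nb.novel.core.utils.ThreadUtil#findThread
public static Thread findThread(long threadId) {
//Thread.getAllStackTraces() returns a snapshot of all live threads
for (Thread t : Thread.getAllStackTraces().keySet()) {
if (t.getId() == threadId) {
return t;
}
}
//the thread has already terminated, which is exactly what the monitor checks for
return null;
}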

View File

@@ -0,0 +1,16 @@
package com.java2nb.novel.mapper;
import com.java2nb.novel.entity.BookIndex;
import org.apache.ibatis.annotations.Param;
/**
* @author Administrator
*/
public interface CrawlBookIndexMapper extends BookIndexMapper {
/**
* Query the latest chapter
* */
BookIndex queryLastIndex(@Param("bookId") Long bookId);
}
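
queryLastIndex carries no annotation here, so its SQL lives in an XML mapping elsewhere in the repository; an annotation-based equivalent might read as follows (the table and column names are assumptions based on the BookIndex entity fields):

import org.apache.ibatis.annotations.Select;
//Hypothetical inline mapping for queryLastIndex
@Select("SELECT id, index_num, index_name, update_time FROM book_index "
+ "WHERE book_id = #{bookId} ORDER BY index_num DESC LIMIT 1")
BookIndex queryLastIndex(@Param("bookId") Long bookId);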

View File

@@ -0,0 +1,28 @@
package com.java2nb.novel.mapper;
import com.java2nb.novel.entity.Book;
import org.apache.ibatis.annotations.Param;
import java.util.Date;
import java.util.List;
/**
* @author Administrator
*/
public interface CrawlBookMapper extends BookMapper {
/**
* Query the books that need updating
* @param startDate start of the latest-update-time window
* @param limit maximum number of rows to return
* @return list of books
* */
List<Book> queryNeedUpdateBook(@Param("startDate") Date startDate, @Param("limit") int limit);
/**
* Query a book's total word count
* @param bookId book ID
* @return total word count
* */
Integer queryTotalWordCount(@Param("bookId") Long bookId);
}

View File

@@ -0,0 +1,77 @@
package com.java2nb.novel.service;
import com.java2nb.novel.entity.Book;
import com.java2nb.novel.entity.BookContent;
import com.java2nb.novel.entity.BookIndex;
import java.util.Date;
import java.util.List;
import java.util.Map;
/**
* @author Administrator
*/
public interface BookService {
/**
* Check whether a book with the given name and author already exists
* @param bookName book name
* @param authorName author name
* @return whether such a book exists
*/
boolean queryIsExistByBookNameAndAuthorName(String bookName, String authorName);
/**
* Update a book's crawl properties
* @param sourceId crawl source ID
* @param bookId book ID on the source site
* */
void updateCrawlProperties(Integer sourceId, String bookId);
/**
* Query a category name by category ID
* @param catId category ID
* @return category name
* */
String queryCatNameByCatId(int catId);
/**
* Save the book, catalog, and content table rows
* @param book book data
* @param bookIndexList catalog entries
* @param bookContentList content entries
* */
void saveBookAndIndexAndContent(Book book, List<BookIndex> bookIndexList, List<BookContent> bookContentList);
/**
* Query the books that need updating
*
* @param startDate start of the latest-update-time window
* @param limit maximum number of rows to return
* @return list of books
* */
List<Book> queryNeedUpdateBook(Date startDate, int limit);
/**
* Query the chapters that already exist
* @param bookId book ID
* @return map from chapter number to chapter data
* */
Map<Integer,BookIndex> queryExistBookIndexMap(Long bookId);
/**
* Update the book, catalog, and content table rows
* @param book book data
* @param bookIndexList catalog entries
* @param bookContentList content entries
* */
void updateBookAndIndexAndContent(Book book, List<BookIndex> bookIndexList, List<BookContent> bookContentList);
/**
* Update the book's last crawl time
* @param bookId book ID
* */
void updateCrawlLastTime(Long bookId);
}

View File

@@ -0,0 +1,64 @@
package com.java2nb.novel.service;
import com.java2nb.novel.core.crawl.RuleBean;
import com.java2nb.novel.entity.CrawlSource;
import java.util.List;
/**
* @author Administrator
*/
public interface CrawlService {
/**
* Create a new crawl source
* @param source submitted crawl source data object
* */
void addCrawlSource(CrawlSource source);
/**
* Paged list of crawl sources
* @param page current page number
* @param pageSize page size
* @return list of crawl sources
* */
List<CrawlSource> listCrawlByPage(int page, int pageSize);
/**
* Start or stop a crawler
* @param sourceId crawl source ID
* @param sourceStatus status: 0 = stopped, 1 = running
* */
void openOrCloseCrawl(Integer sourceId, Byte sourceStatus);
/**
* Update a crawl source's status
* @param sourceId crawl source ID
* @param sourceStatus status: 0 = stopped, 1 = running
* */
void updateCrawlSourceStatus(Integer sourceId, Byte sourceStatus);
/**
* Query crawl sources by status
* @param sourceStatus status: 0 = stopped, 1 = running
* @return list of matching crawl sources
* */
List<CrawlSource> queryCrawlSourceByStatus(Byte sourceStatus);
/**
* Parse a category's book list using the given rules
* @param catId category ID
* @param ruleBean rule object
* @param sourceId crawl source ID
*/
void parseBookList(int catId, RuleBean ruleBean, Integer sourceId);
/**
* Query a crawl source
* @param sourceId source ID
* @return source data
* */
CrawlSource queryCrawlSource(Integer sourceId);
}

View File

@@ -0,0 +1,194 @@
package com.java2nb.novel.service.impl;
import com.java2nb.novel.core.utils.IdWorker;
import com.java2nb.novel.entity.Book;
import com.java2nb.novel.entity.BookContent;
import com.java2nb.novel.entity.BookIndex;
import com.java2nb.novel.mapper.*;
import com.java2nb.novel.service.BookService;
import lombok.RequiredArgsConstructor;
import org.mybatis.dynamic.sql.render.RenderingStrategies;
import org.springframework.stereotype.Service;
import org.springframework.transaction.annotation.Transactional;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.function.Function;
import java.util.stream.Collectors;
import static com.java2nb.novel.mapper.BookDynamicSqlSupport.crawlBookId;
import static com.java2nb.novel.mapper.BookDynamicSqlSupport.crawlSourceId;
import static com.java2nb.novel.mapper.CrawlSourceDynamicSqlSupport.id;
import static org.mybatis.dynamic.sql.SqlBuilder.*;
import static org.mybatis.dynamic.sql.select.SelectDSL.select;
/**
* @author Administrator
*/
@Service
@RequiredArgsConstructor
public class BookServiceImpl implements BookService {
private final CrawlBookMapper bookMapper;
private final BookCategoryMapper bookCategoryMapper;
private final CrawlBookIndexMapper bookIndexMapper;
private final BookContentMapper bookContentMapper;
@Override
public boolean queryIsExistByBookNameAndAuthorName(String bookName, String authorName) {
return bookMapper.count(countFrom(BookDynamicSqlSupport.book).where(BookDynamicSqlSupport.bookName, isEqualTo(bookName))
.and(BookDynamicSqlSupport.authorName, isEqualTo(authorName))
.build()
.render(RenderingStrategies.MYBATIS3))>0;
}
@Override
public void updateCrawlProperties(Integer sourceId, String bookId) {
//Note: this statement renders without a WHERE clause, so it updates the crawl fields of every row in the book table
bookMapper.update(update(BookDynamicSqlSupport.book)
.set(crawlSourceId)
.equalTo(sourceId)
.set(crawlBookId)
.equalTo(bookId)
.build()
.render(RenderingStrategies.MYBATIS3));
}
@Override
public String queryCatNameByCatId(int catId) {
return bookCategoryMapper.selectMany(select(BookCategoryDynamicSqlSupport.name)
.from(BookCategoryDynamicSqlSupport.bookCategory)
.where(id, isEqualTo(catId))
.build()
.render(RenderingStrategies.MYBATIS3)).get(0).getName();
}
@Transactional(rollbackFor = Exception.class)
@Override
public void saveBookAndIndexAndContent(Book book, List<BookIndex> bookIndexList, List<BookContent> bookContentList) {
if (!queryIsExistByBookNameAndAuthorName(book.getBookName(), book.getAuthorName())) {
if (bookIndexList.size() > 0) {
if (book.getId() == null) {
book.setId(new IdWorker().nextId());
}
//Save the main book row
bookMapper.insertSelective(book);
//Batch-save the catalog and content rows
bookIndexMapper.insertMultiple(bookIndexList);
bookContentMapper.insertMultiple(bookContentList);
}
}
}
@Override
public List<Book> queryNeedUpdateBook(Date startDate, int limit) {
return bookMapper.queryNeedUpdateBook(startDate, limit);
}
@Override
public Map<Integer, BookIndex> queryExistBookIndexMap(Long bookId) {
List<BookIndex> bookIndexes = bookIndexMapper.selectMany(select(BookIndexDynamicSqlSupport.id, BookIndexDynamicSqlSupport.indexNum, BookIndexDynamicSqlSupport.indexName)
.from(BookIndexDynamicSqlSupport.bookIndex)
.where(BookIndexDynamicSqlSupport.bookId,isEqualTo(bookId))
.build()
.render(RenderingStrategies.MYBATIS3));
if (bookIndexes.size() > 0) {
return bookIndexes.stream().collect(Collectors.toMap(BookIndex::getIndexNum, Function.identity()));
}
return new HashMap<>(0);
}
@Transactional(rollbackFor = Exception.class)
@Override
public void updateBookAndIndexAndContent(Book book, List<BookIndex> bookIndexList, List<BookContent> bookContentList) {
Date currentDate = new Date();
for (int i = 0; i < bookIndexList.size(); i++) {
BookIndex bookIndex = bookIndexList.get(i);
BookContent bookContent = bookContentList.get(i);
//Insert or update the catalog entry
Integer wordCount = bookContent.getContent().length();
bookIndex.setWordCount(wordCount);
bookIndex.setUpdateTime(currentDate);
if (bookIndex.getId() == null) {
//insert
bookIndex.setBookId(book.getId());
Long indexId = new IdWorker().nextId();
bookIndex.setId(indexId);
bookIndex.setCreateTime(currentDate);
bookIndexMapper.insertSelective(bookIndex);
} else {
//update
bookIndexMapper.updateByPrimaryKeySelective(bookIndex);
}
if (bookContent.getIndexId() == null) {
//insert
bookContent.setIndexId(bookIndex.getId());
bookContentMapper.insertSelective(bookContent);
} else {
//update
bookContentMapper.update(update(BookContentDynamicSqlSupport.bookContent)
.set(BookContentDynamicSqlSupport.content)
.equalTo(bookContent.getContent())
.where(BookContentDynamicSqlSupport.indexId, isEqualTo(bookContent.getIndexId()))
.build()
.render(RenderingStrategies.MYBATIS3));
}
}
//Update the main book row
book.setWordCount(queryTotalWordCount(book.getId()));
BookIndex lastIndex = queryLastIndex(book.getId());
book.setLastIndexId(lastIndex.getId());
book.setLastIndexName(lastIndex.getIndexName());
book.setLastIndexUpdateTime(lastIndex.getUpdateTime());
book.setUpdateTime(currentDate);
//leave name and author null so the selective update never overwrites them
book.setBookName(null);
book.setAuthorName(null);
bookMapper.updateByPrimaryKeySelective(book);
}
@Override
public void updateCrawlLastTime(Long bookId) {
Book book = new Book();
book.setId(bookId);
book.setCrawlLastTime(new Date());
bookMapper.updateByPrimaryKeySelective(book);
}
/**
* Query the latest chapter
* */
private BookIndex queryLastIndex(Long bookId) {
return bookIndexMapper.queryLastIndex(bookId);
}
/**
* Query the book's total word count
* */
private Integer queryTotalWordCount(Long bookId) {
return bookMapper.queryTotalWordCount(bookId);
}
}
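
One caveat on queryExistBookIndexMap above: Collectors.toMap throws IllegalStateException on duplicate keys, so it implicitly assumes chapter index numbers are unique per book. If that assumption could ever break, a duplicate-tolerant variant of the same stream would be:

Map<Integer, BookIndex> existing = bookIndexes.stream()
.collect(Collectors.toMap(
BookIndex::getIndexNum,
Function.identity(),
(first, second) -> second)); //keep the last row when index numbers collide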

View File

@@ -0,0 +1,245 @@
package com.java2nb.novel.service.impl;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.github.pagehelper.PageHelper;
import com.java2nb.novel.core.cache.CacheKey;
import com.java2nb.novel.core.cache.CacheService;
import com.java2nb.novel.core.crawl.CrawlParser;
import com.java2nb.novel.core.crawl.RuleBean;
import com.java2nb.novel.core.utils.IdWorker;
import com.java2nb.novel.core.utils.SpringUtil;
import com.java2nb.novel.core.utils.ThreadUtil;
import com.java2nb.novel.entity.Book;
import com.java2nb.novel.entity.BookContent;
import com.java2nb.novel.entity.BookIndex;
import com.java2nb.novel.entity.CrawlSource;
import com.java2nb.novel.mapper.*;
import com.java2nb.novel.service.BookService;
import com.java2nb.novel.service.CrawlService;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.mybatis.dynamic.sql.render.RenderingStrategies;
import org.mybatis.dynamic.sql.select.render.SelectStatementProvider;
import org.springframework.stereotype.Service;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import static com.java2nb.novel.core.utils.HttpUtil.getByHttpClient;
import static com.java2nb.novel.mapper.CrawlSourceDynamicSqlSupport.*;
import static org.mybatis.dynamic.sql.SqlBuilder.isEqualTo;
import static org.mybatis.dynamic.sql.select.SelectDSL.select;
/**
* @author Administrator
*/
@Service
@RequiredArgsConstructor
@Slf4j
public class CrawlServiceImpl implements CrawlService {
private final CrawlSourceMapper crawlSourceMapper;
private final BookService bookService;
private final CacheService cacheService;
@Override
public void addCrawlSource(CrawlSource source) {
Date currentDate = new Date();
source.setCreateTime(currentDate);
source.setUpdateTime(currentDate);
crawlSourceMapper.insertSelective(source);
}
@Override
public List<CrawlSource> listCrawlByPage(int page, int pageSize) {
PageHelper.startPage(page, pageSize);
SelectStatementProvider render = select(id, sourceName, sourceStatus, createTime, updateTime)
.from(crawlSource)
.orderBy(updateTime)
.build()
.render(RenderingStrategies.MYBATIS3);
return crawlSourceMapper.selectMany(render);
}
@SneakyThrows
@Override
public void openOrCloseCrawl(Integer sourceId, Byte sourceStatus) {
//Decide whether this is a start or a stop. For a stop, update the database status,
//then fetch the crawler's set of running threads and stop them all.
//For a start, first check the status stored in the database: if the source is still
//running, ignore the request; otherwise update the status and launch threads that
//crawl book data, registering them in runningCrawlThread.
if (sourceStatus == (byte) 0) {
//Stop: update the database status, then fetch the crawler's running threads and stop them all
SpringUtil.getBean(CrawlService.class).updateCrawlSourceStatus(sourceId, sourceStatus);
Set<Long> runningCrawlThreadId = (Set<Long>) cacheService.getObject(CacheKey.RUNNING_CRAWL_THREAD_KEY_PREFIX + sourceId);
if (runningCrawlThreadId != null) {
for (Long threadId : runningCrawlThreadId) {
Thread thread = ThreadUtil.findThread(threadId);
if (thread != null && thread.isAlive()) {
thread.interrupt();
}
}
}
} else {
//Start
//Query the crawl source's status and rules
CrawlSource source = queryCrawlSource(sourceId);
Byte realSourceStatus = source.getSourceStatus();
if (realSourceStatus == (byte) 0) {
//The source is currently stopped; update the database status and launch crawl threads, registering them in runningCrawlThread
SpringUtil.getBean(CrawlService.class).updateCrawlSourceStatus(sourceId, sourceStatus);
RuleBean ruleBean = new ObjectMapper().readValue(source.getCrawlRule(), RuleBean.class);
Set<Long> threadIds = new HashSet<>();
//Start one crawl parsing task per category (category IDs 1 to 7)
for (int i = 1; i < 8; i++) {
final int catId = i;
Thread thread = new Thread(() -> {
parseBookList(catId, ruleBean, sourceId);
});
thread.start();
//register the thread in the monitoring cache
threadIds.add(thread.getId());
}
cacheService.setObject(CacheKey.RUNNING_CRAWL_THREAD_KEY_PREFIX + sourceId, threadIds);
}
}
}
@Override
public CrawlSource queryCrawlSource(Integer sourceId) {
SelectStatementProvider render = select(CrawlSourceDynamicSqlSupport.sourceStatus, CrawlSourceDynamicSqlSupport.crawlRule)
.from(crawlSource)
.where(id, isEqualTo(sourceId))
.build()
.render(RenderingStrategies.MYBATIS3);
return crawlSourceMapper.selectMany(render).get(0);
}
/**
* Parse a category's book list
*/
@Override
public void parseBookList(int catId, RuleBean ruleBean, Integer sourceId) {
//current page number, starting at 1
int page = 1;
int totalPage = page;
while (page <= totalPage) {
try {
//Build the category list URL
String catBookListUrl = ruleBean.getBookListUrl()
.replace("{catId}", ruleBean.getCatIdRule().get("catId" + catId))
.replace("{page}", page + "");
String bookListHtml = getByHttpClient(catBookListUrl);
if (bookListHtml != null) {
Pattern bookIdPatten = Pattern.compile(ruleBean.getBookIdPatten());
Matcher bookIdMatcher = bookIdPatten.matcher(bookListHtml);
boolean isFindBookId = bookIdMatcher.find();
while (isFindBookId) {
try {
String bookId = bookIdMatcher.group(1);
Book book = CrawlParser.parseBook(ruleBean, bookId);
//Only new books are inserted here; check whether this book already exists
boolean isExist = bookService.queryIsExistByBookNameAndAuthorName(book.getBookName(), book.getAuthorName());
//If the book does not exist it can be parsed and saved; flag it as being saved so the same book is not saved again within 30 minutes
if (!isExist && StringUtils.isBlank(cacheService.get(CacheKey.NEW_BOOK_IN_SAVE + book.getBookName() + "-" + book.getAuthorName()))) {
//Book not present; safe to save
cacheService.set(CacheKey.NEW_BOOK_IN_SAVE + book.getBookName() + "-" + book.getAuthorName(), "true", 60 * 30);
book.setCatId(catId);
//Look up the category name by category ID
book.setCatName(bookService.queryCatNameByCatId(catId));
if (catId == 7) {
//female-oriented channel
book.setWorkDirection((byte) 1);
} else {
//male-oriented channel
book.setWorkDirection((byte) 0);
}
book.setCrawlBookId(bookId);
book.setCrawlSourceId(sourceId);
book.setCrawlLastTime(new Date());
book.setId(new IdWorker().nextId());
//Parse the chapter catalog
Map<Integer, List> indexAndContentList = CrawlParser.parseBookIndexAndContent(bookId,book, ruleBean, new HashMap<>(0));
bookService.saveBookAndIndexAndContent(book, (List<BookIndex>) indexAndContentList.get(CrawlParser.BOOK_INDEX_LIST_KEY), (List<BookContent>) indexAndContentList.get(CrawlParser.BOOK_CONTENT_LIST_KEY));
} else {
//只更新书籍的爬虫相关字段
bookService.updateCrawlProperties(sourceId, bookId);
}
} catch (Exception e) {
log.error(e.getMessage(), e);
}
isFindBookId = bookIdMatcher.find();
}
Pattern totalPagePatten = Pattern.compile(ruleBean.getTotalPagePatten());
Matcher totalPageMatcher = totalPagePatten.matcher(bookListHtml);
boolean isFindTotalPage = totalPageMatcher.find();
if (isFindTotalPage) {
totalPage = Integer.parseInt(totalPageMatcher.group(1));
}
}
} catch (Exception e) {
log.error(e.getMessage(), e);
}
page += 1;
}
}
@Override
public void updateCrawlSourceStatus(Integer sourceId, Byte sourceStatus) {
CrawlSource source = new CrawlSource();
source.setId(sourceId);
source.setSourceStatus(sourceStatus);
crawlSourceMapper.updateByPrimaryKeySelective(source);
}
@Override
public List<CrawlSource> queryCrawlSourceByStatus(Byte sourceStatus) {
SelectStatementProvider render = select(CrawlSourceDynamicSqlSupport.id, CrawlSourceDynamicSqlSupport.sourceStatus, CrawlSourceDynamicSqlSupport.crawlRule)
.from(crawlSource)
.where(CrawlSourceDynamicSqlSupport.sourceStatus, isEqualTo(sourceStatus))
.build()
.render(RenderingStrategies.MYBATIS3);
return crawlSourceMapper.selectMany(render);
}
}
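
For reference, a minimal sketch of the placeholder substitution parseBookList performs; the URL and rule values are invented, only the {catId} and {page} tokens come from the code above:

String bookListUrl = "https://example.com/class/{catId}/{page}.html"; //hypothetical ruleBean.getBookListUrl()
String catBookListUrl = bookListUrl
.replace("{catId}", "1") //ruleBean.getCatIdRule().get("catId1")
.replace("{page}", String.valueOf(3)); //current page counter
// -> "https://example.com/class/1/3.html"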

View File

@@ -0,0 +1,28 @@
package com.java2nb.novel.vo;
import com.fasterxml.jackson.annotation.JsonFormat;
import com.java2nb.novel.entity.CrawlSource;
import lombok.Data;
import java.util.Date;
/**
* @author Administrator
*/
@Data
public class CrawlSourceVO extends CrawlSource {
@JsonFormat(timezone = "GMT+8", pattern = "yyyy-MM-dd HH:mm")
private Date createTime;
@JsonFormat(timezone = "GMT+8", pattern = "yyyy-MM-dd HH:mm")
private Date updateTime;
@Override
public String toString() {
return super.toString();
}
}
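
A short sketch of the serialization behavior the @JsonFormat annotations above produce; the values shown are hypothetical:

ObjectMapper mapper = new ObjectMapper();
CrawlSourceVO vo = new CrawlSourceVO();
vo.setUpdateTime(new Date()); //setter generated by @Data
String json = mapper.writeValueAsString(vo); //throws JsonProcessingException
// updateTime renders in GMT+8 as, e.g., "2020-05-02 15:05"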