diff --git a/novel-admin/pom.xml b/novel-admin/pom.xml index c2cd133..2e56b74 100644 --- a/novel-admin/pom.xml +++ b/novel-admin/pom.xml @@ -106,7 +106,7 @@ org.apache.shiro shiro-spring - 1.3.2 + 1.7.0 diff --git a/novel-common/src/main/java/com/java2nb/novel/core/cache/CacheKey.java b/novel-common/src/main/java/com/java2nb/novel/core/cache/CacheKey.java index 75a585a..750c53f 100644 --- a/novel-common/src/main/java/com/java2nb/novel/core/cache/CacheKey.java +++ b/novel-common/src/main/java/com/java2nb/novel/core/cache/CacheKey.java @@ -65,4 +65,8 @@ public interface CacheKey { * 累积的小说点击量 * */ String BOOK_ADD_VISIT_COUNT = "bookAddVisitCount"; -} \ No newline at end of file + /** + * 测试爬虫规则缓存 + */ + String BOOK_TEST_PARSE = "testParse"; +} diff --git a/novel-crawl/src/main/java/com/java2nb/novel/controller/CrawlController.java b/novel-crawl/src/main/java/com/java2nb/novel/controller/CrawlController.java index cefdb30..6b312fc 100644 --- a/novel-crawl/src/main/java/com/java2nb/novel/controller/CrawlController.java +++ b/novel-crawl/src/main/java/com/java2nb/novel/controller/CrawlController.java @@ -1,5 +1,8 @@ package com.java2nb.novel.controller; +import com.java2nb.novel.core.cache.CacheKey; +import com.java2nb.novel.core.cache.CacheService; +import com.java2nb.novel.core.utils.HttpUtil; import io.github.xxyopen.model.page.PageBean; import com.java2nb.novel.entity.CrawlSingleTask; @@ -9,6 +12,11 @@ import io.github.xxyopen.model.resp.RestResult; import lombok.RequiredArgsConstructor; import org.springframework.web.bind.annotation.*; +import java.util.HashMap; +import java.util.Map; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + /** * @author Administrator */ @@ -19,7 +27,7 @@ public class CrawlController { private final CrawlService crawlService; - + private final CacheService cacheService; /** * 新增爬虫源 * */ @@ -39,7 +47,70 @@ public class CrawlController { return RestResult.ok(crawlService.listCrawlByPage(page,pageSize)); } + /** + * 获取爬虫源 + * */ + @GetMapping("getCrawlSource/{id}") + public RestResult getCrawlSource(@PathVariable("id") Integer id){ + CrawlSource crawlSource= crawlService.getCrawlSource(id); + return RestResult.ok(crawlSource); + } + + /** + * 测试规则 + * @param rule + * @param url + * @param isRefresh + * @return + */ + @PostMapping("testParse") + public RestResult testParse(String rule,String url,String isRefresh){ + + Map resultMap=new HashMap<>(); + String html =null; + if(url.startsWith("https://")||url.startsWith("http://")){ + String refreshCache="1"; + if(!refreshCache.equals(isRefresh)) { + Object cache = cacheService.getObject(CacheKey.BOOK_TEST_PARSE + url); + if (cache == null) { + isRefresh="1"; + }else { + html = (String) cache; + } + } + if(refreshCache.equals(isRefresh)){ + html = HttpUtil.getByHttpClientWithChrome(url); + if (html != null) { + cacheService.setObject(CacheKey.BOOK_TEST_PARSE + url, html, 60 * 10); + }else{ + resultMap.put("msg","html is null"); + return RestResult.ok(resultMap); + } + } + }else{ + resultMap.put("html","url is null"); + return RestResult.ok(resultMap); + } + Pattern pattern = Pattern.compile(rule); + Matcher matcher = pattern.matcher(html); + boolean isFind = matcher.find(); + resultMap.put("是否匹配",isFind); + if(isFind){ + resultMap.put("匹配结果",matcher.group(1)); + } + // resultMap.put("url",url); + return RestResult.ok(resultMap); + } + /** + * 修改爬虫源 + * */ + @PostMapping("updateCrawlSource") + public RestResult updateCrawlSource(CrawlSource source) { + crawlService.updateCrawlSource(source); + return RestResult.ok(); + + } /** * 开启或停止爬虫 * */ diff --git a/novel-crawl/src/main/java/com/java2nb/novel/service/CrawlService.java b/novel-crawl/src/main/java/com/java2nb/novel/service/CrawlService.java index 38c6f4b..fdb36a5 100644 --- a/novel-crawl/src/main/java/com/java2nb/novel/service/CrawlService.java +++ b/novel-crawl/src/main/java/com/java2nb/novel/service/CrawlService.java @@ -18,7 +18,11 @@ public interface CrawlService { * */ void addCrawlSource(CrawlSource source); - + /** + * 修改爬虫源 + * @param source + */ + void updateCrawlSource(CrawlSource source); /** * 爬虫源分页列表 * @param page 当前页码 @@ -106,4 +110,11 @@ public interface CrawlService { * @param status 采集状态 * */ void updateCrawlSingleTask(CrawlSingleTask task, Byte status); + + /** + * 获取采集规则详细 + * @param id + * @return + */ + CrawlSource getCrawlSource(Integer id); } diff --git a/novel-crawl/src/main/java/com/java2nb/novel/service/impl/CrawlServiceImpl.java b/novel-crawl/src/main/java/com/java2nb/novel/service/impl/CrawlServiceImpl.java index 028f44b..9b7b00c 100644 --- a/novel-crawl/src/main/java/com/java2nb/novel/service/impl/CrawlServiceImpl.java +++ b/novel-crawl/src/main/java/com/java2nb/novel/service/impl/CrawlServiceImpl.java @@ -70,7 +70,24 @@ public class CrawlServiceImpl implements CrawlService { crawlSourceMapper.insertSelective(source); } - + @Override + public void updateCrawlSource(CrawlSource source) { + if(source.getId()!=null){ + Optional opt=crawlSourceMapper.selectByPrimaryKey(source.getId()); + if(opt.isPresent()) { + CrawlSource crawlSource =opt.get(); + if (crawlSource.getSourceStatus() == (byte) 1) { + //关闭 + openOrCloseCrawl(crawlSource.getId(),(byte)0); + } + Date currentDate = new Date(); + crawlSource.setUpdateTime(currentDate); + crawlSource.setCrawlRule(source.getCrawlRule()); + crawlSource.setSourceName(source.getSourceName()); + crawlSourceMapper.updateByPrimaryKey(crawlSource); + } + } + } @Override public PageBean listCrawlByPage(int page, int pageSize) { PageHelper.startPage(page, pageSize); @@ -206,6 +223,16 @@ public class CrawlServiceImpl implements CrawlService { } + @Override + public CrawlSource getCrawlSource(Integer id) { + Optional opt=crawlSourceMapper.selectByPrimaryKey(id); + if(opt.isPresent()) { + CrawlSource crawlSource =opt.get(); + return crawlSource; + } + return null; + } + /** * 解析分类列表 */ diff --git a/novel-crawl/src/main/resources/templates/crawl/crawlSingleTask_add.html b/novel-crawl/src/main/resources/templates/crawl/crawlSingleTask_add.html index b247301..f478835 100644 --- a/novel-crawl/src/main/resources/templates/crawl/crawlSingleTask_add.html +++ b/novel-crawl/src/main/resources/templates/crawl/crawlSingleTask_add.html @@ -30,6 +30,7 @@ 爬虫源管理 单本采集管理 + 规则测试 diff --git a/novel-crawl/src/main/resources/templates/crawl/crawlSingleTask_list.html b/novel-crawl/src/main/resources/templates/crawl/crawlSingleTask_list.html index 786af82..a64b5d0 100644 --- a/novel-crawl/src/main/resources/templates/crawl/crawlSingleTask_list.html +++ b/novel-crawl/src/main/resources/templates/crawl/crawlSingleTask_list.html @@ -30,6 +30,7 @@ 爬虫源管理 单本采集管理 + 规则测试 diff --git a/novel-crawl/src/main/resources/templates/crawl/crawlSource_add.html b/novel-crawl/src/main/resources/templates/crawl/crawlSource_add.html index 02833f4..7ee95d1 100644 --- a/novel-crawl/src/main/resources/templates/crawl/crawlSource_add.html +++ b/novel-crawl/src/main/resources/templates/crawl/crawlSource_add.html @@ -30,6 +30,7 @@ 爬虫源管理 单本采集管理 + 规则测试 diff --git a/novel-crawl/src/main/resources/templates/crawl/crawlSource_list.html b/novel-crawl/src/main/resources/templates/crawl/crawlSource_list.html index d054411..272eabd 100644 --- a/novel-crawl/src/main/resources/templates/crawl/crawlSource_list.html +++ b/novel-crawl/src/main/resources/templates/crawl/crawlSource_list.html @@ -29,6 +29,7 @@ 爬虫源管理 单本采集管理 + 规则测试 @@ -38,7 +39,7 @@ 爬虫源列表 - 增加爬虫源 + 增加爬虫源 @@ -119,6 +120,7 @@ + + + + +