diff --git a/novel-admin/pom.xml b/novel-admin/pom.xml index c2cd133..2e56b74 100644 --- a/novel-admin/pom.xml +++ b/novel-admin/pom.xml @@ -106,7 +106,7 @@ org.apache.shiro shiro-spring - 1.3.2 + 1.7.0 diff --git a/novel-common/src/main/java/com/java2nb/novel/core/cache/CacheKey.java b/novel-common/src/main/java/com/java2nb/novel/core/cache/CacheKey.java index 75a585a..750c53f 100644 --- a/novel-common/src/main/java/com/java2nb/novel/core/cache/CacheKey.java +++ b/novel-common/src/main/java/com/java2nb/novel/core/cache/CacheKey.java @@ -65,4 +65,8 @@ public interface CacheKey { * 累积的小说点击量 * */ String BOOK_ADD_VISIT_COUNT = "bookAddVisitCount"; -} \ No newline at end of file + /** + * 测试爬虫规则缓存 + */ + String BOOK_TEST_PARSE = "testParse"; +} diff --git a/novel-crawl/src/main/java/com/java2nb/novel/controller/CrawlController.java b/novel-crawl/src/main/java/com/java2nb/novel/controller/CrawlController.java index cefdb30..6b312fc 100644 --- a/novel-crawl/src/main/java/com/java2nb/novel/controller/CrawlController.java +++ b/novel-crawl/src/main/java/com/java2nb/novel/controller/CrawlController.java @@ -1,5 +1,8 @@ package com.java2nb.novel.controller; +import com.java2nb.novel.core.cache.CacheKey; +import com.java2nb.novel.core.cache.CacheService; +import com.java2nb.novel.core.utils.HttpUtil; import io.github.xxyopen.model.page.PageBean; import com.java2nb.novel.entity.CrawlSingleTask; @@ -9,6 +12,11 @@ import io.github.xxyopen.model.resp.RestResult; import lombok.RequiredArgsConstructor; import org.springframework.web.bind.annotation.*; +import java.util.HashMap; +import java.util.Map; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + /** * @author Administrator */ @@ -19,7 +27,7 @@ public class CrawlController { private final CrawlService crawlService; - + private final CacheService cacheService; /** * 新增爬虫源 * */ @@ -39,7 +47,70 @@ public class CrawlController { return RestResult.ok(crawlService.listCrawlByPage(page,pageSize)); } + /** + * 获取爬虫源 + 
* */ + @GetMapping("getCrawlSource/{id}") + public RestResult getCrawlSource(@PathVariable("id") Integer id){ + CrawlSource crawlSource= crawlService.getCrawlSource(id); + return RestResult.ok(crawlSource); + } + + /** + * 测试规则 + * @param rule + * @param url + * @param isRefresh + * @return + */ + @PostMapping("testParse") + public RestResult testParse(String rule,String url,String isRefresh){ + + Map resultMap=new HashMap<>(); + String html =null; + if(url.startsWith("https://")||url.startsWith("http://")){ + String refreshCache="1"; + if(!refreshCache.equals(isRefresh)) { + Object cache = cacheService.getObject(CacheKey.BOOK_TEST_PARSE + url); + if (cache == null) { + isRefresh="1"; + }else { + html = (String) cache; + } + } + if(refreshCache.equals(isRefresh)){ + html = HttpUtil.getByHttpClientWithChrome(url); + if (html != null) { + cacheService.setObject(CacheKey.BOOK_TEST_PARSE + url, html, 60 * 10); + }else{ + resultMap.put("msg","html is null"); + return RestResult.ok(resultMap); + } + } + }else{ + resultMap.put("html","url is null"); + return RestResult.ok(resultMap); + } + Pattern pattern = Pattern.compile(rule); + Matcher matcher = pattern.matcher(html); + boolean isFind = matcher.find(); + resultMap.put("是否匹配",isFind); + if(isFind){ + resultMap.put("匹配结果",matcher.group(1)); + } + // resultMap.put("url",url); + return RestResult.ok(resultMap); + } + /** + * 修改爬虫源 + * */ + @PostMapping("updateCrawlSource") + public RestResult updateCrawlSource(CrawlSource source) { + crawlService.updateCrawlSource(source); + return RestResult.ok(); + + } /** * 开启或停止爬虫 * */ diff --git a/novel-crawl/src/main/java/com/java2nb/novel/service/CrawlService.java b/novel-crawl/src/main/java/com/java2nb/novel/service/CrawlService.java index 38c6f4b..fdb36a5 100644 --- a/novel-crawl/src/main/java/com/java2nb/novel/service/CrawlService.java +++ b/novel-crawl/src/main/java/com/java2nb/novel/service/CrawlService.java @@ -18,7 +18,11 @@ public interface CrawlService { * */ void 
addCrawlSource(CrawlSource source); - + /** + * 修改爬虫源 + * @param source + */ + void updateCrawlSource(CrawlSource source); /** * 爬虫源分页列表 * @param page 当前页码 @@ -106,4 +110,11 @@ public interface CrawlService { * @param status 采集状态 * */ void updateCrawlSingleTask(CrawlSingleTask task, Byte status); + + /** + * 获取采集规则详细 + * @param id + * @return + */ + CrawlSource getCrawlSource(Integer id); } diff --git a/novel-crawl/src/main/java/com/java2nb/novel/service/impl/CrawlServiceImpl.java b/novel-crawl/src/main/java/com/java2nb/novel/service/impl/CrawlServiceImpl.java index 028f44b..9b7b00c 100644 --- a/novel-crawl/src/main/java/com/java2nb/novel/service/impl/CrawlServiceImpl.java +++ b/novel-crawl/src/main/java/com/java2nb/novel/service/impl/CrawlServiceImpl.java @@ -70,7 +70,24 @@ public class CrawlServiceImpl implements CrawlService { crawlSourceMapper.insertSelective(source); } - + @Override + public void updateCrawlSource(CrawlSource source) { + if(source.getId()!=null){ + Optional opt=crawlSourceMapper.selectByPrimaryKey(source.getId()); + if(opt.isPresent()) { + CrawlSource crawlSource =opt.get(); + if (crawlSource.getSourceStatus() == (byte) 1) { + //关闭 + openOrCloseCrawl(crawlSource.getId(),(byte)0); + } + Date currentDate = new Date(); + crawlSource.setUpdateTime(currentDate); + crawlSource.setCrawlRule(source.getCrawlRule()); + crawlSource.setSourceName(source.getSourceName()); + crawlSourceMapper.updateByPrimaryKey(crawlSource); + } + } + } @Override public PageBean listCrawlByPage(int page, int pageSize) { PageHelper.startPage(page, pageSize); @@ -206,6 +223,16 @@ public class CrawlServiceImpl implements CrawlService { } + @Override + public CrawlSource getCrawlSource(Integer id) { + Optional opt=crawlSourceMapper.selectByPrimaryKey(id); + if(opt.isPresent()) { + CrawlSource crawlSource =opt.get(); + return crawlSource; + } + return null; + } + /** * 解析分类列表 */ diff --git a/novel-crawl/src/main/resources/templates/crawl/crawlSingleTask_add.html 
b/novel-crawl/src/main/resources/templates/crawl/crawlSingleTask_add.html index b247301..f478835 100644 --- a/novel-crawl/src/main/resources/templates/crawl/crawlSingleTask_add.html +++ b/novel-crawl/src/main/resources/templates/crawl/crawlSingleTask_add.html @@ -30,6 +30,7 @@ diff --git a/novel-crawl/src/main/resources/templates/crawl/crawlSingleTask_list.html b/novel-crawl/src/main/resources/templates/crawl/crawlSingleTask_list.html index 786af82..a64b5d0 100644 --- a/novel-crawl/src/main/resources/templates/crawl/crawlSingleTask_list.html +++ b/novel-crawl/src/main/resources/templates/crawl/crawlSingleTask_list.html @@ -30,6 +30,7 @@ diff --git a/novel-crawl/src/main/resources/templates/crawl/crawlSource_add.html b/novel-crawl/src/main/resources/templates/crawl/crawlSource_add.html index 02833f4..7ee95d1 100644 --- a/novel-crawl/src/main/resources/templates/crawl/crawlSource_add.html +++ b/novel-crawl/src/main/resources/templates/crawl/crawlSource_add.html @@ -30,6 +30,7 @@ diff --git a/novel-crawl/src/main/resources/templates/crawl/crawlSource_list.html b/novel-crawl/src/main/resources/templates/crawl/crawlSource_list.html index d054411..272eabd 100644 --- a/novel-crawl/src/main/resources/templates/crawl/crawlSource_list.html +++ b/novel-crawl/src/main/resources/templates/crawl/crawlSource_list.html @@ -29,6 +29,7 @@ @@ -38,7 +39,7 @@

爬虫源列表

- +
@@ -119,6 +120,7 @@ + + + + + diff --git a/novel-crawl/src/main/resources/templates/crawl/crawlSource_update.html b/novel-crawl/src/main/resources/templates/crawl/crawlSource_update.html new file mode 100644 index 0000000..7d2443a --- /dev/null +++ b/novel-crawl/src/main/resources/templates/crawl/crawlSource_update.html @@ -0,0 +1,522 @@ + + + + + + + 爬虫管理系统-小说精品屋 + + + + + + +
+ +
+ +
+
+ +
+
+ +
+
+ +
+ +
+ +
+ + + +
+
+
+

爬虫源信息填写(示例均为新顶点小说网:xdingdiann.com)

+
    +
  • + 示例:新顶点小说网 +
  • + + 示例:http://m.xdingdiann.com/sort/{catId}/{page}.html ({catId}代表分类ID,{page}代表分页页码) +
  • + 示例:1 +
  • + 示例:2 +
  • + 示例:3 +
  • + 示例:4 +
  • + 示例:5 +
  • + 示例:6 +
  • + 示例:7 +
  • + 示例:href="/ddk(\d+)/" +
  • + 示例:value="(\d+)/\d+" +
  • + 示例:value="\d+/(\d+)" +
  • + 示例:http://m.xdingdiann.com/ddk{bookId} ({bookId}代表小说ID) +
  • + 示例:<p class="title">([^/]+)</p> +
  • + 示例:作者:([^/]+)< +
  • + 示例:<img src="([^>]+)"\s+onerror="this.src= +
  • + 可空,适用于图片路径为相对路径的源站,加上小说图片路径,则为完整的可访问的图片路径 +
  • + 示例:状态:([^/]+)</li> +
  • + 示例:连载 +
  • + 示例:完结 +
  • + 示例:<div\s+class="score">(\d+\.\d+)分</div> +
  • + 示例: +
  • + 示例:<p class="review"> +
  • + 示例:</p> +
  • +
  • + 示例:更新:(\d+-\d+-\d+\s\d+:\d+:\d+)</a> +
  • + 示例:yyyy-MM-dd HH:mm:ss +
  • + 示例:http://m.xdingdiann.com/ddk{bookId}/all.html ({bookId}代表小说ID) +
  • + 可空,适用于最新章节列表和全部章节列表在同一个页面的源站 +
  • + 示例:<a\s+style=""\s+href="/ddk\d+/(\d+)\.html">[^/]+</a> +
  • + 示例:<a\s+style=""\s+href="/ddk\d+/\d+\.html">([^/]+)</a> +
  • + 示例:http://m.xdingdiann.com/ddk{bookId}/{indexId}.html + ({bookId}代表小说ID,{indexId}代表目录ID) +
  • + 示例:id="content"> +
  • + 示例:<script> +
  • + +
  • +
+
+
+
+ +
+
+
+
+ + + + + + + + +