diff --git a/novel-common/src/main/java/com/java2nb/novel/core/cache/CacheKey.java b/novel-common/src/main/java/com/java2nb/novel/core/cache/CacheKey.java index 75a585a..750c53f 100644 --- a/novel-common/src/main/java/com/java2nb/novel/core/cache/CacheKey.java +++ b/novel-common/src/main/java/com/java2nb/novel/core/cache/CacheKey.java @@ -65,4 +65,8 @@ public interface CacheKey { * 累积的小说点击量 * */ String BOOK_ADD_VISIT_COUNT = "bookAddVisitCount"; -} \ No newline at end of file + /** + * 测试爬虫规则缓存 + */ + String BOOK_TEST_PARSE = "testParse"; +} diff --git a/novel-crawl/src/main/java/com/java2nb/novel/controller/CrawlController.java b/novel-crawl/src/main/java/com/java2nb/novel/controller/CrawlController.java index 80c9c16..1c4cf42 100644 --- a/novel-crawl/src/main/java/com/java2nb/novel/controller/CrawlController.java +++ b/novel-crawl/src/main/java/com/java2nb/novel/controller/CrawlController.java @@ -1,13 +1,26 @@ package com.java2nb.novel.controller; +import com.fasterxml.jackson.databind.ObjectMapper; import com.java2nb.novel.core.bean.PageBean; import com.java2nb.novel.core.bean.ResultBean; +import com.java2nb.novel.core.cache.CacheKey; +import com.java2nb.novel.core.cache.CacheService; +import com.java2nb.novel.core.crawl.CrawlParser; +import com.java2nb.novel.core.crawl.RuleBean; +import com.java2nb.novel.core.utils.HttpUtil; +import com.java2nb.novel.entity.BookIndex; import com.java2nb.novel.entity.CrawlSingleTask; import com.java2nb.novel.entity.CrawlSource; import com.java2nb.novel.service.CrawlService; +import com.java2nb.novel.utils.Constants; import lombok.RequiredArgsConstructor; import org.springframework.web.bind.annotation.*; +import java.util.HashMap; +import java.util.Map; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + /** * @author Administrator */ @@ -18,7 +31,7 @@ public class CrawlController { private final CrawlService crawlService; - + private final CacheService cacheService; /** * 新增爬虫源 * */ @@ -38,7 +51,70 @@ public 
class CrawlController {
         return ResultBean.ok(crawlService.listCrawlByPage(page,pageSize));
     }
 
+    /**
+     * Get a crawl source by id.
+     * */
+    @GetMapping("getCrawlSource/{id}")
+    public ResultBean getCrawlSource(@PathVariable("id") Integer id){
+        CrawlSource crawlSource= crawlService.getCrawlSource(id);
+        return ResultBean.ok(crawlSource);
+    }
+
+    /**
+     * Tests a crawl rule: matches the regex {@code rule} against the page at {@code url}, reusing a cached copy when allowed.
+     * @param rule regular expression under test; capture group 1 is returned when the rule has one
+     * @param url absolute http(s) address of the page to match against
+     * @param isRefresh "1" forces a fresh download instead of reusing the cached page
+     * @return result map holding the match flag and matched text, or an entry describing the failure
+     */
+    @PostMapping("testParse")
+    public ResultBean testParse(String rule,String url,String isRefresh){
+        Map<String, Object> resultMap=new HashMap<>();
+        // A missing url is rejected like any non-http(s) one instead of failing with an NPE.
+        if(url==null||!(url.startsWith("https://")||url.startsWith("http://"))){
+            resultMap.put("html","url is null");
+            return ResultBean.ok(resultMap);
+        }
+        if(rule==null){
+            resultMap.put("msg","rule is null");
+            return ResultBean.ok(resultMap);
+        }
+        Pattern pattern;
+        try {
+            pattern = Pattern.compile(rule);
+        } catch (java.util.regex.PatternSyntaxException e) {
+            resultMap.put("msg","invalid rule: "+e.getDescription());
+            return ResultBean.ok(resultMap);
+        }
+        // NOTE(review): url is caller-supplied and fetched by the server; keep this endpoint restricted to trusted admins.
+        String cacheKey=CacheKey.BOOK_TEST_PARSE+url;
+        String html="1".equals(isRefresh)?null:(String)cacheService.getObject(cacheKey);
+        if(html==null){
+            html=HttpUtil.getByHttpClientWithChrome(url);
+            if(html==null){
+                resultMap.put("msg","html is null");
+                return ResultBean.ok(resultMap);
+            }
+            cacheService.setObject(cacheKey,html,60*10);
+        }
+        Matcher matcher=pattern.matcher(html);
+        boolean isFind=matcher.find();
+        resultMap.put("是否匹配",isFind);
+        if(isFind){
+            // A rule without a capture group yields the whole match; group(1) would throw.
+            resultMap.put("匹配结果",matcher.groupCount()>0?matcher.group(1):matcher.group());
+        }
+        return ResultBean.ok(resultMap);
+    }
 
+    /**
+     * Update a crawl source.
+     * */
+    @PostMapping("updateCrawlSource")
+    public ResultBean updateCrawlSource(CrawlSource source){
+        crawlService.updateCrawlSource(source);
+        return ResultBean.ok();
+    }
     /**
      * 开启或停止爬虫
      * */
diff --git a/novel-crawl/src/main/java/com/java2nb/novel/core/config/SecurityConfiguration.java b/novel-crawl/src/main/java/com/java2nb/novel/core/config/SecurityConfiguration.java
index 6b7dead..995705c 100644
--- 
a/novel-crawl/src/main/java/com/java2nb/novel/core/config/SecurityConfiguration.java
+++ b/novel-crawl/src/main/java/com/java2nb/novel/core/config/SecurityConfiguration.java
@@ -50,7 +50,7 @@ public class SecurityConfiguration extends WebSecurityConfigurerAdapter {
     protected void configure(HttpSecurity http) throws Exception {
         http.csrf().disable()//禁用了 csrf 功能
                 .authorizeRequests()//限定签名成功的请求
-                .antMatchers("/**").hasRole("ADMIN")
+                .antMatchers("/**").hasRole("ADMIN")//every crawler admin endpoint stays behind the ADMIN role
                 .anyRequest().permitAll()//其他没有限定的请求,允许访问
                 .and().anonymous()//对于没有配置权限的其他请求允许匿名访问
                 .and().formLogin()//使用 spring security 默认登录页面
diff --git a/novel-crawl/src/main/java/com/java2nb/novel/service/CrawlService.java b/novel-crawl/src/main/java/com/java2nb/novel/service/CrawlService.java
index d8e9821..a3b2365 100644
--- a/novel-crawl/src/main/java/com/java2nb/novel/service/CrawlService.java
+++ b/novel-crawl/src/main/java/com/java2nb/novel/service/CrawlService.java
@@ -1,6 +1,7 @@
 package com.java2nb.novel.service;
 
 import com.java2nb.novel.core.bean.PageBean;
+import com.java2nb.novel.core.bean.ResultBean;
 import com.java2nb.novel.core.crawl.RuleBean;
 import com.java2nb.novel.entity.CrawlSingleTask;
 import com.java2nb.novel.entity.CrawlSource;
@@ -18,7 +19,11 @@ public interface CrawlService {
      * */
     void addCrawlSource(CrawlSource source);
 
-
+    /**
+     * Update a crawl source.
+     * @param source crawl source to update
+     */
+    void updateCrawlSource(CrawlSource source);
     /**
      * 爬虫源分页列表
      * @param page 当前页码
@@ -106,4 +111,11 @@ public interface CrawlService {
      * @param status 采集状态
      * */
     void updateCrawlSingleTask(CrawlSingleTask task, Byte status);
+
+    /**
+     * Get the details of a crawl source (its collection rule).
+     * @param id crawl source id
+     * @return the crawl source
+     */
+    CrawlSource getCrawlSource(Integer id);
 }
diff --git a/novel-crawl/src/main/java/com/java2nb/novel/service/impl/CrawlServiceImpl.java b/novel-crawl/src/main/java/com/java2nb/novel/service/impl/CrawlServiceImpl.java
index c9492cc..0421f12 100644
--- a/novel-crawl/src/main/java/com/java2nb/novel/service/impl/CrawlServiceImpl.java
+++ 
b/novel-crawl/src/main/java/com/java2nb/novel/service/impl/CrawlServiceImpl.java
@@ -39,6 +39,7 @@ import java.util.regex.Pattern;
 
 import static com.java2nb.novel.core.utils.HttpUtil.getByHttpClientWithChrome;
 import static com.java2nb.novel.mapper.CrawlSourceDynamicSqlSupport.*;
+import static com.java2nb.novel.mapper.CrawlSourceDynamicSqlSupport.id;
 import static org.mybatis.dynamic.sql.SqlBuilder.isEqualTo;
 import static org.mybatis.dynamic.sql.select.SelectDSL.select;
 
@@ -69,7 +70,30 @@ public class CrawlServiceImpl implements CrawlService {
         crawlSourceMapper.insertSelective(source);
 
     }
-
+    /**
+     * Updates the name and crawl rule of a stored crawl source; a source whose status is 1 is closed first.
+     * Nothing happens when the id is null or matches no row.
+     * @param source supplies the id of the row plus the new source name and crawl rule
+     */
+    @Override
+    public void updateCrawlSource(CrawlSource source) {
+        if(source.getId()!=null){
+            Optional<CrawlSource> opt=crawlSourceMapper.selectByPrimaryKey(source.getId());
+            if(opt.isPresent()) {
+                CrawlSource crawlSource =opt.get();
+                if (crawlSource.getSourceStatus() == (byte) 1) {
+                    // Close the crawler before its rule changes.
+                    openOrCloseCrawl(crawlSource.getId(),(byte)0);
+                    // The row loaded above still says 1; clear it so the update below cannot write that back.
+                    crawlSource.setSourceStatus((byte) 0);
+                }
+                crawlSource.setUpdateTime(new Date());
+                crawlSource.setCrawlRule(source.getCrawlRule());
+                crawlSource.setSourceName(source.getSourceName());
+                crawlSourceMapper.updateByPrimaryKey(crawlSource);
+            }
+        }
+    }
     @Override
     public PageBean listCrawlByPage(int page, int pageSize) {
         PageHelper.startPage(page, pageSize);
@@ -138,12 +162,17 @@ public class CrawlServiceImpl implements CrawlService {
 
     @Override
     public CrawlSource queryCrawlSource(Integer sourceId) {
-        SelectStatementProvider render = select(CrawlSourceDynamicSqlSupport.sourceStatus, CrawlSourceDynamicSqlSupport.crawlRule)
+        // Select the descriptive columns too, not only status and rule.
+        SelectStatementProvider render = select(id, sourceName, sourceStatus, createTime, updateTime,crawlRule)
                 .from(crawlSource)
                 .where(id, isEqualTo(sourceId))
                 .build()
                 .render(RenderingStrategies.MYBATIS3);
-        return crawlSourceMapper.selectMany(render).get(0);
+        List<CrawlSource> list= crawlSourceMapper.selectMany(render);
+        if(list!=null&&!list.isEmpty()){
+            return list.get(0);
+        }
+        return null;
     }
 
     @Override
@@ -205,6 +234,16 @@ public class 
CrawlServiceImpl implements CrawlService {
     }
 
 
+    /**
+     * Loads a crawl source by primary key.
+     * @param id primary key of the crawl source
+     * @return the stored crawl source, or {@code null} when no row has that id
+     */
+    @Override
+    public CrawlSource getCrawlSource(Integer id) {
+        return crawlSourceMapper.selectByPrimaryKey(id).orElse(null);
+    }
+
     /**
      * 解析分类列表
      */
diff --git a/novel-crawl/src/main/resources/templates/crawl/crawlSingleTask_add.html b/novel-crawl/src/main/resources/templates/crawl/crawlSingleTask_add.html
index b247301..f478835 100644
--- a/novel-crawl/src/main/resources/templates/crawl/crawlSingleTask_add.html
+++ b/novel-crawl/src/main/resources/templates/crawl/crawlSingleTask_add.html
@@ -30,6 +30,7 @@
diff --git a/novel-crawl/src/main/resources/templates/crawl/crawlSingleTask_list.html b/novel-crawl/src/main/resources/templates/crawl/crawlSingleTask_list.html
index 786af82..a64b5d0 100644
--- a/novel-crawl/src/main/resources/templates/crawl/crawlSingleTask_list.html
+++ b/novel-crawl/src/main/resources/templates/crawl/crawlSingleTask_list.html
@@ -30,6 +30,7 @@
diff --git a/novel-crawl/src/main/resources/templates/crawl/crawlSource_add.html b/novel-crawl/src/main/resources/templates/crawl/crawlSource_add.html
index 02833f4..7ee95d1 100644
--- a/novel-crawl/src/main/resources/templates/crawl/crawlSource_add.html
+++ b/novel-crawl/src/main/resources/templates/crawl/crawlSource_add.html
@@ -30,6 +30,7 @@
diff --git a/novel-crawl/src/main/resources/templates/crawl/crawlSource_list.html b/novel-crawl/src/main/resources/templates/crawl/crawlSource_list.html
index d054411..1a7dc3f 100644
--- a/novel-crawl/src/main/resources/templates/crawl/crawlSource_list.html
+++ b/novel-crawl/src/main/resources/templates/crawl/crawlSource_list.html
@@ -29,6 +29,7 @@
@@ -38,7 +39,7 @@

爬虫源列表

- +
@@ -119,8 +120,8 @@ + + + + + diff --git a/novel-crawl/src/main/resources/templates/crawl/crawlSource_update.html b/novel-crawl/src/main/resources/templates/crawl/crawlSource_update.html new file mode 100644 index 0000000..7d2443a --- /dev/null +++ b/novel-crawl/src/main/resources/templates/crawl/crawlSource_update.html @@ -0,0 +1,522 @@ + + + + + + + 爬虫管理系统-小说精品屋 + + + + + + +
+ +
+ +
+
+ +
+
+ +
+
+ +
+ +
+ +
+ + + +
+
+
+

爬虫源信息填写(示例均为顶点小说网:dingdiann.com)

+
    +
  • + 示例:新顶点小说网 +
  • + + 示例:http://m.xdingdiann.com/sort/{catId}/{page}.html ({catId}代表分类ID,{page}代表分页页码) +
  • + 示例:1 +
  • + 示例:2 +
  • + 示例:3 +
  • + 示例:4 +
  • + 示例:5 +
  • + 示例:6 +
  • + 示例:7 +
  • + 示例:href="/ddk(\d+)/" +
  • + value="(\d+)/\d+" +
  • + value="\d+/(\d+)" +
  • + http://m.xdingdiann.com/ddk{bookId} (bookId代表小说ID) +
  • + 示例:<p class="title">([^/]+)</p> +
  • + 示例:作者:([^/]+)< +
  • + 示例:<img src="([^>]+)"\s+onerror="this.src= +
  • + 可空,适用于图片路径为相对路径的源站,加上小说图片路径,则为完整的可访问的图片路径 +
  • + 示例:状态:([^/]+)</li> +
  • + 示例:连载 +
  • + 示例:完结 +
  • + 示例:<div\s+class="score">(\d+\.\d+)分</div> +
  • + 示例: +
  • + 示例:<p class="review"> +
  • + 示例:</p> +
  • +
  • + 示例:更新:(\d+-\d+-\d+\s\d+:\d+:\d+)</a> +
  • + 示例:yyyy-MM-dd HH:mm:ss +
  • + 示例:http://m.xdingdiann.com/ddk{bookId}/all.html (bookId代表小说ID) +
  • + 可空,适用于最新章节列表和全部章节列表在同一个页面的源站 +
  • + 示例:<a\s+style=""\s+href="/ddk\d+/(\d+)\.html">[^/]+</a> +
  • + 示例:<a\s+style=""\s+href="/ddk\d+/\d+\.html">([^/]+)</a> +
  • + 示例:http://m.xdingdiann.com/ddk{bookId}/{indexId}.html + (bookId代表小说ID,{indexId}代表目录ID) +
  • + 示例:id="content"> +
  • + 示例:<script> +
  • + +
  • +
+
+
+
+ +
+
+
+
+ + + + + + + + + diff --git a/pom.xml b/pom.xml index 4b46641..9561928 100644 --- a/pom.xml +++ b/pom.xml @@ -30,7 +30,7 @@ 1.8 true 8.0.11 - 1.3.2 + 2.1.4 1.4.0 1.1.4 1.2.5 @@ -112,4 +112,4 @@ - \ No newline at end of file +