feat(crawl): 新增编辑规则和测试规则

合并mstaer分支的pull request #71
This commit is contained in:
xiaoyang 2021-12-24 17:38:23 +08:00
parent fecf03b3f5
commit 96662fcb17
11 changed files with 826 additions and 7 deletions

View File

@ -106,7 +106,7 @@
<dependency>
<groupId>org.apache.shiro</groupId>
<artifactId>shiro-spring</artifactId>
<version>1.3.2</version>
<version>1.7.0</version>
</dependency>
<!-- shiro ehcache -->
<dependency>

View File

@ -65,4 +65,8 @@ public interface CacheKey {
* 累积的小说点击量
* */
String BOOK_ADD_VISIT_COUNT = "bookAddVisitCount";
}
/**
* Cache key prefix for page HTML fetched while testing a crawl rule;
* the tested URL is appended to it to form the full key.
*/
String BOOK_TEST_PARSE = "testParse";
}

View File

@ -1,5 +1,8 @@
package com.java2nb.novel.controller;
import com.java2nb.novel.core.cache.CacheKey;
import com.java2nb.novel.core.cache.CacheService;
import com.java2nb.novel.core.utils.HttpUtil;
import io.github.xxyopen.model.page.PageBean;
import com.java2nb.novel.entity.CrawlSingleTask;
@ -9,6 +12,11 @@ import io.github.xxyopen.model.resp.RestResult;
import lombok.RequiredArgsConstructor;
import org.springframework.web.bind.annotation.*;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* @author Administrator
*/
@ -19,7 +27,7 @@ public class CrawlController {
private final CrawlService crawlService;
private final CacheService cacheService;
/**
* 新增爬虫源
* */
@ -39,7 +47,70 @@ public class CrawlController {
return RestResult.ok(crawlService.listCrawlByPage(page,pageSize));
}
/**
 * Returns a single crawl source looked up by its id.
 *
 * @param id crawl source id taken from the request path
 * @return a success result wrapping whatever the service returns for that id
 */
@GetMapping("getCrawlSource/{id}")
public RestResult<CrawlSource> getCrawlSource(@PathVariable("id") Integer id){
    return RestResult.ok(crawlService.getCrawlSource(id));
}
/**
 * Tests a regular-expression rule against the HTML of a page.
 *
 * <p>The HTML is read from the cache under {@code CacheKey.BOOK_TEST_PARSE + url} unless
 * {@code isRefresh} is {@code "1"} or nothing usable is cached. Freshly downloaded HTML is
 * stored back in the cache with an expiry argument of {@code 60 * 10}
 * (presumably seconds, i.e. ten minutes -- confirm against CacheService).
 *
 * @param rule regular expression to evaluate; its first capture group is reported when it
 *             has one, otherwise the whole match is reported
 * @param url absolute {@code http://} or {@code https://} address of the page to test against
 * @param isRefresh {@code "1"} forces a re-download; any other value (or none) allows the cache
 * @return a success result wrapping a map that holds either a {@code msg} entry explaining why
 *         the test could not run, or the match flag plus, when matched, the matched text
 */
@PostMapping("testParse")
public RestResult<Object> testParse(String rule,String url,String isRefresh){
    Map<String,Object> resultMap=new HashMap<>();

    // Reject a missing or non-HTTP(S) url up front; the original dereferenced a null url
    // and reported this error under the "html" key instead of "msg".
    if(url==null||!(url.startsWith("https://")||url.startsWith("http://"))){
        resultMap.put("msg","url is null");
        return RestResult.ok(resultMap);
    }
    if(rule==null||rule.isEmpty()){
        resultMap.put("msg","rule is null");
        return RestResult.ok(resultMap);
    }

    // Validate the rule before fetching anything so a bad pattern costs no HTTP request.
    // PatternSyntaxException is a subclass of IllegalArgumentException.
    Pattern pattern;
    try{
        pattern=Pattern.compile(rule);
    }catch(IllegalArgumentException e){
        resultMap.put("msg","rule is invalid: "+e.getMessage());
        return RestResult.ok(resultMap);
    }

    // NOTE(review): this endpoint downloads a caller-supplied URL from the server side
    // (SSRF risk). Make sure it is only reachable by trusted administrators.
    String cacheKey=CacheKey.BOOK_TEST_PARSE+url;
    String html=null;
    if(!"1".equals(isRefresh)){
        Object cache=cacheService.getObject(cacheKey);
        if(cache instanceof String){
            html=(String)cache;
        }
    }
    if(html==null){
        // Forced refresh, cache miss, or an unusable cache entry: download the page again.
        html=HttpUtil.getByHttpClientWithChrome(url);
        if(html==null){
            resultMap.put("msg","html is null");
            return RestResult.ok(resultMap);
        }
        cacheService.setObject(cacheKey,html,60*10);
    }

    Matcher matcher=pattern.matcher(html);
    boolean isFind=matcher.find();
    resultMap.put("是否匹配",isFind);
    if(isFind){
        // group(1) throws when the rule declares no capture group; fall back to the whole match.
        resultMap.put("匹配结果",matcher.groupCount()>=1?matcher.group(1):matcher.group());
    }
    return RestResult.ok(resultMap);
}
/**
* Updates an existing crawl source.
*
* @param source crawl source bound from the request parameters; passed to the service as-is
* @return a success result with no payload, returned once the service call completes
* */
@PostMapping("updateCrawlSource")
public RestResult<Void> updateCrawlSource(CrawlSource source) {
crawlService.updateCrawlSource(source);
return RestResult.ok();
}
/**
* 开启或停止爬虫
* */

View File

@ -18,7 +18,11 @@ public interface CrawlService {
* */
void addCrawlSource(CrawlSource source);
/**
* Updates a crawl source.
* @param source the crawl source carrying the id of the record to update and its new values
*/
void updateCrawlSource(CrawlSource source);
/**
* 爬虫源分页列表
* @param page 当前页码
@ -106,4 +110,11 @@ public interface CrawlService {
* @param status 采集状态
* */
void updateCrawlSingleTask(CrawlSingleTask task, Byte status);
/**
* Gets the details of a crawl source (including its crawl rule).
* @param id crawl source id
* @return the crawl source for the given id
*/
CrawlSource getCrawlSource(Integer id);
}

View File

@ -70,7 +70,24 @@ public class CrawlServiceImpl implements CrawlService {
crawlSourceMapper.insertSelective(source);
}
/**
 * Replaces the name and crawl rule of a stored crawl source and refreshes its update time.
 * Does nothing when the given source has no id or no stored record matches that id.
 * A source whose stored status is 1 is closed via {@code openOrCloseCrawl} before the update.
 */
@Override
public void updateCrawlSource(CrawlSource source) {
    if (source.getId() == null) {
        return;
    }
    Optional<CrawlSource> stored = crawlSourceMapper.selectByPrimaryKey(source.getId());
    if (!stored.isPresent()) {
        return;
    }
    CrawlSource existing = stored.get();
    if (existing.getSourceStatus() == (byte) 1) {
        // Close the source before its rule is changed.
        // NOTE(review): `existing` was loaded before this call and still carries status 1;
        // if openOrCloseCrawl persists the new status, the full-row update below may write
        // the old status back -- confirm.
        openOrCloseCrawl(existing.getId(), (byte) 0);
    }
    existing.setUpdateTime(new Date());
    existing.setCrawlRule(source.getCrawlRule());
    existing.setSourceName(source.getSourceName());
    crawlSourceMapper.updateByPrimaryKey(existing);
}
@Override
public PageBean<CrawlSource> listCrawlByPage(int page, int pageSize) {
PageHelper.startPage(page, pageSize);
@ -206,6 +223,16 @@ public class CrawlServiceImpl implements CrawlService {
}
/**
 * Loads a crawl source by primary key.
 *
 * @return the stored crawl source, or {@code null} when no record has that id
 */
@Override
public CrawlSource getCrawlSource(Integer id) {
    return crawlSourceMapper.selectByPrimaryKey(id).orElse(null);
}
/**
* 解析分类列表
*/

View File

@ -30,6 +30,7 @@
<ul class="log_list">
<li><a class="link_1" href="/">爬虫源管理</a></li>
<li><a class="link_1 on" href="/crawl/crawlSingleTask_list.html">单本采集管理</a></li>
<li><a class="link_1" href="/crawl/crawlSource_test.html" target="_blank" >规则测试</a></li>
<!--<li><a class="link_1 " href="/user/userinfo.html">批量小说爬取</a></li>
<li><a class="link_4 " href="/user/favorites.html">单本小说爬取</a></li>-->
</ul>

View File

@ -30,6 +30,7 @@
<ul class="log_list">
<li><a class="link_1" href="/">爬虫源管理</a></li>
<li><a class="link_1 on" href="/crawl/crawlSingleTask_list.html">单本采集管理</a></li>
<li><a class="link_1" href="/crawl/crawlSource_test.html" target="_blank" >规则测试</a></li>
<!-- <li><a class="link_1 " href="/user/userinfo.html">批量小说爬取</a></li>
<li><a class="link_4 " href="/user/favorites.html">单本小说爬取</a></li>-->
</ul>

View File

@ -30,6 +30,7 @@
<ul class="log_list">
<li><a class="link_1 on" href="/">爬虫源管理</a></li>
<li><a class="link_1" href="/crawl/crawlSingleTask_list.html">单本采集管理</a></li>
<li><a class="link_1" href="/crawl/crawlSource_test.html" target="_blank" >规则测试</a></li>
<!--<li><a class="link_1 " href="/user/userinfo.html">批量小说爬取</a></li>
<li><a class="link_4 " href="/user/favorites.html">单本小说爬取</a></li>-->
</ul>

View File

@ -29,6 +29,7 @@
<ul class="log_list">
<li><a class="link_1 on" href="/">爬虫源管理</a></li>
<li><a class="link_1" href="/crawl/crawlSingleTask_list.html">单本采集管理</a></li>
<li><a class="link_1" href="/crawl/crawlSource_test.html" target="_blank" >规则测试</a></li>
<!-- <li><a class="link_1 " href="/user/userinfo.html">批量小说爬取</a></li>
<li><a class="link_4 " href="/user/favorites.html">单本小说爬取</a></li>-->
</ul>
@ -38,7 +39,7 @@
<div class="my_bookshelf">
<div class="title cf">
<h2 class="fl">爬虫源列表</h2>
<div class="fr"><a href="/crawl/crawlSource_add.html" class="btn_red">增加爬虫源</a></div>
<div class="fr"><a href="/crawl/crawlSource_add.html" class="btn_red">增加爬虫源</a>
</div>
<div id="divData" class="updateTable">
@ -119,6 +120,7 @@
<script language="javascript" type="text/javascript">
search(1, 10);
var pageCrawlSourceList=null;
function search(curr, limit) {
$.ajax({
@ -129,6 +131,7 @@
success: function (data) {
if (data.code == 200) {
var crawlSourceList = data.data.list;
pageCrawlSourceList=data.data.list;
if (crawlSourceList.length > 0) {
var crawlSourceListHtml = "";
for(var i=0;i<crawlSourceList.length;i++){
@ -147,7 +150,9 @@
" <td class=\"goread\" id='sourceStatus"+crawlSource.id+"'>"+(crawlSource.sourceStatus==0?'停止运行':'正在运行')+
" </td>\n" +
" <td class=\"goread\" id='opt"+crawlSource.id+"'><a href='javascript:openOrStopCrawl("+crawlSource.id+","+crawlSource.sourceStatus+")'>"+(crawlSource.sourceStatus==0?'开启':'关闭')+" </a></td> </tr>");
" <td class=\"goread\" id='opt"+crawlSource.id+"'><a href='javascript:openOrStopCrawl("+crawlSource.id+","+crawlSource.sourceStatus+")'>"+(crawlSource.sourceStatus==0?'开启':'关闭')+" </a>" +
"<a href='javascript:updateCrawlSource("+crawlSource.id+")'>修改 </a>" +
"</td> </tr>");
}
$("#crawlSourceList").html(crawlSourceListHtml);
@ -196,7 +201,12 @@
})
}
// Stores the id of the crawl source to edit in localStorage under "crawlSourceId",
// then navigates to the edit page.
function updateCrawlSource(crawlSourceId){
    var editPage = "/crawl/crawlSource_update.html";
    localStorage.setItem("crawlSourceId", crawlSourceId);
    window.location.href = editPage;
}
function openOrStopCrawl(sourceId,status) {

View File

@ -0,0 +1,171 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
<meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1"/>
<title>爬虫管理系统-小说精品屋</title>
<link rel="stylesheet" href="/css/base.css?v=1"/>
<link rel="stylesheet" href="/css/user.css"/>
</head>
<body class="">
<div class="header">
<div class="mainNav" id="mainNav">
<div class="box_center cf"
style="text-align: center;height: 44px;line-height: 48px;color: #fff;font-size: 16px;">
小说精品屋爬虫管理
</div>
</div>
</div>
<div class="main box_center cf">
<div class="userBox cf">
<div class="my_l">
<ul class="log_list">
<li><a class="link_1 on" href="/">爬虫源管理</a></li>
<li><a class="link_1" href="/crawl/crawlSingleTask_list.html">单本采集管理</a></li>
<li><a class="link_1" href="/crawl/crawlSource_test.html" target="_blank" >规则测试</a></li>
<!--<li><a class="link_1 " href="/user/userinfo.html">批量小说爬取</a></li>
<li><a class="link_4 " href="/user/favorites.html">单本小说爬取</a></li>-->
</ul>
</div>
<div class="my_r">
<div class="my_bookshelf">
<div class="userBox cf">
<form method="post" action="./register.html" id="form2">
<div class="aspNetHidden">
<input type="hidden" name="__VIEWSTATE" id="__VIEWSTATE"
value="/wEPDwUKLTIzNjMxNDQxNw9kFgJmD2QWAmYPFgIeBFRleHQFqAE8YSBocmVmPSIvc2VhcmNoLmFzcHg/c2VhcmNoS2V5PeWWu+Wuiembr++8jOeLhOazve+8jOeBteW8gu+8jOWJjeS4luS7iueUn++8jOWGpeeOi+msvOWkqyIgdGFyZ2V0PSJfYmxhbmsiPuWWu+Wuiembr++8jOeLhOazve+8jOeBteW8gu+8jOWJjeS4luS7iueUn++8jOWGpeeOi+msvOWkqzwvYT5kZOquoASBvnvPbc/TYIQiLhSPJ8GKnYQrmk7jGhb5AC5Q">
</div>
<div class="aspNetHidden">
<input type="hidden" name="__VIEWSTATEGENERATOR" id="__VIEWSTATEGENERATOR" value="23AA6834">
<input type="hidden" name="__EVENTVALIDATION" id="__EVENTVALIDATION"
value="/wEdAAVece19BIZ9HiByRfHz3pfnqKSXUE1UN51mNFrIuw38c3Y2+Mc6SrnAqio3oCKbxYZZ1lS+gZUZKpbsAea8j7ASAv40DHFcQ/NE7tJUnABeyQ3d9sFDIcFCYNqlVtprfLoh4JFy0U+R/CcMuyAiWTz7">
</div>
<div class="user_l">
<div></div>
<h3>爬虫源信息填写示例均为顶点小说网dingdiann.com</h3>
<ul class="log_list">
<li><span id="LabErr"></span></li>
示例<b>http://m.xdingdiann.com/sort/{catId}/{page}.html</b> ({catId}代表分类ID{page}代表分页页码)
<li><input type="text" id="url" class="s_input icon_key"
placeholder="url"></li>
示例<b>value="(\d+)/\d+"</b>
<li><input type="text" id="rule" class="s_input icon_name" placeholder="规则"></li>
示例<b>1强制刷新 空或0使用缓存</b>
<li><input type="text" id="isRefresh" class="s_input icon_name" placeholder="是否强制刷新"></li>
<li><textarea rows="20" cols="100" id="resultMap"></textarea></li>
<li><input type="button" onclick="testCrawlSource()" name="btnRegister" value="测试"
id="btnRegister" class="btn_red"></li>
</ul>
</div>
</form>
</div>
<!--<div id="divData" class="updateTable">
<table cellpadding="0" cellspacing="0">
<thead>
<tr>
<th class="name">
爬虫源已开启的爬虫源
</th>
<th class="chapter">
成功爬取数量websocket实现
</th>
<th class="time">
目标爬取数量
</th>
<th class="goread">
状态正在运行已停止一次只能运行一个爬虫源
</th>
<th class="goread">
操作启动停止
</th>
</tr>
</thead>
<tbody id="bookShelfList">
</tbody>
</table>
<div class="pageBox cf" id="shellPage">
</div>
</div>-->
</div>
</div>
</div>
</div>
</body>
<script src="/javascript/jquery-1.8.0.min.js" type="text/javascript"></script>
<script src="/layui/layui.all.js" type="text/javascript"></script>
<script src="/javascript/header.js" type="text/javascript"></script>
<script src="/javascript/user.js" type="text/javascript"></script>
<script language="javascript" type="text/javascript">
// Restores the URL and rule from the previous test run (saved in localStorage by
// testCrawlSource) into the form, leaving the inputs empty when nothing was saved.
function load() {
    $("#url").val(localStorage.getItem("testParseUrl") || "");
    $("#rule").val(localStorage.getItem("testParseRule") || "");
}
// Nothing in this page's inline script invoked load(), so the saved values were never
// restored. The script runs after the form markup, so the inputs already exist here.
load();
// Validates the test form, remembers the URL and rule in localStorage, and posts them to
// /crawl/testParse; the returned result map is shown as JSON in the #resultMap textarea.
function testCrawlSource() {
    var refreshFlag = $("#isRefresh").val();

    var ruleText = $("#rule").val();
    if (ruleText.length == 0) {
        layer.alert("正则必填");
        return false;
    }

    var pageUrl = $("#url").val();
    if (pageUrl.length == 0) {
        layer.alert("url必填");
        return false;
    }

    // Persist the inputs so they can be restored on the next visit.
    localStorage.setItem("testParseUrl", pageUrl);
    localStorage.setItem("testParseRule", ruleText);

    $.ajax({
        type: "POST",
        url: "/crawl/testParse",
        data: {isRefresh: refreshFlag, rule: ruleText, url: pageUrl},
        dataType: "json",
        success: function (resp) {
            if (resp.code != 200) {
                layer.alert(resp.msg);
                return;
            }
            $("#resultMap").val(JSON.stringify(resp.data));
        },
        error: function () {
            layer.alert('网络异常');
        }
    })
}
</script>
</html>

View File

@ -0,0 +1,522 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
<meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1"/>
<title>爬虫管理系统-小说精品屋</title>
<link rel="stylesheet" href="/css/base.css?v=1"/>
<link rel="stylesheet" href="/css/user.css"/>
</head>
<body class="">
<div class="header">
<div class="mainNav" id="mainNav">
<div class="box_center cf"
style="text-align: center;height: 44px;line-height: 48px;color: #fff;font-size: 16px;">
小说精品屋爬虫管理
</div>
</div>
</div>
<div class="main box_center cf">
<div class="userBox cf">
<div class="my_l">
<ul class="log_list">
<li><a class="link_1 on" href="/">爬虫源管理</a></li>
<li><a class="link_1" href="/crawl/crawlSingleTask_list.html">单本采集管理</a></li>
<li><a class="link_1" href="/crawl/crawlSource_test.html" target="_blank" >规则测试</a></li>
<!--<li><a class="link_1 " href="/user/userinfo.html">批量小说爬取</a></li>
<li><a class="link_4 " href="/user/favorites.html">单本小说爬取</a></li>-->
</ul>
</div>
<div class="my_r">
<div class="my_bookshelf">
<div class="userBox cf">
<form method="post" action="./register.html" id="form2">
<input type="hidden" name="id" id="sourceId"/>
<div class="aspNetHidden">
<input type="hidden" name="__VIEWSTATE" id="__VIEWSTATE"
value="/wEPDwUKLTIzNjMxNDQxNw9kFgJmD2QWAmYPFgIeBFRleHQFqAE8YSBocmVmPSIvc2VhcmNoLmFzcHg/c2VhcmNoS2V5PeWWu+Wuiembr++8jOeLhOazve+8jOeBteW8gu+8jOWJjeS4luS7iueUn++8jOWGpeeOi+msvOWkqyIgdGFyZ2V0PSJfYmxhbmsiPuWWu+Wuiembr++8jOeLhOazve+8jOeBteW8gu+8jOWJjeS4luS7iueUn++8jOWGpeeOi+msvOWkqzwvYT5kZOquoASBvnvPbc/TYIQiLhSPJ8GKnYQrmk7jGhb5AC5Q">
</div>
<div class="aspNetHidden">
<input type="hidden" name="__VIEWSTATEGENERATOR" id="__VIEWSTATEGENERATOR" value="23AA6834">
<input type="hidden" name="__EVENTVALIDATION" id="__EVENTVALIDATION"
value="/wEdAAVece19BIZ9HiByRfHz3pfnqKSXUE1UN51mNFrIuw38c3Y2+Mc6SrnAqio3oCKbxYZZ1lS+gZUZKpbsAea8j7ASAv40DHFcQ/NE7tJUnABeyQ3d9sFDIcFCYNqlVtprfLoh4JFy0U+R/CcMuyAiWTz7">
</div>
<div class="user_l">
<div></div>
<h3>爬虫源信息填写示例均为顶点小说网dingdiann.com</h3>
<ul class="log_list">
<li><span id="LabErr"></span></li>
示例<b>新顶点小说网</b>
<li><input type="text" id="sourceName" class="s_input icon_name" placeholder="源站名"></li>
<!--示例<b>https://m.xdingdiann.com/sort/0/1.html</b>
<li><input type="text" id="updateBookListUrl" class="s_input icon_key"
placeholder="小说更新列表url"></li>-->
示例<b>http://m.xdingdiann.com/sort/{catId}/{page}.html</b> ({catId}代表分类ID{page}代表分页页码)
<li><input type="text" id="bookListUrl" class="s_input icon_key"
placeholder="分类列表页URL规则"></li>
示例<b>1</b>
<li><input type="text" id="catId1" class="s_input icon_key" placeholder="玄幻奇幻分类ID"></li>
示例<b>2</b>
<li><input type="text" id="catId2" class="s_input icon_key" placeholder="武侠仙侠分类ID"></li>
示例<b>3</b>
<li><input type="text" id="catId3" class="s_input icon_key" placeholder="都市言情分类ID"></li>
示例<b>4</b>
<li><input type="text" id="catId4" class="s_input icon_key" placeholder="历史军事分类ID"></li>
示例<b>5</b>
<li><input type="text" id="catId5" class="s_input icon_key" placeholder="科幻灵异分类ID"></li>
示例<b>6</b>
<li><input type="text" id="catId6" class="s_input icon_key" placeholder="网游竞技分类ID"></li>
示例<b>7</b>
<li><input type="text" id="catId7" class="s_input icon_key" placeholder="女生频道分类ID"></li>
示例<b>href="/ddk(\d+)/"</b>
<li><input type="text" id="bookIdPatten" class="s_input icon_key"
placeholder="列表页小说ID正则表达式"></li>
<b>value="(\d+)/\d+"</b>
<li><input type="text" id="pagePatten" class="s_input icon_key"
placeholder="列表页当前分页页码正则表达式:"></li>
<b>value="\d+/(\d+)"</b>
<li><input type="text" id="totalPagePatten" class="s_input icon_key"
placeholder="列表页分页总页数正则表达式:"></li>
<b>http://m.xdingdiann.com/ddk{bookId}</b> (bookId代表小说ID)
<li><input type="text" id="bookDetailUrl" class="s_input icon_key"
placeholder="详情页URL规则"></li>
示例<b>&lt;p class="title"&gt;([^/]+)&lt;/p&gt;</b>
<li><input type="text" id="bookNamePatten" class="s_input icon_key"
placeholder="小说名的正则表达式:"></li>
示例<b>作者([^/]+)&lt;</b>
<li><input type="text" id="authorNamePatten" class="s_input icon_key"
placeholder="小说作者的正则表达式:"></li>
示例<b>&lt;img src="([^>]+)"\s+onerror="this.src=</b>
<li><input type="text" id="picUrlPatten" class="s_input icon_key"
placeholder="小说图片路径的正则表达式:"></li>
<b>可空适用于图片路径为相对路径的源站加上小说图片路径则为完整的可访问的图片路径</b>
<li><input type="text" id="picUrlPrefix" class="s_input icon_key"
placeholder="小说图片访问路径前缀:"></li>
示例<b>状态([^/]+)&lt;/li&gt;</b>
<li><input type="text" id="statusPatten" class="s_input icon_key"
placeholder="小说状态的正则表达式:"></li>
示例<b>连载</b>
<li><input type="text" id="bookStatus0" class="s_input icon_key"
placeholder="连载中的小说在此网站的具体表现值:"></li>
示例<b>完结</b>
<li><input type="text" id="bookStatus1" class="s_input icon_key"
placeholder="全本小说在此网站的具体表现值:"></li>
示例<b>&lt;div\s+class="score"&gt;(\d+\.\d+)分&lt;/div&gt;</b>
<li><input type="text" id="scorePatten" class="s_input icon_key"
placeholder="小说评分的正则表达式:"></li>
示例<b></b>
<li><input type="text" id="visitCountPatten" class="s_input icon_key"
placeholder="小说点击量的正则表达式:"></li>
示例<b>&lt;p class="review"&gt;</b>
<li><input type="text" id="descStart" class="s_input icon_key"
placeholder="小说简介开始截取字符串:"></li>
示例<b>&lt;/p&gt;</b>
<li><input type="text" id="descEnd" class="s_input icon_key" placeholder="小说简介结束截取字符串">
</li>
示例<b>更新(\d+-\d+-\d+\s\d+:\d+:\d+)&lt;/a&gt;</b>
<li><input type="text" id="upadateTimePatten" class="s_input icon_key"
placeholder="小说更新时间的正则表达式:"></li>
示例<b>yyyy-MM-dd HH:mm:ss</b>
<li><input type="text" id="upadateTimeFormatPatten" class="s_input icon_key"
placeholder="小说更新时间在此网站的显示模式:"></li>
示例<b>http://m.xdingdiann.com/ddk{bookId}/all.html</b> (bookId代表小说ID)
<li><input type="text" id="bookIndexUrl" class="s_input icon_key"
placeholder="小说目录页的URL规则"></li>
<b>可空适用于最新章节列表和全部章节列表在同一个页面的源站</b>
<li><input type="text" id="bookIndexStart" class="s_input icon_key"
placeholder="小说目录页内容开始截取字符串:"></li>
示例<b>&lt;a\s+style=""\s+href="/ddk\d+/(\d+)\.html"&gt;[^/]+&lt;/a&gt;</b>
<li><input type="text" id="indexIdPatten" class="s_input icon_key"
placeholder="目录页目录ID正则表达式"></li>
示例<b>&lt;a\s+style=""\s+href="/ddk\d+/\d+\.html"&gt;([^/]+)&lt;/a&gt;</b>
<li><input type="text" id="indexNamePatten" class="s_input icon_key"
placeholder="目录页目录名的正则表达式:"></li>
示例<b>http://m.xdingdiann.com/ddk{bookId}/{indexId}.html</b>
(bookId代表小说ID,{indexId}代表目录ID)
<li><input type="text" id="bookContentUrl" class="s_input icon_key"
placeholder="小说内容页的URL规则"></li>
示例<b>id="content"></b>
<li><input type="text" id="contentStart" class="s_input icon_key"
placeholder="小说内容开始截取字符串:"></li>
示例<b>&lt;script&gt;</b>
<li><input type="text" id="contentEnd" class="s_input icon_key"
placeholder="小说内容结束截取字符串:"></li>
<li><input type="button" onclick="updateCrawlSource()" name="btnRegister" value="提交"
id="btnRegister" class="btn_red"></li>
</ul>
</div>
</form>
</div>
<!--<div id="divData" class="updateTable">
<table cellpadding="0" cellspacing="0">
<thead>
<tr>
<th class="name">
爬虫源已开启的爬虫源
</th>
<th class="chapter">
成功爬取数量websocket实现
</th>
<th class="time">
目标爬取数量
</th>
<th class="goread">
状态正在运行已停止一次只能运行一个爬虫源
</th>
<th class="goread">
操作启动停止
</th>
</tr>
</thead>
<tbody id="bookShelfList">
</tbody>
</table>
<div class="pageBox cf" id="shellPage">
</div>
</div>-->
</div>
</div>
</div>
</div>
</body>
<script src="/javascript/jquery-1.8.0.min.js" type="text/javascript"></script>
<script src="/layui/layui.all.js" type="text/javascript"></script>
<script src="/javascript/header.js" type="text/javascript"></script>
<script src="/javascript/user.js" type="text/javascript"></script>
<script language="javascript" type="text/javascript">
// Loads the crawl source whose id was stored in localStorage under "crawlSourceId"
// and fills the form with it. Does nothing when no id was stored.
function load(){
    var crawlSourceId = localStorage.getItem("crawlSourceId");
    if(crawlSourceId==null){
        return;
    }
    $.ajax({
        type: "GET",
        url: "/crawl/getCrawlSource/"+crawlSourceId,
        dataType: "json",
        success: function (data) {
            if (data.code == 200) {
                // The server answers with a null payload for an unknown id; skip filling
                // the form in that case instead of throwing inside loadPage.
                if (data.data) {
                    loadPage(data.data);
                }
            } else if (data.code == 1001) {
                // Not logged in: go to the login page and pass the current page as originUrl.
                // The value must be ENcoded to survive as a query parameter; the original
                // called decodeURIComponent here, which leaves '&', '#', '?' unescaped and
                // throws on a stray '%'.
                location.href = '/user/login.html?originUrl=' + encodeURIComponent(location.href);
            }else {
                layer.alert(data.msg);
            }
        },
        error: function () {
            layer.alert('网络异常');
        }
    })
}
// Fills the edit form from a crawl source object: its id, its name, and every field of
// its JSON-encoded crawlRule. Only id and name are filled when crawlRule is empty.
function loadPage(data){
    $("#sourceId").val(data.id);
    $("#sourceName").val(data.sourceName);
    if(!data.crawlRule){
        return;
    }
    var crawlRule = JSON.parse(data.crawlRule);

    // Category ids: a missing catIdRule leaves all seven inputs empty
    // (explicit default instead of a swallowed TypeError).
    var catIdRule = crawlRule.catIdRule || {};
    for (var i = 1; i <= 7; i++) {
        $("#catId" + i).val(catIdRule["catId" + i]);
    }

    // bookStatusRule maps the site's status text to 0 or 1 (this is how updateCrawlSource
    // saves it). Restore each text into the input numbered by its VALUE; the original used
    // the iteration index, which put a lone "status 1" text into the "status 0" input.
    var bookStatusRule = crawlRule.bookStatusRule || {};
    for (var statusText in bookStatusRule) {
        if (Object.prototype.hasOwnProperty.call(bookStatusRule, statusText)) {
            $("#bookStatus" + bookStatusRule[statusText]).val(statusText);
        }
    }

    // Every remaining input has the same id as its crawlRule property.
    var plainFields = [
        "bookListUrl", "bookIdPatten", "pagePatten", "totalPagePatten", "bookDetailUrl",
        "bookNamePatten", "authorNamePatten", "picUrlPatten", "picUrlPrefix", "statusPatten",
        "scorePatten", "visitCountPatten", "descStart", "descEnd", "upadateTimePatten",
        "upadateTimeFormatPatten", "bookIndexUrl", "bookIndexStart", "indexIdPatten",
        "indexNamePatten", "bookContentUrl", "contentStart", "contentEnd"
    ];
    for (var j = 0; j < plainFields.length; j++) {
        $("#" + plainFields[j]).val(crawlRule[plainFields[j]]);
    }
}
load();
// Collects the edit form into a crawlRule object, validating fields in form order and
// stopping at the first missing required one, then posts id, name and the JSON-encoded
// rule to /crawl/updateCrawlSource. Returns false when validation fails.
function updateCrawlSource() {
    var crawlRule = {};

    // Copies a mandatory input into crawlRule under its own id; alerts and reports
    // failure when the input is empty.
    function required(fieldId, emptyMessage) {
        var text = $("#" + fieldId).val();
        if (text.length == 0) {
            layer.alert(emptyMessage);
            return false;
        }
        crawlRule[fieldId] = text;
        return true;
    }

    // Copies an optional input into crawlRule under its own id when it is not empty.
    function optional(fieldId) {
        var text = $("#" + fieldId).val();
        if (text.length > 0) {
            crawlRule[fieldId] = text;
        }
    }

    var sourceId = $("#sourceId").val();
    var sourceName = $("#sourceName").val();
    if (sourceName.length == 0) {
        layer.alert("源站名必填");
        return false;
    }

    if (!required("bookListUrl", "分类列表页URL规则必填")) {
        return false;
    }

    // Category ids catId1..catId7: at least one must be filled in.
    var catIdRule = {};
    for (var n = 1; n <= 7; n++) {
        var catId = $("#catId" + n).val();
        if (catId.length > 0) {
            catIdRule["catId" + n] = catId;
        }
    }
    if (Object.keys(catIdRule).length == 0) {
        layer.alert("分类ID至少要填一项");
        return false;
    }
    crawlRule.catIdRule = catIdRule;

    if (!required("bookIdPatten", "列表页小说ID正则表达式必填")) {
        return false;
    }
    optional("pagePatten");
    optional("totalPagePatten");
    if (!required("bookDetailUrl", "详情页URL规则必填")) {
        return false;
    }
    if (!required("bookNamePatten", "小说名的正则表达式必填")) {
        return false;
    }
    if (!required("authorNamePatten", "小说作者的正则表达式必填")) {
        return false;
    }
    optional("picUrlPatten");
    optional("picUrlPrefix");
    optional("statusPatten");

    // Maps the site's status text to 0 (input #bookStatus0) or 1 (input #bookStatus1).
    var bookStatusRule = {};
    for (var s = 0; s <= 1; s++) {
        var statusText = $("#bookStatus" + s).val();
        if (statusText.length > 0) {
            bookStatusRule[statusText] = s;
        }
    }
    crawlRule.bookStatusRule = bookStatusRule;

    optional("scorePatten");
    optional("visitCountPatten");
    if (!required("descStart", "小说简介开始截取字符串必填")) {
        return false;
    }
    if (!required("descEnd", "小说简介结束截取字符串必填")) {
        return false;
    }
    optional("upadateTimePatten");
    optional("upadateTimeFormatPatten");
    if (!required("bookIndexUrl", "小说目录页的URL规则必填")) {
        return false;
    }
    optional("bookIndexStart");
    if (!required("indexIdPatten", "小说目录页的目录ID正则表达式必填")) {
        return false;
    }
    if (!required("indexNamePatten", "小说目录页的目录名正则表达式必填")) {
        return false;
    }
    if (!required("bookContentUrl", "小说内容页的URL规则必填")) {
        return false;
    }
    if (!required("contentStart", "小说内容开始截取字符串必填")) {
        return false;
    }
    if (!required("contentEnd", "小说内容结束截取字符串必填")) {
        return false;
    }

    $.ajax({
        type: "POST",
        url: "/crawl/updateCrawlSource",
        data: {'id': sourceId, 'sourceName': sourceName, 'crawlRule': JSON.stringify(crawlRule)},
        dataType: "json",
        success: function (resp) {
            if (resp.code != 200) {
                layer.alert(resp.msg);
                return;
            }
            window.location.href = '/crawl/crawlSource_list.html';
        },
        error: function () {
            layer.alert('网络异常');
        }
    })
}
</script>
</html>