diff --git a/novel-admin/pom.xml b/novel-admin/pom.xml index 0fed554..29d0b79 100644 --- a/novel-admin/pom.xml +++ b/novel-admin/pom.xml @@ -224,17 +224,17 @@ - + - - + –> org.springframework.boot spring-boot-maven-plugin + --> + + + + + org.springframework.boot + spring-boot-maven-plugin + + ZIP + + + non-exists + non-exists + + + + + + + org.apache.maven.plugins + maven-dependency-plugin + + + copy-dependencies + package + + copy-dependencies + + + + target/lib + false + false + runtime + + + + + + maven-antrun-plugin + 1.8 + + + package + + run + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - + diff --git a/novel-common/src/main/java/com/java2nb/novel/core/cache/CacheKey.java b/novel-common/src/main/java/com/java2nb/novel/core/cache/CacheKey.java index 75a585a..750c53f 100644 --- a/novel-common/src/main/java/com/java2nb/novel/core/cache/CacheKey.java +++ b/novel-common/src/main/java/com/java2nb/novel/core/cache/CacheKey.java @@ -65,4 +65,8 @@ public interface CacheKey { * 累积的小说点击量 * */ String BOOK_ADD_VISIT_COUNT = "bookAddVisitCount"; -} \ No newline at end of file + /** + * 测试爬虫规则缓存 + */ + String BOOK_TEST_PARSE = "testParse"; +} diff --git a/novel-common/src/main/java/com/java2nb/novel/entity/News.java b/novel-common/src/main/java/com/java2nb/novel/entity/News.java index c555349..437cfba 100644 --- a/novel-common/src/main/java/com/java2nb/novel/entity/News.java +++ b/novel-common/src/main/java/com/java2nb/novel/entity/News.java @@ -1,9 +1,11 @@ package com.java2nb.novel.entity; +import java.io.Serializable; import java.util.Date; import javax.annotation.Generated; -public class News { +public class News implements Serializable { + @Generated("org.mybatis.generator.api.MyBatisGenerator") private Long id; @@ -146,4 +148,4 @@ public class News { public void setContent(String content) { this.content = content == null ? null : content.trim(); } -} \ No newline at end of file +} diff --git a/novel-crawl/pom.xml b/novel-crawl/pom.xml index 10710b9..cb2047d 100644 --- a/novel-crawl/pom.xml +++ b/novel-crawl/pom.xml @@ -32,11 +32,82 @@ + org.springframework.boot spring-boot-maven-plugin + + ZIP + + + non-exists + non-exists + + + + + + + org.apache.maven.plugins + maven-dependency-plugin + + + copy-dependencies + package + + copy-dependencies + + + + target/lib + false + false + runtime + + + + + + maven-antrun-plugin + 1.8 + + + package + + run + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - \ No newline at end of file + diff --git a/novel-crawl/src/main/build/scripts/readme.txt b/novel-crawl/src/main/build/scripts/readme.txt new file mode 100644 index 0000000..a3bab7e --- /dev/null +++ b/novel-crawl/src/main/build/scripts/readme.txt @@ -0,0 +1,8 @@ +1:linux启动环境 +sh start.sh + +3:windows启动环境 +windows-start.bat + +3:linux停止应用 +sh stop.sh diff --git a/novel-crawl/src/main/build/scripts/start.sh b/novel-crawl/src/main/build/scripts/start.sh new file mode 100644 index 0000000..17f0acd --- /dev/null +++ b/novel-crawl/src/main/build/scripts/start.sh @@ -0,0 +1,47 @@ +#!/bin/bash + +ENGINE=novel-crawl.jar +cd ../ + +#部署目路 +DEPLOY_DIR=`pwd` +#获取到当前目录的名称 +SERVER_NAME=`basename $DEPLOY_DIR` + +#应用进程 +PIDS=`ps -ef | grep java | grep "$ENGINE" |awk '{print $2}'` +#设置日志文件的输出目录 +LOGS_DIR=$DEPLOY_DIR/logs +if [ ! -d $LOGS_DIR ]; then + mkdir $LOGS_DIR +fi +#日志 +STDOUT_FILE=$LOGS_DIR/stdout.log +#JAVA 环境配置 +JAVA_OPTS=" -Djava.net.preferIPv4Stack=true -Dlog.home=$LOGS_DIR" + +JAVA_MEM_OPTS=" -server -Xms1024m -Xmx1024m -XX:MetaspaceSize=64m -XX:MaxMetaspaceSize=256m -XX:+UseG1GC -XX:MaxGCPauseMillis=100 -XX:InitiatingHeapOccupancyPercent=50 -XX:+PrintGCApplicationStoppedTime -XX:+PrintGCApplicationConcurrentTime -XX:+PrintGCDateStamps -verbose:gc -XX:+PrintGCDetails -XX:+PrintHeapAtGC -Xloggc:$LOGS_DIR/gc.log -XX:+UseGCLogFileRotation -XX:NumberOfGCLogFiles=10 -XX:GCLogFileSize=100M -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=dump.hprof " +#退出标志 +RETVAL="0" + +if [ -n "$PIDS" ]; then + echo "ERROR: The $SERVER_NAME already started!" + echo "PID: $PIDS" + exit $RETVAL +fi + +nohup java -jar $JAVA_OPTS $JAVA_MEM_OPTS -Dloader.path=conf,lib $ENGINE > $STDOUT_FILE 2>&1 & +COUNT=0 + while [ $COUNT -lt 1 ]; do + echo -e ".\c" + sleep 1 + COUNT=`ps -f | grep java | grep "$DEPLOY_DIR" | awk '{print $2}' | wc -l` + if [ $COUNT -gt 0 ]; then + break + fi + done + + echo "OK!" + PIDS=`ps -f | grep java | grep "$DEPLOY_DIR" | awk '{print $2}'` + echo "PID: $PIDS" + echo "STDOUT: $STDOUT_FILE" diff --git a/novel-crawl/src/main/build/scripts/stop.sh b/novel-crawl/src/main/build/scripts/stop.sh new file mode 100644 index 0000000..6cf8aed --- /dev/null +++ b/novel-crawl/src/main/build/scripts/stop.sh @@ -0,0 +1,33 @@ +#!/bin/bash +SERVER_NAME=novel-crawl.jar + +#应用进程 +PIDS=`ps -ef | grep java | grep "$SERVER_NAME" |awk '{print $2}'` +if [ -z "$PIDS" ]; then + echo "ERROR: The $SERVER_NAME does not started!" + exit 1 + fi + + echo -e "Stopping the $SERVER_NAME ...\c" + for PID in $PIDS ; do + kill $PID > /dev/null 2>&1 + done + + COUNT=0 + while [ $COUNT -lt 1 ]; do + echo -e ".\c" + sleep 1 + COUNT=1 + for PID in $PIDS ; do + PID_EXIST=`ps -f -p $PID | grep java` + if [ -n "$PID_EXIST" ]; then + COUNT=0 + break + fi + done + done + + echo "OK!" + echo "PID: $PIDS" + PIDS="" + diff --git a/novel-crawl/src/main/build/scripts/windows-start.bat b/novel-crawl/src/main/build/scripts/windows-start.bat new file mode 100644 index 0000000..90cfdb9 --- /dev/null +++ b/novel-crawl/src/main/build/scripts/windows-start.bat @@ -0,0 +1,10 @@ +@echo off +setlocal enabledelayedexpansion +set JAVA=java + +set OPTS=-XX:MetaspaceSize=128m -XX:MaxMetaspaceSize=128m -Xms1024m -Xmx1024m -Xmn256m -Xss256k -XX:SurvivorRatio=8 -XX:+UseConcMarkSweepGC +set ENGINE=novel-crawl.jar +cd ../ +java -jar %OPTS% -Dloader.path=conf,lib %ENGINE% +pause + diff --git a/novel-crawl/src/main/java/com/java2nb/novel/controller/CrawlController.java b/novel-crawl/src/main/java/com/java2nb/novel/controller/CrawlController.java index 80c9c16..1c4cf42 100644 --- a/novel-crawl/src/main/java/com/java2nb/novel/controller/CrawlController.java +++ b/novel-crawl/src/main/java/com/java2nb/novel/controller/CrawlController.java @@ -1,13 +1,26 @@ package com.java2nb.novel.controller; +import com.fasterxml.jackson.databind.ObjectMapper; import com.java2nb.novel.core.bean.PageBean; import com.java2nb.novel.core.bean.ResultBean; +import com.java2nb.novel.core.cache.CacheKey; +import com.java2nb.novel.core.cache.CacheService; +import com.java2nb.novel.core.crawl.CrawlParser; +import com.java2nb.novel.core.crawl.RuleBean; +import com.java2nb.novel.core.utils.HttpUtil; +import com.java2nb.novel.entity.BookIndex; import com.java2nb.novel.entity.CrawlSingleTask; import com.java2nb.novel.entity.CrawlSource; import com.java2nb.novel.service.CrawlService; +import com.java2nb.novel.utils.Constants; import lombok.RequiredArgsConstructor; import org.springframework.web.bind.annotation.*; +import java.util.HashMap; +import java.util.Map; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + /** * @author Administrator */ @@ -18,7 +31,7 @@ public class CrawlController { private final CrawlService crawlService; - + private final CacheService cacheService; /** * 新增爬虫源 * */ @@ -38,7 +51,70 @@ public class CrawlController { return ResultBean.ok(crawlService.listCrawlByPage(page,pageSize)); } + /** + * 获取爬虫源 + * */ + @GetMapping("getCrawlSource/{id}") + public ResultBean getCrawlSource(@PathVariable("id") Integer id){ + CrawlSource crawlSource= crawlService.getCrawlSource(id); + return ResultBean.ok(crawlSource); + } + + /** + * 测试规则 + * @param rule + * @param url + * @param isRefresh + * @return + */ + @PostMapping("testParse") + public ResultBean testParse(String rule,String url,String isRefresh){ + + Map resultMap=new HashMap<>(); + String html =null; + if(url.startsWith("https://")||url.startsWith("http://")){ + String refreshCache="1"; + if(!refreshCache.equals(isRefresh)) { + Object cache = cacheService.getObject(CacheKey.BOOK_TEST_PARSE + url); + if (cache == null) { + isRefresh="1"; + }else { + html = (String) cache; + } + } + if(refreshCache.equals(isRefresh)){ + html = HttpUtil.getByHttpClientWithChrome(url); + if (html != null) { + cacheService.setObject(CacheKey.BOOK_TEST_PARSE + url, html, 60 * 10); + }else{ + resultMap.put("msg","html is null"); + return ResultBean.ok(resultMap); + } + } + }else{ + resultMap.put("html","url is null"); + return ResultBean.ok(resultMap); + } + Pattern pattern = Pattern.compile(rule); + Matcher matcher = pattern.matcher(html); + boolean isFind = matcher.find(); + resultMap.put("是否匹配",isFind); + if(isFind){ + resultMap.put("匹配结果",matcher.group(1)); + } + // resultMap.put("url",url); + return ResultBean.ok(resultMap); + } + /** + * 修改爬虫源 + * */ + @PostMapping("updateCrawlSource") + public ResultBean updateCrawlSource(CrawlSource source){ + crawlService.updateCrawlSource(source); + return ResultBean.ok(); + + } /** * 开启或停止爬虫 * */ diff --git a/novel-crawl/src/main/java/com/java2nb/novel/service/CrawlService.java b/novel-crawl/src/main/java/com/java2nb/novel/service/CrawlService.java index d8e9821..a3b2365 100644 --- a/novel-crawl/src/main/java/com/java2nb/novel/service/CrawlService.java +++ b/novel-crawl/src/main/java/com/java2nb/novel/service/CrawlService.java @@ -1,6 +1,7 @@ package com.java2nb.novel.service; import com.java2nb.novel.core.bean.PageBean; +import com.java2nb.novel.core.bean.ResultBean; import com.java2nb.novel.core.crawl.RuleBean; import com.java2nb.novel.entity.CrawlSingleTask; import com.java2nb.novel.entity.CrawlSource; @@ -18,7 +19,11 @@ public interface CrawlService { * */ void addCrawlSource(CrawlSource source); - + /** + * 修改爬虫源 + * @param source + */ + void updateCrawlSource(CrawlSource source); /** * 爬虫源分页列表 * @param page 当前页码 @@ -106,4 +111,11 @@ public interface CrawlService { * @param status 采集状态 * */ void updateCrawlSingleTask(CrawlSingleTask task, Byte status); + + /** + * 获取采集规则详细 + * @param id + * @return + */ + CrawlSource getCrawlSource(Integer id); } diff --git a/novel-crawl/src/main/java/com/java2nb/novel/service/impl/CrawlServiceImpl.java b/novel-crawl/src/main/java/com/java2nb/novel/service/impl/CrawlServiceImpl.java index c9492cc..0421f12 100644 --- a/novel-crawl/src/main/java/com/java2nb/novel/service/impl/CrawlServiceImpl.java +++ b/novel-crawl/src/main/java/com/java2nb/novel/service/impl/CrawlServiceImpl.java @@ -39,6 +39,7 @@ import java.util.regex.Pattern; import static com.java2nb.novel.core.utils.HttpUtil.getByHttpClientWithChrome; import static com.java2nb.novel.mapper.CrawlSourceDynamicSqlSupport.*; +import static com.java2nb.novel.mapper.CrawlSourceDynamicSqlSupport.id; import static org.mybatis.dynamic.sql.SqlBuilder.isEqualTo; import static org.mybatis.dynamic.sql.select.SelectDSL.select; @@ -69,7 +70,24 @@ public class CrawlServiceImpl implements CrawlService { crawlSourceMapper.insertSelective(source); } - + @Override + public void updateCrawlSource(CrawlSource source) { + if(source.getId()!=null){ + Optional opt=crawlSourceMapper.selectByPrimaryKey(source.getId()); + if(opt.isPresent()) { + CrawlSource crawlSource =opt.get(); + if (crawlSource.getSourceStatus() == (byte) 1) { + //关闭 + openOrCloseCrawl(crawlSource.getId(),(byte)0); + } + Date currentDate = new Date(); + crawlSource.setUpdateTime(currentDate); + crawlSource.setCrawlRule(source.getCrawlRule()); + crawlSource.setSourceName(source.getSourceName()); + crawlSourceMapper.updateByPrimaryKey(crawlSource); + } + } + } @Override public PageBean listCrawlByPage(int page, int pageSize) { PageHelper.startPage(page, pageSize); @@ -138,12 +156,17 @@ public class CrawlServiceImpl implements CrawlService { @Override public CrawlSource queryCrawlSource(Integer sourceId) { - SelectStatementProvider render = select(CrawlSourceDynamicSqlSupport.sourceStatus, CrawlSourceDynamicSqlSupport.crawlRule) + + SelectStatementProvider render = select(id, sourceName, sourceStatus, createTime, updateTime,crawlRule) .from(crawlSource) .where(id, isEqualTo(sourceId)) .build() .render(RenderingStrategies.MYBATIS3); - return crawlSourceMapper.selectMany(render).get(0); + List list= crawlSourceMapper.selectMany(render); + if(list!=null&&list.size()>0){ + return list.get(0); + } + return null; } @Override @@ -205,6 +228,16 @@ public class CrawlServiceImpl implements CrawlService { } + @Override + public CrawlSource getCrawlSource(Integer id) { + Optional opt=crawlSourceMapper.selectByPrimaryKey(id); + if(opt.isPresent()) { + CrawlSource crawlSource =opt.get(); + return crawlSource; + } + return null; + } + /** * 解析分类列表 */ diff --git a/novel-crawl/src/main/resources/logback-boot.xml b/novel-crawl/src/main/resources/logback-boot.xml index 6be17f5..05c7c83 100644 --- a/novel-crawl/src/main/resources/logback-boot.xml +++ b/novel-crawl/src/main/resources/logback-boot.xml @@ -57,8 +57,8 @@ - + - \ No newline at end of file + diff --git a/novel-crawl/src/main/resources/templates/crawl/crawlSingleTask_add.html b/novel-crawl/src/main/resources/templates/crawl/crawlSingleTask_add.html index b247301..f478835 100644 --- a/novel-crawl/src/main/resources/templates/crawl/crawlSingleTask_add.html +++ b/novel-crawl/src/main/resources/templates/crawl/crawlSingleTask_add.html @@ -30,6 +30,7 @@ 爬虫源管理 单本采集管理 + 规则测试 diff --git a/novel-crawl/src/main/resources/templates/crawl/crawlSingleTask_list.html b/novel-crawl/src/main/resources/templates/crawl/crawlSingleTask_list.html index 786af82..a64b5d0 100644 --- a/novel-crawl/src/main/resources/templates/crawl/crawlSingleTask_list.html +++ b/novel-crawl/src/main/resources/templates/crawl/crawlSingleTask_list.html @@ -30,6 +30,7 @@ 爬虫源管理 单本采集管理 + 规则测试 diff --git a/novel-crawl/src/main/resources/templates/crawl/crawlSource_add.html b/novel-crawl/src/main/resources/templates/crawl/crawlSource_add.html index 02833f4..7ee95d1 100644 --- a/novel-crawl/src/main/resources/templates/crawl/crawlSource_add.html +++ b/novel-crawl/src/main/resources/templates/crawl/crawlSource_add.html @@ -30,6 +30,7 @@ 爬虫源管理 单本采集管理 + 规则测试 diff --git a/novel-crawl/src/main/resources/templates/crawl/crawlSource_list.html b/novel-crawl/src/main/resources/templates/crawl/crawlSource_list.html index d054411..1a7dc3f 100644 --- a/novel-crawl/src/main/resources/templates/crawl/crawlSource_list.html +++ b/novel-crawl/src/main/resources/templates/crawl/crawlSource_list.html @@ -29,6 +29,7 @@ 爬虫源管理 单本采集管理 + 规则测试 @@ -38,7 +39,7 @@ 爬虫源列表 - 增加爬虫源 + 增加爬虫源 @@ -119,8 +120,8 @@ + + + + +