11 Commits

Author SHA1 Message Date
dc20ad42bd update 2020-04-15 13:32:26 +08:00
290522ef6d v2.3.0发布 2020-04-15 10:14:23 +08:00
56645720b3 增加百书斋源 2020-04-14 18:42:09 +08:00
xxy
f648a8e79e 优化 2020-02-07 12:22:50 +08:00
55635e098c 优化 2020-01-20 15:29:30 +08:00
16ac61e8ee 优化 2020-01-20 15:06:46 +08:00
548ab44f87 去除统计代码 2020-01-20 15:03:55 +08:00
d4230f32b0 更新bug修复 2020-01-20 10:55:46 +08:00
637b01e50b 优化 2020-01-18 11:37:35 +08:00
04ab2045f3 更新优化 2020-01-18 11:25:35 +08:00
bb83f5628b bug修复 2020-01-17 12:36:46 +08:00
20 changed files with 127 additions and 306 deletions

View File

@ -10,7 +10,7 @@
</parent> </parent>
<groupId>xyz.zinglizingli</groupId> <groupId>xyz.zinglizingli</groupId>
<artifactId>novel-front</artifactId> <artifactId>novel-front</artifactId>
<version>2.2.0.beta</version> <version>2.3.0.beta</version>
<name>novel-front</name> <name>novel-front</name>
<description>小说精品楼-前台web网站</description> <description>小说精品楼-前台web网站</description>

View File

@ -0,0 +1,29 @@
package xyz.zinglizingli.books.core.config;
import lombok.extern.slf4j.Slf4j;
import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
import org.springframework.boot.context.properties.ConfigurationProperties;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import org.springframework.context.annotation.Primary;
import xyz.zinglizingli.books.core.crawl.BaseHtmlCrawlSource;
import xyz.zinglizingli.books.core.crawl.BiquCrawlSource;
/**
* Spring configuration that registers the crawl source used for the Baishuzhai site.
*
* <p>The bean is only registered when the property {@code crawl.website.type} has the
* value {@code "4"}; its settings are bound from properties under the
* {@code baishuzhai.crawlsource} prefix.
*
* @author 11797
*/
@Slf4j
@Configuration
public class CrawlBaishuzhaiConfig {
/**
* Creates the crawl source for Baishuzhai, reusing the {@link BiquCrawlSource} implementation.
*
* <p>NOTE(review): the method name (and therefore the default bean name) is
* {@code dingdianCrawlSource}, the same as in {@code CrawlDingdianConfig} -- this looks like a
* copy-paste leftover. The two beans are guarded by different {@code havingValue}s; confirm
* nothing looks the bean up by name before renaming.
*
* @return a new {@link BiquCrawlSource} to be populated from {@code baishuzhai.crawlsource.*}
*/
@Bean
@Primary // This annotation is required, otherwise an error is reported; the next class does not need it
@ConfigurationProperties(prefix = "baishuzhai.crawlsource") // prefix must be the prefix of the corresponding properties in application.yml
@ConditionalOnProperty(prefix = "crawl.website",name = "type",havingValue = "4")
public BaseHtmlCrawlSource dingdianCrawlSource() {
return new BiquCrawlSource();
}
}

View File

@ -18,7 +18,6 @@ public class CrawlDingdianConfig {
@Bean @Bean
@Primary //必须加此注解,不然报错,下一个类则不需要添加
@ConfigurationProperties(prefix = "dingdian.crawlsource") // prefix值必须是application.yml中对应属性的前缀 @ConfigurationProperties(prefix = "dingdian.crawlsource") // prefix值必须是application.yml中对应属性的前缀
@ConditionalOnProperty(prefix = "crawl.website",name = "type",havingValue = "3") @ConditionalOnProperty(prefix = "crawl.website",name = "type",havingValue = "3")
public BaseHtmlCrawlSource dingdianCrawlSource() { public BaseHtmlCrawlSource dingdianCrawlSource() {

View File

@ -31,15 +31,8 @@ public class BiquCrawlSource extends BaseHtmlCrawlSource {
@Override @Override
public void parse() { public void parse() {
Map<Integer,Date> cat2Date = bookService.queryLastUpdateTime(); for(int page = 1; page<= Constants.UPDATE_PAGES_ONCE; page++) {
Map<Integer,Date> newCat2Date = new HashMap<>(); String catBookListUrl = getListPageUrl().replace("{0}", "0").replace("{1}", page+"");
for(int i=1;i<=7;i++) {
Date lastUpdateTime = cat2Date.get(i);
Date updateTime = lastUpdateTime;
int page = 1;
do{
String catBookListUrl = getListPageUrl().replace("{0}", "0").replace("{1}", page + "");
page++;
String forObject = RestTemplateUtil.getBodyByUtf8(catBookListUrl); String forObject = RestTemplateUtil.getBodyByUtf8(catBookListUrl);
if (forObject != null) { if (forObject != null) {
//解析第一页书籍的数据 //解析第一页书籍的数据
@ -56,13 +49,9 @@ public class BiquCrawlSource extends BaseHtmlCrawlSource {
Matcher bookNameMatch = bookNamePatten.matcher(forObject); Matcher bookNameMatch = bookNamePatten.matcher(forObject);
Pattern authorPatten = compile(getAuthorPattern());
Matcher authorMatch = authorPatten.matcher(forObject);
boolean isBookNameMatch = bookNameMatch.find(); boolean isBookNameMatch = bookNameMatch.find();
while (isFind && scoreFind && isBookNameMatch && authorMatch.find() && updateTime.getTime()>=lastUpdateTime.getTime()) { while (isFind && scoreFind && isBookNameMatch) {
try { try {
Float score = Float.parseFloat(scoreMatch.group(1)); Float score = Float.parseFloat(scoreMatch.group(1));
@ -76,30 +65,7 @@ public class BiquCrawlSource extends BaseHtmlCrawlSource {
String bookName = bookNameMatch.group(1); String bookName = bookNameMatch.group(1);
String author = authorMatch.group(1);
Boolean hasBook = bookService.hasBook(bookName, author);
if (hasBook) {
bookService.addBookParseLog(bookUrl, bookName, score); bookService.addBookParseLog(bookUrl, bookName, score);
}
String body = RestTemplateUtil.getBodyByUtf8(bookUrl);
if (body != null) {
Pattern updateTimePatten = compile(getUpdateTimePattern());
Matcher updateTimeMatch = updateTimePatten.matcher(body);
if (updateTimeMatch.find()) {
String updateTimeStr = updateTimeMatch.group(1);
SimpleDateFormat format = new SimpleDateFormat("yy-MM-dd HH:mm:ss");
updateTime = format.parse(updateTimeStr);
if(!newCat2Date.containsKey(i)) {
newCat2Date.put(i, updateTime);
}
}
}
} catch (Exception e) { } catch (Exception e) {
@ -116,16 +82,13 @@ public class BiquCrawlSource extends BaseHtmlCrawlSource {
} }
} }
}while (updateTime.getTime()>=lastUpdateTime.getTime());
} }
bookService.updateBookUpdateTimeLog(newCat2Date);
} }
@Override @Override
public void update() { public void update() {
List<BookParseLog> logs = bookService.queryBookParseLogs(); List<BookParseLog> logs = bookService.queryBookParseLogs();
List<Long> successLogIds = new ArrayList<>();
for (BookParseLog bookParseLog : logs) { for (BookParseLog bookParseLog : logs) {
try { try {
@ -158,10 +121,16 @@ public class BiquCrawlSource extends BaseHtmlCrawlSource {
Pattern updateTimePatten = compile(getUpdateTimePattern()); Pattern updateTimePatten = compile(getUpdateTimePattern());
Matcher updateTimeMatch = updateTimePatten.matcher(body); Matcher updateTimeMatch = updateTimePatten.matcher(body);
if (updateTimeMatch.find()) { /*if (updateTimeMatch.find()) {
String updateTimeStr = updateTimeMatch.group(1); String updateTimeStr = updateTimeMatch.group(1);
SimpleDateFormat format = new SimpleDateFormat("yy-MM-dd HH:mm:ss"); SimpleDateFormat format ;
Date updateTime = format.parse(updateTimeStr); if(updateTimeStr.length()>10){
format = new SimpleDateFormat("yy-MM-dd HH:mm:ss");
}else{
format = new SimpleDateFormat("yy-MM-dd");
}
Date updateTime = format.parse(updateTimeStr);*/
Pattern picPatten = compile(getPicPattern()); Pattern picPatten = compile(getPicPattern());
Matcher picMather = picPatten.matcher(body); Matcher picMather = picPatten.matcher(body);
if (picMather.find()) { if (picMather.find()) {
@ -178,7 +147,7 @@ public class BiquCrawlSource extends BaseHtmlCrawlSource {
book.setScore(score > 10 ? 8.0f : score); book.setScore(score > 10 ? 8.0f : score);
book.setPicUrl(picSrc); book.setPicUrl(picSrc);
book.setBookStatus(status); book.setBookStatus(status);
book.setUpdateTime(updateTime); book.setUpdateTime(new Date());
List<BookIndex> indexList = new ArrayList<>(); List<BookIndex> indexList = new ArrayList<>();
List<BookContent> contentList = new ArrayList<>(); List<BookContent> contentList = new ArrayList<>();
@ -241,7 +210,7 @@ public class BiquCrawlSource extends BaseHtmlCrawlSource {
} }
} }
successLogIds.add(bookParseLog.getId()); bookService.deleteBookParseLog(bookParseLog.getId());
} }
@ -250,7 +219,7 @@ public class BiquCrawlSource extends BaseHtmlCrawlSource {
} }
} //}
} }
} }
@ -266,7 +235,7 @@ public class BiquCrawlSource extends BaseHtmlCrawlSource {
} }
} }
bookService.deleteBookParseLogs(successLogIds);
} }

View File

@ -66,7 +66,6 @@ public class StartListener implements ServletContextListener {
log.info("updateBooks执行中。。。。。。。。。。。。"); log.info("updateBooks执行中。。。。。。。。。。。。");
crawlSource.update(); crawlSource.update();
Thread.sleep(new Float(1000 * 60 * bookUpdatePeriod).longValue());
} catch (Exception e) { } catch (Exception e) {
log.error(e.getMessage(), e); log.error(e.getMessage(), e);
} }

View File

@ -124,7 +124,6 @@ public class BookService {
bookService.insertIndexListAndContentList(newBookIndexList, newContentList); bookService.insertIndexListAndContentList(newBookIndexList, newContentList);
} }
cacheUtil.del(CacheKeyConstans.NEWST_BOOK_LIST_KEY);
} }
@ -247,7 +246,7 @@ public class BookService {
example.createCriteria().andBookIdEqualTo(bookId).andIndexNumEqualTo(indexNum); example.createCriteria().andBookIdEqualTo(bookId).andIndexNumEqualTo(indexNum);
List<BookContent> bookContents = bookContentMapper.selectByExample(example); List<BookContent> bookContents = bookContentMapper.selectByExample(example);
content = bookContents.size() > 0 ? bookContents.get(0) : null; content = bookContents.size() > 0 ? bookContents.get(0) : null;
cacheUtil.setObject(CacheKeyConstans.BOOK_CONTENT_KEY_PREFIX + "_" + bookId + "_" + indexNum, content, 60 * 60 * 24); cacheUtil.setObject(CacheKeyConstans.BOOK_CONTENT_KEY_PREFIX + "_" + bookId + "_" + indexNum, content, 60 * 10);
} }
return content; return content;
@ -510,4 +509,12 @@ public class BookService {
} }
} }
} }
/**
* Deletes a single book parse log entry once it has been processed successfully.
*
* <p>Delegates directly to {@code bookParseLogMapper.deleteByPrimaryKey(id)}.
*
* @param id primary key of the parse log entry to delete
* */
public void deleteBookParseLog(Long id) {
bookParseLogMapper.deleteByPrimaryKey(id);
}
} }

View File

@ -54,6 +54,23 @@ dingdian:
catalog-url-pattern: <a\s+href="(/ddk\d+/all.html)">查看完整目录</a> catalog-url-pattern: <a\s+href="(/ddk\d+/all.html)">查看完整目录</a>
catalog-pattern: <a\s+style=""\s+href="(/ddk\d+/\d+\.html)">([^/]+)</a> catalog-pattern: <a\s+style=""\s+href="(/ddk\d+/\d+\.html)">([^/]+)</a>
# Crawl-source settings for the Baishuzhai site (crawl.website.type = 4).
# Bound to the crawl source bean through the "baishuzhai.crawlsource" prefix.
# list-page-url carries the placeholders {0} and {1}; the crawler substitutes
# "0" for {0} and the page number for {1} when it builds the list-page URL.
# The *-pattern entries are regular expressions applied to the fetched HTML.
baishuzhai:
crawlsource:
index-url: https://m.baishuzhai.com
list-page-url: https://m.baishuzhai.com/sort/{0}/{1}.html
book-url-pattern: href="/(ibook/\d+/\d+)/"
score-pattern: <div\s+class="score">(\d+\.\d+)分</div>
book-name-pattern: <p class="title">([^/]+)</p>
author-pattern: 作者:([^/]+)<
status-pattern: 状态:([^/]+)</li>
cat-pattern: 类别:([^/]+)</li>
update-time-pattern: 更新:(\d+-\d+-\d+)</li>
pic-pattern: <img src="([^>]+)"\s+onerror="this.src=
intro-pattern: class="review">([^/]+)</p>
catalog-url-pattern: <a\s+href="(/ibook/\d+/\d+/all\.html)">查看完整目录</a>
catalog-pattern: <a\s+style=""\s+href="(/ibook/\d+/\d+/\d+\.html)">([^/]+)</a>
biquge: biquge:
crawlsource: crawlsource:
index-url: http://m.biquge.info index-url: http://m.biquge.info

View File

@ -45,6 +45,19 @@ spring:
resources:
cache:
# Static resource cache time, in seconds (604800 s = 7 days)
period: 604800
# Enable gzip compression
chain:
gzipped: true
# Enable caching
cache: true
# mvc: # mvc:
# static-path-pattern: /static/** #设定静态文件路径js,css等 # static-path-pattern: /static/** #设定静态文件路径js,css等
mybatis: mybatis:
@ -71,10 +84,10 @@ books:
#小说的更新间隔(分) #小说的更新间隔(分)
updatePeriod: 1 updatePeriod: 1
#爬取的网站名称类型 1笔趣岛 2笔趣塔,3:顶点小说 更多网站解析中,敬请期待 #爬取的网站名称类型 1笔趣岛 2笔趣塔,3:顶点小说 4百书斋 更多网站解析中,敬请期待
crawl: crawl:
website: website:
type: 2 type: 4

View File

@ -19,17 +19,6 @@
<div th:include="common/css :: css"></div> <div th:include="common/css :: css"></div>
<script>
var _hmt = _hmt || [];
(function () {
var hm = document.createElement("script");
hm.src = "https://hm.baidu.com/hm.js?2cf01edbc2b27cd3a143e17948167d77";
var s = document.getElementsByTagName("script")[0];
s.parentNode.insertBefore(hm, s);
})();
</script>
<script type="text/javascript"> <script type="text/javascript">
function reinitIframe(){ function reinitIframe(){
@ -477,16 +466,6 @@
</script> </script>
<script> <script>
(function () { (function () {
var bp = document.createElement('script');
var curProtocol = window.location.protocol.split(':')[0];
if (curProtocol === 'https') {
bp.src = 'https://zz.bdstatic.com/linksubmit/push.js';
}
else {
bp.src = 'http://push.zhanzhang.baidu.com/push.js';
}
var s = document.getElementsByTagName("script")[0];
s.parentNode.insertBefore(bp, s);
$("#content").css("min-height",($(window).height()-60)+"px"); $("#content").css("min-height",($(window).height()-60)+"px");

View File

@ -115,30 +115,6 @@
</style> </style>
<script>
var _hmt = _hmt || [];
(function() {
var hm = document.createElement("script");
hm.src = "https://hm.baidu.com/hm.js?2cf01edbc2b27cd3a143e17948167d77";
var s = document.getElementsByTagName("script")[0];
s.parentNode.insertBefore(hm, s);
})();
</script>
<script>
(function(){
var bp = document.createElement('script');
var curProtocol = window.location.protocol.split(':')[0];
if (curProtocol === 'https') {
bp.src = 'https://zz.bdstatic.com/linksubmit/push.js';
}
else {
bp.src = 'http://push.zhanzhang.baidu.com/push.js';
}
var s = document.getElementsByTagName("script")[0];
s.parentNode.insertBefore(bp, s);
})();
</script>
<script src="/js/wap_collect.js"></script> <script src="/js/wap_collect.js"></script>
</head> </head>

View File

@ -39,29 +39,6 @@
</div> </div>
<script>
var _hmt = _hmt || [];
(function() {
var hm = document.createElement("script");
hm.src = "https://hm.baidu.com/hm.js?2cf01edbc2b27cd3a143e17948167d77";
var s = document.getElementsByTagName("script")[0];
s.parentNode.insertBefore(hm, s);
})();
</script>
<script>
(function(){
var bp = document.createElement('script');
var curProtocol = window.location.protocol.split(':')[0];
if (curProtocol === 'https') {
bp.src = 'https://zz.bdstatic.com/linksubmit/push.js';
}
else {
bp.src = 'http://push.zhanzhang.baidu.com/push.js';
}
var s = document.getElementsByTagName("script")[0];
s.parentNode.insertBefore(bp, s);
})();
</script>
</head> </head>
<body> <body>

View File

@ -15,15 +15,6 @@
<div th:include="common/css :: css"></div> <div th:include="common/css :: css"></div>
</div> </div>
<script>
var _hmt = _hmt || [];
(function () {
var hm = document.createElement("script");
hm.src = "https://hm.baidu.com/hm.js?2cf01edbc2b27cd3a143e17948167d77";
var s = document.getElementsByTagName("script")[0];
s.parentNode.insertBefore(hm, s);
})();
</script>
<style type="text/css"> <style type="text/css">
@ -236,18 +227,6 @@
</script> </script>
<script> <script>
(function(){
var bp = document.createElement('script');
var curProtocol = window.location.protocol.split(':')[0];
if (curProtocol === 'https') {
bp.src = 'https://zz.bdstatic.com/linksubmit/push.js';
}
else {
bp.src = 'http://push.zhanzhang.baidu.com/push.js';
}
var s = document.getElementsByTagName("script")[0];
s.parentNode.insertBefore(bp, s);
})();
function toMyCollect(){ function toMyCollect(){

View File

@ -46,15 +46,6 @@
</style> </style>
<script>
var _hmt = _hmt || [];
(function () {
var hm = document.createElement("script");
hm.src = "https://hm.baidu.com/hm.js?2cf01edbc2b27cd3a143e17948167d77";
var s = document.getElementsByTagName("script")[0];
s.parentNode.insertBefore(hm, s);
})();
</script>
</head> </head>
<body> <body>
@ -247,17 +238,6 @@
<script src="/js/wap_collect.js"></script> <script src="/js/wap_collect.js"></script>
<script> <script>
lazyload(); lazyload();
(function () {
var bp = document.createElement('script');
var curProtocol = window.location.protocol.split(':')[0];
if (curProtocol === 'https') {
bp.src = 'https://zz.bdstatic.com/linksubmit/push.js';
} else {
bp.src = 'http://push.zhanzhang.baidu.com/push.js';
}
var s = document.getElementsByTagName("script")[0];
s.parentNode.insertBefore(bp, s);
})();
</script> </script>

View File

@ -46,15 +46,6 @@
</style> </style>
<script>
var _hmt = _hmt || [];
(function () {
var hm = document.createElement("script");
hm.src = "https://hm.baidu.com/hm.js?2cf01edbc2b27cd3a143e17948167d77";
var s = document.getElementsByTagName("script")[0];
s.parentNode.insertBefore(hm, s);
})();
</script>
</head> </head>
<body> <body>
@ -293,17 +284,6 @@
<script src="/js/wap_collect.js"></script> <script src="/js/wap_collect.js"></script>
<script> <script>
lazyload(); lazyload();
(function () {
var bp = document.createElement('script');
var curProtocol = window.location.protocol.split(':')[0];
if (curProtocol === 'https') {
bp.src = 'https://zz.bdstatic.com/linksubmit/push.js';
} else {
bp.src = 'http://push.zhanzhang.baidu.com/push.js';
}
var s = document.getElementsByTagName("script")[0];
s.parentNode.insertBefore(bp, s);
})();
</script> </script>

View File

@ -15,15 +15,6 @@
<div th:include="common/css :: css"></div> <div th:include="common/css :: css"></div>
</div> </div>
<script>
var _hmt = _hmt || [];
(function () {
var hm = document.createElement("script");
hm.src = "https://hm.baidu.com/hm.js?2cf01edbc2b27cd3a143e17948167d77";
var s = document.getElementsByTagName("script")[0];
s.parentNode.insertBefore(hm, s);
})();
</script>
<style type="text/css"> <style type="text/css">
@ -247,16 +238,6 @@
</script> </script>
<script> <script>
(function(){ (function(){
var bp = document.createElement('script');
var curProtocol = window.location.protocol.split(':')[0];
if (curProtocol === 'https') {
bp.src = 'https://zz.bdstatic.com/linksubmit/push.js';
}
else {
bp.src = 'http://push.zhanzhang.baidu.com/push.js';
}
var s = document.getElementsByTagName("script")[0];
s.parentNode.insertBefore(bp, s);
var sortCat = $("#softCat").val(); var sortCat = $("#softCat").val();

View File

@ -15,15 +15,6 @@
<div th:include="common/css :: css"></div> <div th:include="common/css :: css"></div>
</div> </div>
<script>
var _hmt = _hmt || [];
(function () {
var hm = document.createElement("script");
hm.src = "https://hm.baidu.com/hm.js?2cf01edbc2b27cd3a143e17948167d77";
var s = document.getElementsByTagName("script")[0];
s.parentNode.insertBefore(hm, s);
})();
</script>
<style type="text/css"> <style type="text/css">
@ -272,16 +263,6 @@
</script> </script>
<script> <script>
(function () { (function () {
var bp = document.createElement('script');
var curProtocol = window.location.protocol.split(':')[0];
if (curProtocol === 'https') {
bp.src = 'https://zz.bdstatic.com/linksubmit/push.js';
}
else {
bp.src = 'http://push.zhanzhang.baidu.com/push.js';
}
var s = document.getElementsByTagName("script")[0];
s.parentNode.insertBefore(bp, s);
var sortCat = $("#softCat").val(); var sortCat = $("#softCat").val();

View File

@ -50,29 +50,6 @@
</style> </style>
<script language="javascript" type="text/javascript" src="/js/wap_collect.js"></script> <script language="javascript" type="text/javascript" src="/js/wap_collect.js"></script>
<script>
var _hmt = _hmt || [];
(function () {
var hm = document.createElement("script");
hm.src = "https://hm.baidu.com/hm.js?b3a84b2ec6cc52dd088d735565b49644";
var s = document.getElementsByTagName("script")[0];
s.parentNode.insertBefore(hm, s);
})();
</script>
<script>
(function () {
var bp = document.createElement('script');
var curProtocol = window.location.protocol.split(':')[0];
if (curProtocol === 'https') {
bp.src = 'https://zz.bdstatic.com/linksubmit/push.js';
}
else {
bp.src = 'http://push.zhanzhang.baidu.com/push.js';
}
var s = document.getElementsByTagName("script")[0];
s.parentNode.insertBefore(bp, s);
})();
</script>
<style id="tsbrowser_video_independent_player_style" type="text/css"> <style id="tsbrowser_video_independent_player_style" type="text/css">
[tsbrowser_force_max_size] { [tsbrowser_force_max_size] {
width: 100% !important; width: 100% !important;

View File

@ -10,17 +10,6 @@
<div th:include="common/css :: css"></div> <div th:include="common/css :: css"></div>
<script>
var _hmt = _hmt || [];
(function() {
var hm = document.createElement("script");
hm.src = "https://hm.baidu.com/hm.js?2cf01edbc2b27cd3a143e17948167d77";
var s = document.getElementsByTagName("script")[0];
s.parentNode.insertBefore(hm, s);
})();
</script>
</head> </head>
<body id="read" > <body id="read" >
@ -107,18 +96,6 @@
}); });
</script> </script>
<script> <script>
(function(){
var bp = document.createElement('script');
var curProtocol = window.location.protocol.split(':')[0];
if (curProtocol === 'https') {
bp.src = 'https://zz.bdstatic.com/linksubmit/push.js';
}
else {
bp.src = 'http://push.zhanzhang.baidu.com/push.js';
}
var s = document.getElementsByTagName("script")[0];
s.parentNode.insertBefore(bp, s);
})();

View File

@ -1,14 +1,15 @@
server: {port: 8083} server: {port: 8083}
spring: spring:
datasource: {url: 'jdbc:mysql://47.106.243.172:3306/novel?useUnicode=true&characterEncoding=utf-8&useSSL=false&allowPublicKeyRetrieval=true&serverTimezone=Asia/Shanghai', datasource: {url: 'jdbc:mysql://127.0.0.1:3306/books?useUnicode=true&characterEncoding=utf-8&useSSL=false&allowPublicKeyRetrieval=true&serverTimezone=Asia/Shanghai',
username: novel, password: novel!8888} username: root, password: test123456}
mybatis: mybatis:
mapper-locations: classpath:mybatis/mapping/*.xml mapper-locations: classpath:mybatis/mapping/*.xml
type-aliases-package: xyz.zinglizingli.books.po type-aliases-package: xyz.zinglizingli.books.po
configuration: {log-impl: org.apache.ibatis.logging.stdout.StdOutImpl} configuration: {log-impl: org.apache.ibatis.logging.stdout.StdOutImpl}
mysql: {charset: utf8mb4} mysql: {charset: utf8mb4}
books: {lowestScore: '9.0'} books: {lowestScore: 9.0}
crawl: crawl:
website: {type: '2'} website: {type: '4'}
soft-novel: '0' soft-novel: '0'
manhua: '0' manhua: '0'
logging: {config: 'classpath:logback-boot.xml'}