更新爬虫可插拔

This commit is contained in:
xiongxiaoyang 2019-12-23 13:48:46 +08:00
parent 82618f354e
commit 58eb59735e
5 changed files with 61 additions and 20 deletions

3
.gitignore vendored
View File

@ -1,2 +1,5 @@
/novel-front/novel-front.iml
/.idea
/novel-front/src/main/java/xyz/zinglizingli/books/core/crawl/BiqugeCrawlSource.java
/novel-front/src/main/java/xyz/zinglizingli/books/core/config/CrawlBiqugeConfig.java
/novel-front/src/main/java/xyz/zinglizingli/books/core/schedule/CrawlBookSchedule.java

View File

@ -2,10 +2,12 @@ package xyz.zinglizingli.books.core.listener;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.context.ApplicationListener;
import org.springframework.context.event.ContextRefreshedEvent;
import org.springframework.stereotype.Component;
import xyz.zinglizingli.books.core.crawl.BaseCrawlSource;
import xyz.zinglizingli.books.core.utils.Constants;
/**
* @author 11797
@ -17,22 +19,28 @@ public class StartListener implements ApplicationListener<ContextRefreshedEvent>
private final BaseCrawlSource crawlSource;
@Value("${crawl.book.new.enabled}")
private String crawlEnable;
@Override
public void onApplicationEvent(ContextRefreshedEvent event) {
log.info("程序启动");
new Thread(()->{
while (true) {
try {
log.info("crawlBooks执行中。。。。。。。。。。。。");
crawlSource.parse();
Thread.sleep(1000 * 60 * 5);
} catch (Exception e) {
log.error(e.getMessage(), e);
}
if (!Constants.ENABLE_NEW_BOOK.equals(crawlEnable.trim())) {
log.info("程序启动");
new Thread(() -> {
while (true) {
try {
}
}).start();
log.info("crawlBooks执行中。。。。。。。。。。。。");
crawlSource.parse();
Thread.sleep(1000 * 60 * 5);
} catch (Exception e) {
log.error(e.getMessage(), e);
}
}
}).start();
}
}
}

View File

@ -95,4 +95,9 @@ public class Constants {
* 书籍内容页的广告pattern
* */
public static final String CONTENT_AD_PATTERN = "<div[^>]+app\\.html[^>]+>\\s*<div[^>]+>\\s*<div[^>]+>[^<]+</div>\\s*<div[^>]+>[^<]+<span[^>]+>>>[^<]+<<</span>\\s*</div>\\s*</div>\\s*</div>";
/**
* 是否开启抓取新书
* */
public static final String ENABLE_NEW_BOOK = "true";
}

View File

@ -1,7 +1,15 @@
#是否抓取新书(true:开启,false:关闭)
crawl:
book:
new:
enabled: false
#抓取频率
period: 2000
#爬取的网站名称类型 1:笔趣岛,2:笔趣塔,3:顶点;更多网站解析中,敬请期待
biquta:
crawlsource:
enabled: true #是否开启此爬虫源
enabled: false #是否开启此爬虫源
index-url: https://m.biquta.la
list-page-url: https://m.biquta.la/class/{0}/{1}.html
book-url-pattern: href="/(\d+_\d+)/"
@ -17,7 +25,7 @@ biquta:
catalog-pattern: <a\s+style=""\s+href="(/\d+_\d+/\d+\.html)">([^/]+)</a>
biqudao:
crawlsource:
enabled: true #是否开启此爬虫源
enabled: false #是否开启此爬虫源
index-url: https://m.biqudao.com
list-page-url: https://m.biqudao.com/bqgeclass/{0}/{1}.html
book-url-pattern: href="/(bqge\d+)/"
@ -34,7 +42,7 @@ biqudao:
dingdian:
crawlsource:
enabled: true #是否开启此爬虫源
enabled: false #是否开启此爬虫源
index-url: https://wap.dingdiann.com
list-page-url: https://wap.dingdiann.com/sort/{0}/{1}.html
book-url-pattern: href="/(ddk\d+)/"
@ -47,4 +55,21 @@ dingdian:
pic-pattern: <img src="([^>]+)"\s+onerror="this.src=
intro-pattern: class="review">([^/]+)</p>
catalog-url-pattern: <a\s+href="(/ddk\d+/all.html)">查看完整目录</a>
catalog-pattern: <a\s+style=""\s+href="(/ddk\d+/\d+\.html)">([^/]+)</a>
catalog-pattern: <a\s+style=""\s+href="(/ddk\d+/\d+\.html)">([^/]+)</a>
biquge:
crawlsource:
enabled: true #是否开启此爬虫源
index-url: http://m.biquge.info
list-page-url: http://m.biquge.info/paihangbang_lastupdate/{0}.html
book-url-pattern: href="/(\d+_\d+)/"
score-pattern: <i>(\d+)</i>
book-name-pattern: <a\s+href="/(\d+_\d+)/">([^<]+)</a>
author-pattern: 作者:([^<]+)<
status-pattern: <p>状态:([^<]+)</p>
cat-pattern: <a\s+href="/list/\d+_\d+\.html">([^<]+)</a>
update-time-pattern: <p>更新:(\d+-\d+-\d+T\d+:\d+:\d+)</p>
pic-pattern: <div\s+class="block_img2">\s*<img\s+src="([^"]+)"
intro-pattern: class="review">([^/]+)</p>
catalog-url-pattern: <a\s+href="(/ddk\d+/all.html)">查看完整目录</a>
catalog-pattern: <dd>\s*<a\s+href="(\d+\.html)"\s+title="([^"]+)">([^<]+)</a>\s*</dd>

View File

@ -3,9 +3,9 @@ server:
spring:
datasource:
url: jdbc:mysql://127.0.0.1:3306/books?useUnicode=true&characterEncoding=utf-8&useSSL=false&allowPublicKeyRetrieval=true&serverTimezone=Asia/Shanghai
username: root
password: test123456
url: jdbc:mysql://35.236.132.9:3306/books?useUnicode=true&characterEncoding=utf-8&useSSL=false&allowPublicKeyRetrieval=true&serverTimezone=Asia/Shanghai
username: books
password: 123
# url: jdbc:mysql://127.0.0.1:3306/books?useUnicode=true&characterEncoding=utf8&useSSL=false&allowPublicKeyRetrieval=true&serverTimezone=Asia/Shanghai
# username: root
# password: test123456