mirror of
https://github.com/201206030/novel.git
synced 2025-04-27 07:30:50 +00:00
更新爬虫可插拔
This commit is contained in:
parent
82618f354e
commit
58eb59735e
3
.gitignore
vendored
3
.gitignore
vendored
@ -1,2 +1,5 @@
|
||||
/novel-front/novel-front.iml
|
||||
/.idea
|
||||
/novel-front/src/main/java/xyz/zinglizingli/books/core/crawl/BiqugeCrawlSource.java
|
||||
/novel-front/src/main/java/xyz/zinglizingli/books/core/config/CrawlBiqugeConfig.java
|
||||
/novel-front/src/main/java/xyz/zinglizingli/books/core/schedule/CrawlBookSchedule.java
|
||||
|
@ -2,10 +2,12 @@ package xyz.zinglizingli.books.core.listener;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.springframework.beans.factory.annotation.Value;
|
||||
import org.springframework.context.ApplicationListener;
|
||||
import org.springframework.context.event.ContextRefreshedEvent;
|
||||
import org.springframework.stereotype.Component;
|
||||
import xyz.zinglizingli.books.core.crawl.BaseCrawlSource;
|
||||
import xyz.zinglizingli.books.core.utils.Constants;
|
||||
|
||||
/**
|
||||
* @author 11797
|
||||
@ -17,22 +19,28 @@ public class StartListener implements ApplicationListener<ContextRefreshedEvent>
|
||||
|
||||
private final BaseCrawlSource crawlSource;
|
||||
|
||||
@Value("${crawl.book.new.enabled}")
|
||||
private String crawlEnable;
|
||||
|
||||
@Override
|
||||
public void onApplicationEvent(ContextRefreshedEvent event) {
|
||||
log.info("程序启动");
|
||||
new Thread(()->{
|
||||
while (true) {
|
||||
try {
|
||||
log.info("crawlBooks执行中。。。。。。。。。。。。");
|
||||
crawlSource.parse();
|
||||
Thread.sleep(1000 * 60 * 5);
|
||||
} catch (Exception e) {
|
||||
log.error(e.getMessage(), e);
|
||||
}
|
||||
if (!Constants.ENABLE_NEW_BOOK.equals(crawlEnable.trim())) {
|
||||
log.info("程序启动");
|
||||
new Thread(() -> {
|
||||
while (true) {
|
||||
try {
|
||||
|
||||
}
|
||||
}).start();
|
||||
log.info("crawlBooks执行中。。。。。。。。。。。。");
|
||||
crawlSource.parse();
|
||||
|
||||
Thread.sleep(1000 * 60 * 5);
|
||||
} catch (Exception e) {
|
||||
log.error(e.getMessage(), e);
|
||||
}
|
||||
|
||||
}
|
||||
}).start();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
@ -95,4 +95,9 @@ public class Constants {
|
||||
* 书籍内容页的广告pattern
|
||||
* */
|
||||
public static final String CONTENT_AD_PATTERN = "<div[^>]+app\\.html[^>]+>\\s*<div[^>]+>\\s*<div[^>]+>[^<]+</div>\\s*<div[^>]+>[^<]+<span[^>]+>>>[^<]+<<</span>\\s*</div>\\s*</div>\\s*</div>";
|
||||
|
||||
/**
|
||||
* 是否开启抓取新书
|
||||
* */
|
||||
public static final String ENABLE_NEW_BOOK = "true";
|
||||
}
|
||||
|
@ -1,7 +1,15 @@
|
||||
#是否抓取新书,true:抓
|
||||
crawl:
|
||||
book:
|
||||
new:
|
||||
enabled: false
|
||||
#抓取频率
|
||||
period: 2000
|
||||
|
||||
#爬取的网站名称类型 1:笔趣岛 ,2:笔趣塔, 3:顶点 更多网站解析中,敬请期待
|
||||
biquta:
|
||||
crawlsource:
|
||||
enabled: true #是否开启此爬虫源
|
||||
enabled: false #是否开启此爬虫源
|
||||
index-url: https://m.biquta.la
|
||||
list-page-url: https://m.biquta.la/class/{0}/{1}.html
|
||||
book-url-pattern: href="/(\d+_\d+)/"
|
||||
@ -17,7 +25,7 @@ biquta:
|
||||
catalog-pattern: <a\s+style=""\s+href="(/\d+_\d+/\d+\.html)">([^/]+)</a>
|
||||
biqudao:
|
||||
crawlsource:
|
||||
enabled: true #是否开启此爬虫源
|
||||
enabled: false #是否开启此爬虫源
|
||||
index-url: https://m.biqudao.com
|
||||
list-page-url: https://m.biqudao.com/bqgeclass/{0}/{1}.html
|
||||
book-url-pattern: href="/(bqge\d+)/"
|
||||
@ -34,7 +42,7 @@ biqudao:
|
||||
|
||||
dingdian:
|
||||
crawlsource:
|
||||
enabled: true #是否开启此爬虫源
|
||||
enabled: false #是否开启此爬虫源
|
||||
index-url: https://wap.dingdiann.com
|
||||
list-page-url: https://wap.dingdiann.com/sort/{0}/{1}.html
|
||||
book-url-pattern: href="/(ddk\d+)/"
|
||||
@ -47,4 +55,21 @@ dingdian:
|
||||
pic-pattern: <img src="([^>]+)"\s+onerror="this.src=
|
||||
intro-pattern: class="review">([^/]+)</p>
|
||||
catalog-url-pattern: <a\s+href="(/ddk\d+/all.html)">查看完整目录</a>
|
||||
catalog-pattern: <a\s+style=""\s+href="(/ddk\d+/\d+\.html)">([^/]+)</a>
|
||||
catalog-pattern: <a\s+style=""\s+href="(/ddk\d+/\d+\.html)">([^/]+)</a>
|
||||
|
||||
biquge:
|
||||
crawlsource:
|
||||
enabled: true #是否开启此爬虫源
|
||||
index-url: http://m.biquge.info
|
||||
list-page-url: http://m.biquge.info/paihangbang_lastupdate/{0}.html
|
||||
book-url-pattern: href="/(\d+_\d+)/"
|
||||
score-pattern: <i>(\d+)</i>
|
||||
book-name-pattern: <a\s+href="/(\d+_\d+)/">([^<]+)</a>
|
||||
author-pattern: 作者:([^<]+)<
|
||||
status-pattern: <p>状态:([^<]+)</p>
|
||||
cat-pattern: <a\s+href="/list/\d+_\d+\.html">([^<]+)</a>
|
||||
update-time-pattern: <p>更新:(\d+-\d+-\d+T\d+:\d+:\d+)</p>
|
||||
pic-pattern: <div\s+class="block_img2">\s*<img\s+src="([^"]+)"
|
||||
intro-pattern: class="review">([^/]+)</p>
|
||||
catalog-url-pattern: <a\s+href="(/ddk\d+/all.html)">查看完整目录</a>
|
||||
catalog-pattern: <dd>\s*<a\s+href="(\d+\.html)"\s+title="([^"]+)">([^<]+)</a>\s*</dd>
|
@ -3,9 +3,9 @@ server:
|
||||
|
||||
spring:
|
||||
datasource:
|
||||
url: jdbc:mysql://127.0.0.1:3306/books?useUnicode=true&characterEncoding=utf-8&useSSL=false&allowPublicKeyRetrieval=true&serverTimezone=Asia/Shanghai
|
||||
username: root
|
||||
password: test123456
|
||||
url: jdbc:mysql://35.236.132.9:3306/books?useUnicode=true&characterEncoding=utf-8&useSSL=false&allowPublicKeyRetrieval=true&serverTimezone=Asia/Shanghai
|
||||
username: books
|
||||
password: 123
|
||||
# url: jdbc:mysql://127.0.0.1:3306/books?useUnicode=true&characterEncoding=utf8&useSSL=false&allowPublicKeyRetrieval=true&serverTimezone=Asia/Shanghai
|
||||
# username: root
|
||||
# password: test123456
|
||||
|
Loading…
x
Reference in New Issue
Block a user