diff --git a/novel-admin/src/main/java/com/java2nb/books/config/CrawlConfig.java b/novel-admin/src/main/java/com/java2nb/books/config/CrawlConfig.java new file mode 100644 index 0000000..f056ad4 --- /dev/null +++ b/novel-admin/src/main/java/com/java2nb/books/config/CrawlConfig.java @@ -0,0 +1,19 @@ +package com.java2nb.books.config; + +import lombok.Data; +import org.springframework.boot.context.properties.ConfigurationProperties; +import org.springframework.stereotype.Component; + +import java.util.Date; + +@Data +@Component +@ConfigurationProperties(prefix="crawl") +public class CrawlConfig { + + private Integer threadCount; + private Integer priority; + private Float lowestScore; + private String minUptTime; + private Integer maxNumber; +} diff --git a/novel-admin/src/main/java/com/java2nb/books/controller/BookCrawlController.java b/novel-admin/src/main/java/com/java2nb/books/controller/BookCrawlController.java index fb73bf5..04e3c89 100644 --- a/novel-admin/src/main/java/com/java2nb/books/controller/BookCrawlController.java +++ b/novel-admin/src/main/java/com/java2nb/books/controller/BookCrawlController.java @@ -1,8 +1,13 @@ package com.java2nb.books.controller; -import java.util.List; -import java.util.Map; +import java.util.*; +import com.java2nb.books.config.CrawlConfig; +import com.java2nb.common.utils.GenUtils; +import lombok.extern.slf4j.Slf4j; +import org.apache.commons.configuration.Configuration; +import org.apache.commons.configuration.ConfigurationException; +import org.apache.commons.configuration.PropertiesConfiguration; import org.apache.shiro.authz.annotation.RequiresPermissions; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.stereotype.Controller; @@ -30,12 +35,16 @@ import com.java2nb.common.utils.R; * @date 2019-11-15 03:42:54 */ +@Slf4j @Controller @RequestMapping("/books/bookCrawl") public class BookCrawlController { @Autowired private BookCrawlService bookCrawlService; + @Autowired + private CrawlConfig crawlConfig; + @GetMapping() @RequiresPermissions("books:bookCrawl:bookCrawl") String BookCrawl() { @@ -63,11 +72,9 @@ public class BookCrawlController { } @ApiOperation(value = "修改页面", notes = "修改页面") - @GetMapping("/edit/{id}") - @RequiresPermissions("books:bookCrawl:edit") - String edit(@PathVariable("id") Long id, Model model) { - BookCrawlDO bookCrawl = bookCrawlService.get(id); - model.addAttribute("bookCrawl", bookCrawl); + @GetMapping("/edit") + String edit( Model model) throws Exception { + model.addAttribute("property", crawlConfig); return "books/bookCrawl/edit"; } @@ -100,9 +107,8 @@ public class BookCrawlController { @ApiOperation(value = "修改", notes = "修改") @ResponseBody @RequestMapping("/update") - @RequiresPermissions("books:bookCrawl:edit") - public R update( BookCrawlDO bookCrawl) { - bookCrawlService.update(bookCrawl); + public R update(CrawlConfig config) { + crawlConfig = config; return R.ok(); } diff --git a/novel-admin/src/main/java/com/java2nb/books/service/impl/BookCrawlServiceImpl.java b/novel-admin/src/main/java/com/java2nb/books/service/impl/BookCrawlServiceImpl.java index bc6ad05..793dc84 100644 --- a/novel-admin/src/main/java/com/java2nb/books/service/impl/BookCrawlServiceImpl.java +++ b/novel-admin/src/main/java/com/java2nb/books/service/impl/BookCrawlServiceImpl.java @@ -1,5 +1,6 @@ package com.java2nb.books.service.impl; +import com.java2nb.books.config.CrawlConfig; import com.java2nb.books.dao.BookContentDao; import com.java2nb.books.dao.BookDao; import com.java2nb.books.dao.BookIndexDao; @@ -7,6 +8,7 @@ import com.java2nb.books.domain.BookContentDO; import com.java2nb.books.domain.BookDO; import com.java2nb.books.domain.BookIndexDO; import com.java2nb.books.util.RestTemplateUtil; +import com.java2nb.common.utils.DateUtils; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.http.HttpStatus; import org.springframework.http.ResponseEntity; @@ -26,6 +28,9 @@ import org.springframework.web.client.RestTemplate; @Service public class BookCrawlServiceImpl implements BookCrawlService { + @Autowired + private CrawlConfig crawlConfig; + private boolean isInteruptBiquDaoCrawl;//是否中断笔趣岛爬虫程序 private boolean isInteruptBiquTaCrawl;//是否中断笔趣塔爬虫程序 @@ -103,59 +108,66 @@ public class BookCrawlServiceImpl implements BookCrawlService { private void crawlBook(BookCrawlDO bookCrawl) { - for (int i = 1; i <= 7; i++) { - - int finalI = i; + int threadCount = crawlConfig.getThreadCount(); + int step = 7 / threadCount; + int pos = step; + int i = 1; + while (i <= 7) { + final int fPos = pos; + final int fI = i; + i = pos + 1; new Thread( () -> { + int j = fI; + for (; j <= fPos; j++) { + try { - try { - - switch (bookCrawl.getCrawlWebCode()) { - case 1: { - while (true) { - if (isInteruptBiquDaoCrawl) { - return; + switch (bookCrawl.getCrawlWebCode()) { + case 1: { + while (true) { + if (isInteruptBiquDaoCrawl) { + return; + } + crawBiqudaoBooks(j); + Thread.sleep(1000 * 60 * 60 * 24); } - crawBiqudaoBooks(finalI); - Thread.sleep(1000 * 60 * 60 * 24); } - } - case 2: { - while (true) { - if (isInteruptBiquTaCrawl) { - return; + case 2: { + while (true) { + if (isInteruptBiquTaCrawl) { + return; + } + crawBiquTaBooks(j); + Thread.sleep(1000 * 60 * 60 * 24); } - crawBiquTaBooks(finalI); - Thread.sleep(1000 * 60 * 60 * 24); } - } + } + } catch (Exception e) { + e.printStackTrace(); } - } catch (Exception e) { - e.printStackTrace(); - bookCrawl.setStatus(0); - bookCrawlDao.update(bookCrawl); } } ).start(); + pos += step; + if (7 - pos < step) { + pos = 7; + } } + new Thread(() -> { + for (int j = 21; j <= 29; j++) { - for (int j = 21; j <= 29; j++) { - int finalJ = j; - new Thread(() -> { - - for (int i = 1; i <= 499; i++) { - if(isInteruptBiquTaCrawl || isInteruptBiquDaoCrawl){ + for (int k = 1; k <= 499; k++) { + if (isInteruptBiquTaCrawl || isInteruptBiquDaoCrawl) { return; } - System.out.println("==============分类============:" + finalJ); - System.out.println("==============页码============:" + i); - int catId = finalJ; - int page = i; + System.out.println("==============分类============:" + j); + System.out.println("==============页码============:" + k); + int catId = j; + int page = k; String bookListUrl = "http://book.sfacg.com/List/default.aspx?&tid=" + catId + "&if=1&PageIndex=" + page; @@ -168,7 +180,7 @@ public class BookCrawlServiceImpl implements BookCrawlService { while (isFindBook) { try { - if(isInteruptBiquTaCrawl || isInteruptBiquDaoCrawl){ + if (isInteruptBiquTaCrawl || isInteruptBiquDaoCrawl) { return; } long bookNum = Long.parseLong(bookMatcher.group(1)); @@ -320,7 +332,7 @@ public class BookCrawlServiceImpl implements BookCrawlService { List hasIndexNum = queryIndexCountByBookNameAndBAuthor(bookName, author); while (isFindIndex) { - if(isInteruptBiquTaCrawl || isInteruptBiquDaoCrawl){ + if (isInteruptBiquTaCrawl || isInteruptBiquDaoCrawl) { return; } if (!hasIndexNum.contains(indexNum)) { @@ -378,16 +390,18 @@ public class BookCrawlServiceImpl implements BookCrawlService { } } - }).start(); - - } + } + }).start(); } private void crawBiquTaBooks(int i) { String baseUrl = "https://m.biquta.com"; String catBookListUrlBase = baseUrl + "/class/"; + if (crawlConfig.getPriority() == 1) { + catBookListUrlBase = baseUrl + "/lhb/"; + } //拼接分类URL int page = 1;//起始页码 int totalPage = page; @@ -457,10 +471,9 @@ public class BookCrawlServiceImpl implements BookCrawlService { try { Float score = Float.parseFloat(scoreMatch.group(1)); - /*if (score < lowestScore) {//数据库空间有限,暂时爬取8.0分以上的小说 - // Thread.sleep(1000 * 60 * 60 * 24);//因为爬的是龙虎榜,所以遇到第一个8分以下的,之后的都是8分以下的 - continue; - }*/ + if (score < crawlConfig.getLowestScore()) {//数据库空间有限,暂时爬取8.0分以上的小说 + continue; + } String bookName = bookNameMatch.group(1); String author = authoreMatch.group(1); @@ -487,6 +500,9 @@ public class BookCrawlServiceImpl implements BookCrawlService { String updateTimeStr = updateTimeMatch.group(1); SimpleDateFormat format = new SimpleDateFormat("yy-MM-dd HH:mm:ss"); Date updateTime = format.parse(updateTimeStr); + if (updateTime.getTime() < new SimpleDateFormat("yyyy-MM-dd").parse(crawlConfig.getMinUptTime()).getTime()) { + continue; + } Pattern picPatten = Pattern.compile("]+)\"\\s+onerror=\"this.src="); Matcher picMather = picPatten.matcher(body); if (picMather.find()) { @@ -605,7 +621,11 @@ public class BookCrawlServiceImpl implements BookCrawlService { private void crawBiqudaoBooks(final int i) { String baseUrl = "https://m.biqudao.com"; - String catBookListUrlBase = baseUrl + "/bqgelhb/"; + String catBookListUrlBase = baseUrl + "/bqgeclass/"; + if (crawlConfig.getPriority() == 1) { + + catBookListUrlBase = baseUrl + "/bqgelhb/"; + } //拼接分类URL int page = 1;//起始页码 int totalPage = page; @@ -680,10 +700,9 @@ public class BookCrawlServiceImpl implements BookCrawlService { Float score = Float.parseFloat(scoreMatch.group(1)); - /*if (score < lowestScore) {//数据库空间有限,暂时爬取8.0分以上的小说 - Thread.sleep(1000 * 60 * 60 * 24);//因为爬的是龙虎榜,所以遇到第一个8分以下的,之后的都是8分以下的 - continue; - }*/ + if (score < crawlConfig.getLowestScore()) {//数据库空间有限,暂时爬取8.0分以上的小说 + continue; + } String bookName = bookNameMatch.group(1); String author = authoreMatch.group(1); @@ -710,6 +729,9 @@ public class BookCrawlServiceImpl implements BookCrawlService { String updateTimeStr = updateTimeMatch.group(1); SimpleDateFormat format = new SimpleDateFormat("yy-MM-dd HH:mm:ss"); Date updateTime = format.parse(updateTimeStr); + if (updateTime.getTime() < new SimpleDateFormat("yyyy-MM-dd").parse(crawlConfig.getMinUptTime()).getTime()) { + continue; + } Pattern picPatten = Pattern.compile("]+)\"\\s+onerror=\"this.src="); Matcher picMather = picPatten.matcher(body); if (picMather.find()) { diff --git a/novel-admin/src/main/resources/application-crawl.yml b/novel-admin/src/main/resources/application-crawl.yml new file mode 100644 index 0000000..d8a7602 --- /dev/null +++ b/novel-admin/src/main/resources/application-crawl.yml @@ -0,0 +1,11 @@ +crawl: + #爬虫线程数 + threadCount: 1 + #爬取优先级 1:评分优先 2:更新时间优先 + priority: 1 + #小说最低评分 + lowestScore: 7.8 + #小说最小更新时间 + minUptTime: 2000-01-01 + #爬取最大条数 + maxNumber: 100000 \ No newline at end of file diff --git a/novel-admin/src/main/resources/application.yml b/novel-admin/src/main/resources/application.yml index c6d3c32..cd945a7 100644 --- a/novel-admin/src/main/resources/application.yml +++ b/novel-admin/src/main/resources/application.yml @@ -31,6 +31,7 @@ spring: type: redis + datasource: type: com.alibaba.druid.pool.DruidDataSource driverClassName: com.mysql.jdbc.Driver @@ -76,6 +77,8 @@ spring: max-active: 100 # 连接池最大阻塞等待时间(使用负值表示没有限制) max-wait: -1 + profiles: + include: crawl mybatis: configuration: #自动将数据库带下划线的表字段值映射到Java类的驼峰字段上 diff --git a/novel-admin/src/main/resources/crawl.properties b/novel-admin/src/main/resources/crawl.properties index addb32e..86ed7c0 100644 --- a/novel-admin/src/main/resources/crawl.properties +++ b/novel-admin/src/main/resources/crawl.properties @@ -2,9 +2,9 @@ threadCount=1 #ȡȼ 1 2ʱ priority=1 -#С˵֣0ʾ +#С˵ lowestScore=0 #С˵Сʱ -minUptTime=2000-01-01 00:00:00 -#ȡ,0ʾ -maxNumber=0 +minUptTime=2000-01-01 +#ȡ +maxNumber=100000 diff --git a/novel-admin/src/main/resources/static/js/appjs/books/bookCrawl/bookCrawl.js b/novel-admin/src/main/resources/static/js/appjs/books/bookCrawl/bookCrawl.js index a46aaa5..9905646 100644 --- a/novel-admin/src/main/resources/static/js/appjs/books/bookCrawl/bookCrawl.js +++ b/novel-admin/src/main/resources/static/js/appjs/books/bookCrawl/bookCrawl.js @@ -163,7 +163,7 @@ function edit(){ console.log('打开配置页面'); layer.open({ type : 2, - title : '增加', + title : '爬虫配置修改', maxmin : true, shadeClose : false, area : [ '800px', '520px' ], diff --git a/novel-admin/src/main/resources/templates/books/bookCrawl/edit.html b/novel-admin/src/main/resources/templates/books/bookCrawl/edit.html index 5fedc83..125a8e9 100644 --- a/novel-admin/src/main/resources/templates/books/bookCrawl/edit.html +++ b/novel-admin/src/main/resources/templates/books/bookCrawl/edit.html @@ -9,41 +9,57 @@
- -
- -
- -
+
+ +
+ + + +
+
+
+ +
+
+ +
-
- -
- -
-
-
- -
- -
-
-
- -
- -
-
-
+
+
+
+ +
+ +
+
+
+ +
+ +
+
+ +