1.优化爬虫编写规则，兼容更多网站 2.新增书趣阁书源

2025-06-24 04:46:37 +00:00 · 2020-05-18 14:01:49 +08:00
parent 8c2e43c04f
commit 92ce982899
10 changed files with 339 additions and 79 deletions
--- a/novel-crawl/src/main/java/com/java2nb/novel/core/crawl/CrawlParser.java
+++ b/novel-crawl/src/main/java/com/java2nb/novel/core/crawl/CrawlParser.java
@ -3,12 +3,17 @@ package com.java2nb.novel.core.crawl;
 import com.java2nb.novel.core.utils.HttpUtil;
 import com.java2nb.novel.core.utils.IdWorker;
 import com.java2nb.novel.core.utils.RandomBookInfoUtil;
+import com.java2nb.novel.core.utils.RestTemplateUtil;
 import com.java2nb.novel.entity.Book;
 import com.java2nb.novel.entity.BookContent;
 import com.java2nb.novel.entity.BookIndex;
 import com.java2nb.novel.utils.Constants;
 import lombok.SneakyThrows;
+import lombok.extern.slf4j.Slf4j;
 import org.apache.commons.lang3.StringUtils;
+import org.springframework.http.HttpStatus;
+import org.springframework.http.ResponseEntity;
+import org.springframework.web.client.RestTemplate;

 import java.text.SimpleDateFormat;
 import java.util.*;
@ -22,17 +27,20 @@ import static java.util.regex.Pattern.compile;
 *
 * @author Administrator
 */
+@Slf4j
 public class CrawlParser {

    public static final Integer BOOK_INDEX_LIST_KEY = 1;

    public static final Integer BOOK_CONTENT_LIST_KEY = 2;

+    private static RestTemplate restTemplate = RestTemplateUtil.getInstance("utf-8");
+
    @SneakyThrows
    public static Book parseBook(RuleBean ruleBean, String bookId) {
        Book book = new Book();
        String bookDetailUrl = ruleBean.getBookDetailUrl().replace("{bookId}", bookId);
-        String bookDetailHtml = HttpUtil.getByHttpClient(bookDetailUrl);
+        String bookDetailHtml = getByHttpClient(bookDetailUrl);
        if (bookDetailHtml != null) {
            Pattern bookNamePatten = compile(ruleBean.getBookNamePatten());
            Matcher bookNameMatch = bookNamePatten.matcher(bookDetailHtml);
@ -54,6 +62,9 @@ public class CrawlParser {
                        boolean isFindPicUrl = picUrlMatch.find();
                        if (isFindPicUrl) {
                            String picUrl = picUrlMatch.group(1);
+                            if(StringUtils.isNotBlank(picUrl) && StringUtils.isNotBlank(ruleBean.getPicUrlPrefix())) {
+                                picUrl = ruleBean.getPicUrlPrefix() + picUrl;
+                            }
                            //设置封面图片路径
                            book.setPicUrl(picUrl);
                        }
@ -136,7 +147,10 @@ public class CrawlParser {
        List<BookContent> contentList = new ArrayList<>();
        //读取目录
        String indexListUrl = ruleBean.getBookIndexUrl().replace("{bookId}", sourceBookId);
-        String indexListHtml = HttpUtil.getByHttpClient(indexListUrl);
+        String indexListHtml = getByHttpClient(indexListUrl);
+        if(StringUtils.isNotBlank(ruleBean.getBookIndexStart())){
+            indexListHtml = indexListHtml.substring(indexListHtml.indexOf(ruleBean.getBookIndexStart()) + ruleBean.getBookIndexStart().length());
+        }
        if (indexListHtml != null) {
            Pattern indexIdPatten = compile(ruleBean.getIndexIdPatten());
            Matcher indexIdMatch = indexIdPatten.matcher(indexListHtml);
@ -162,7 +176,7 @@ public class CrawlParser {
                    String contentUrl = ruleBean.getBookContentUrl().replace("{bookId}", sourceBookId).replace("{indexId}", indexIdMatch.group(1));

                    //查询章节内容
-                    String contentHtml = HttpUtil.getByHttpClient(contentUrl);
+                    String contentHtml = getByHttpClient(contentUrl);
                    if (contentHtml != null) {
                        String content = contentHtml.substring(contentHtml.indexOf(ruleBean.getContentStart()) + ruleBean.getContentStart().length());
                        content = content.substring(0, content.indexOf(ruleBean.getContentEnd()));
@ -237,4 +251,25 @@ public class CrawlParser {
    }


+    private static String getByHttpClient(String url) {
+        try {
+            ResponseEntity<String> forEntity = restTemplate.getForEntity(url, String.class);
+            if (forEntity.getStatusCode() == HttpStatus.OK) {
+                String body = forEntity.getBody();
+                if(body.length() < Constants.INVALID_HTML_LENGTH){
+                    log.debug("获取html页面内容失败");
+                    Thread.sleep(10 + new Random().nextInt(60));
+                    return getByHttpClient(url);
+                }
+                return body;
+            } else {
+                return null;
+            }
+        } catch (Exception e) {
+            e.printStackTrace();
+            return null;
+        }
+    }
+
+
 }
--- a/novel-crawl/src/main/java/com/java2nb/novel/core/crawl/RuleBean.java
+++ b/novel-crawl/src/main/java/com/java2nb/novel/core/crawl/RuleBean.java
@ -11,6 +11,14 @@ import java.util.Map;
@Data
 public class RuleBean {

+    /**
+     * 小说更新列表url
+     * */
+    private String updateBookListUrl;
+
+    /**
+     * 分类列表页URL规则
+     * */
    private String bookListUrl;

    private Map<String,String> catIdRule;
@ -39,4 +47,9 @@ public class RuleBean {
    private String contentEnd;


+    private String picUrlPrefix;
+
+    private String bookIndexStart;
+
+
 }
--- a/novel-crawl/src/main/java/com/java2nb/novel/utils/Constants.java
+++ b/novel-crawl/src/main/java/com/java2nb/novel/utils/Constants.java
@ -14,4 +14,9 @@ public class Constants {
     * 访问量默认值
     */
    public static final Long VISIT_COUNT_DEFAULT = 100L;
+
+    /**
+     * 爬取小说http请求中无效的内容长度
+     */
+    public static final int INVALID_HTML_LENGTH = 1000;
 }
--- a/novel-crawl/src/main/resources/templates/crawl/crawlSource_add.html
+++ b/novel-crawl/src/main/resources/templates/crawl/crawlSource_add.html
@ -57,6 +57,9 @@
                                <li><span id="LabErr"></span></li>
                                示例：<b>新顶点小说网</b>
                                <li><input type="text" id="sourceName" class="s_input icon_name" placeholder="源站名"></li>
+                                <!--示例：<b>https://m.xdingdiann.com/sort/0/1.html</b>
+                                <li><input type="text" id="updateBookListUrl" class="s_input icon_key"
+                                           placeholder="小说更新列表url"></li>-->
                                示例：<b>http://m.xdingdiann.com/sort/{catId}/{page}.html</b> ({catId}代表分类ID，{page}代表分页页码)
                                <li><input type="text" id="bookListUrl" class="s_input icon_key"
                                           placeholder="分类列表页URL规则"></li>
@ -95,6 +98,9 @@
                                示例：<b>&lt;img src="([^>]+)"\s+onerror="this.src=</b>
                                <li><input type="text" id="picUrlPatten" class="s_input icon_key"
                                           placeholder="小说图片路径的正则表达式："></li>
+                                <b>可空，适用于图片路径为相对路径的源站，加上小说图片路径，则为完整的可访问的图片路径</b>
+                                <li><input type="text" id="picUrlPrefix" class="s_input icon_key"
+                                           placeholder="小说图片访问路径前缀："></li>
                                示例：<b>状态：([^/]+)&lt;/li&gt;</b>
                                <li><input type="text" id="statusPatten" class="s_input icon_key"
                                           placeholder="小说状态的正则表达式："></li>
@ -125,6 +131,9 @@
                                示例：<b>http://m.xdingdiann.com/ddk{bookId}/all.html</b> (bookId代表小说ID)
                                <li><input type="text" id="bookIndexUrl" class="s_input icon_key"
                                           placeholder="小说目录页的URL规则："></li>
+                                <b>可空，适用于最新章节列表和全部章节列表在同一个页面的源站</b>
+                                <li><input type="text" id="bookIndexStart" class="s_input icon_key"
+                                           placeholder="小说目录页内容开始截取字符串："></li>
                                示例：<b>&lt;a\s+style=""\s+href="/ddk\d+/(\d+)\.html"&gt;[^/]+&lt;/a&gt;</b>
                                <li><input type="text" id="indexIdPatten" class="s_input icon_key"
                                           placeholder="目录页目录ID正则表达式："></li>
@ -278,6 +287,12 @@
            crawlRule.picUrlPatten = picUrlPatten;
        }

+        var picUrlPrefix = $("#picUrlPrefix").val();
+
+        if (picUrlPrefix.length > 0) {
+            crawlRule.picUrlPrefix = picUrlPrefix;
+        }
+
        var statusPatten = $("#statusPatten").val();
        if (statusPatten.length > 0) {
            crawlRule.statusPatten = statusPatten;
@ -345,6 +360,13 @@

        crawlRule.bookIndexUrl = bookIndexUrl;

+
+        var bookIndexStart = $("#bookIndexStart").val();
+
+        if (bookIndexStart.length > 0) {
+            crawlRule.bookIndexStart = bookIndexStart;
+        }
+
        var indexIdPatten = $("#indexIdPatten").val();

        if (indexIdPatten.length == 0) {