mirror of
https://github.com/201206030/novel-plus.git
synced 2025-04-27 01:30:51 +00:00
爬虫优化,提升兼容性
This commit is contained in:
parent
154210719f
commit
f830600c3e
@ -1,9 +1,6 @@
|
||||
package com.java2nb.novel.core.utils;
|
||||
|
||||
import org.apache.http.client.HttpClient;
|
||||
import org.apache.http.impl.client.DefaultHttpClient;
|
||||
import org.springframework.http.HttpStatus;
|
||||
import org.springframework.http.ResponseEntity;
|
||||
import org.springframework.http.*;
|
||||
import org.springframework.web.client.RestTemplate;
|
||||
|
||||
/**
|
||||
@ -28,4 +25,23 @@ public class HttpUtil {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
public static String getByHttpClientWithChrome(String url) {
|
||||
try {
|
||||
|
||||
HttpHeaders headers = new HttpHeaders();
|
||||
headers.add("user-agent","Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.67 Safari/537.36");
|
||||
HttpEntity<String> requestEntity = new HttpEntity<>(null, headers);
|
||||
ResponseEntity<String> forEntity = restTemplate.exchange(url.toString(), HttpMethod.GET, requestEntity, String.class);
|
||||
|
||||
if (forEntity.getStatusCode() == HttpStatus.OK) {
|
||||
return forEntity.getBody();
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
} catch (Exception e) {
|
||||
e.printStackTrace();
|
||||
return null;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -11,8 +11,7 @@ import com.java2nb.novel.utils.Constants;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.springframework.http.HttpStatus;
|
||||
import org.springframework.http.ResponseEntity;
|
||||
import org.springframework.http.*;
|
||||
import org.springframework.web.client.RestTemplate;
|
||||
|
||||
import java.text.SimpleDateFormat;
|
||||
@ -44,7 +43,7 @@ public class CrawlParser {
|
||||
public static Book parseBook(RuleBean ruleBean, String bookId) {
|
||||
Book book = new Book();
|
||||
String bookDetailUrl = ruleBean.getBookDetailUrl().replace("{bookId}", bookId);
|
||||
String bookDetailHtml = getByHttpClient(bookDetailUrl);
|
||||
String bookDetailHtml = getByHttpClientWithChrome(bookDetailUrl);
|
||||
if (bookDetailHtml != null) {
|
||||
Pattern bookNamePatten = compile(ruleBean.getBookNamePatten());
|
||||
Matcher bookNameMatch = bookNamePatten.matcher(bookDetailHtml);
|
||||
@ -157,7 +156,7 @@ public class CrawlParser {
|
||||
List<BookContent> contentList = new ArrayList<>();
|
||||
//读取目录
|
||||
String indexListUrl = ruleBean.getBookIndexUrl().replace("{bookId}", sourceBookId);
|
||||
String indexListHtml = getByHttpClient(indexListUrl);
|
||||
String indexListHtml = getByHttpClientWithChrome(indexListUrl);
|
||||
|
||||
if (indexListHtml != null) {
|
||||
if(StringUtils.isNotBlank(ruleBean.getBookIndexStart())){
|
||||
@ -189,7 +188,7 @@ public class CrawlParser {
|
||||
String contentUrl = ruleBean.getBookContentUrl().replace("{bookId}", sourceBookId).replace("{indexId}", indexIdMatch.group(1));
|
||||
|
||||
//查询章节内容
|
||||
String contentHtml = getByHttpClient(contentUrl);
|
||||
String contentHtml = getByHttpClientWithChrome(contentUrl);
|
||||
if (contentHtml != null && !contentHtml.contains("正在手打中")) {
|
||||
String content = contentHtml.substring(contentHtml.indexOf(ruleBean.getContentStart()) + ruleBean.getContentStart().length());
|
||||
content = content.substring(0, content.indexOf(ruleBean.getContentEnd()));
|
||||
@ -280,6 +279,22 @@ public class CrawlParser {
|
||||
|
||||
}
|
||||
|
||||
private static String getByHttpClientWithChrome(String url) {
|
||||
try {
|
||||
|
||||
String body = HttpUtil.getByHttpClientWithChrome(url);
|
||||
if(body != null && body.length() < Constants.INVALID_HTML_LENGTH){
|
||||
return processErrorHttpResult(url);
|
||||
}
|
||||
//成功获得html内容
|
||||
return body;
|
||||
} catch (Exception e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
return processErrorHttpResult(url);
|
||||
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
private static String processErrorHttpResult(String url){
|
||||
Integer count = retryCount.get();
|
||||
|
@ -30,6 +30,7 @@ import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import static com.java2nb.novel.core.utils.HttpUtil.getByHttpClient;
|
||||
import static com.java2nb.novel.core.utils.HttpUtil.getByHttpClientWithChrome;
|
||||
import static com.java2nb.novel.mapper.BookDynamicSqlSupport.crawlBookId;
|
||||
import static com.java2nb.novel.mapper.BookDynamicSqlSupport.crawlSourceId;
|
||||
import static com.java2nb.novel.mapper.CrawlSourceDynamicSqlSupport.*;
|
||||
@ -217,7 +218,7 @@ public class CrawlServiceImpl implements CrawlService {
|
||||
.replace("{catId}", ruleBean.getCatIdRule().get("catId" + catId))
|
||||
.replace("{page}", page + "");
|
||||
|
||||
String bookListHtml = getByHttpClient(catBookListUrl);
|
||||
String bookListHtml = getByHttpClientWithChrome(catBookListUrl);
|
||||
if (bookListHtml != null) {
|
||||
Pattern bookIdPatten = Pattern.compile(ruleBean.getBookIdPatten());
|
||||
Matcher bookIdMatcher = bookIdPatten.matcher(bookListHtml);
|
||||
|
@ -117,7 +117,7 @@
|
||||
示例:<b></b>
|
||||
<li><input type="text" id="visitCountPatten" class="s_input icon_key"
|
||||
placeholder="小说点击量的正则表达式:"></li>
|
||||
示例:<b><p class=\"review\"></b>
|
||||
示例:<b><p class="review"></b>
|
||||
<li><input type="text" id="descStart" class="s_input icon_key"
|
||||
placeholder="小说简介开始截取字符串:"></li>
|
||||
示例:<b></p></b>
|
||||
|
Loading…
x
Reference in New Issue
Block a user