mirror of
https://github.com/201206030/novel-plus.git
synced 2025-04-27 01:30:51 +00:00
爬虫优化,提升兼容性
This commit is contained in:
parent
154210719f
commit
f830600c3e
@ -1,9 +1,6 @@
|
|||||||
package com.java2nb.novel.core.utils;
|
package com.java2nb.novel.core.utils;
|
||||||
|
|
||||||
import org.apache.http.client.HttpClient;
|
import org.springframework.http.*;
|
||||||
import org.apache.http.impl.client.DefaultHttpClient;
|
|
||||||
import org.springframework.http.HttpStatus;
|
|
||||||
import org.springframework.http.ResponseEntity;
|
|
||||||
import org.springframework.web.client.RestTemplate;
|
import org.springframework.web.client.RestTemplate;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -28,4 +25,23 @@ public class HttpUtil {
|
|||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static String getByHttpClientWithChrome(String url) {
|
||||||
|
try {
|
||||||
|
|
||||||
|
HttpHeaders headers = new HttpHeaders();
|
||||||
|
headers.add("user-agent","Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.67 Safari/537.36");
|
||||||
|
HttpEntity<String> requestEntity = new HttpEntity<>(null, headers);
|
||||||
|
ResponseEntity<String> forEntity = restTemplate.exchange(url.toString(), HttpMethod.GET, requestEntity, String.class);
|
||||||
|
|
||||||
|
if (forEntity.getStatusCode() == HttpStatus.OK) {
|
||||||
|
return forEntity.getBody();
|
||||||
|
} else {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
} catch (Exception e) {
|
||||||
|
e.printStackTrace();
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -11,8 +11,7 @@ import com.java2nb.novel.utils.Constants;
|
|||||||
import lombok.SneakyThrows;
|
import lombok.SneakyThrows;
|
||||||
import lombok.extern.slf4j.Slf4j;
|
import lombok.extern.slf4j.Slf4j;
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
import org.springframework.http.HttpStatus;
|
import org.springframework.http.*;
|
||||||
import org.springframework.http.ResponseEntity;
|
|
||||||
import org.springframework.web.client.RestTemplate;
|
import org.springframework.web.client.RestTemplate;
|
||||||
|
|
||||||
import java.text.SimpleDateFormat;
|
import java.text.SimpleDateFormat;
|
||||||
@ -44,7 +43,7 @@ public class CrawlParser {
|
|||||||
public static Book parseBook(RuleBean ruleBean, String bookId) {
|
public static Book parseBook(RuleBean ruleBean, String bookId) {
|
||||||
Book book = new Book();
|
Book book = new Book();
|
||||||
String bookDetailUrl = ruleBean.getBookDetailUrl().replace("{bookId}", bookId);
|
String bookDetailUrl = ruleBean.getBookDetailUrl().replace("{bookId}", bookId);
|
||||||
String bookDetailHtml = getByHttpClient(bookDetailUrl);
|
String bookDetailHtml = getByHttpClientWithChrome(bookDetailUrl);
|
||||||
if (bookDetailHtml != null) {
|
if (bookDetailHtml != null) {
|
||||||
Pattern bookNamePatten = compile(ruleBean.getBookNamePatten());
|
Pattern bookNamePatten = compile(ruleBean.getBookNamePatten());
|
||||||
Matcher bookNameMatch = bookNamePatten.matcher(bookDetailHtml);
|
Matcher bookNameMatch = bookNamePatten.matcher(bookDetailHtml);
|
||||||
@ -157,7 +156,7 @@ public class CrawlParser {
|
|||||||
List<BookContent> contentList = new ArrayList<>();
|
List<BookContent> contentList = new ArrayList<>();
|
||||||
//读取目录
|
//读取目录
|
||||||
String indexListUrl = ruleBean.getBookIndexUrl().replace("{bookId}", sourceBookId);
|
String indexListUrl = ruleBean.getBookIndexUrl().replace("{bookId}", sourceBookId);
|
||||||
String indexListHtml = getByHttpClient(indexListUrl);
|
String indexListHtml = getByHttpClientWithChrome(indexListUrl);
|
||||||
|
|
||||||
if (indexListHtml != null) {
|
if (indexListHtml != null) {
|
||||||
if(StringUtils.isNotBlank(ruleBean.getBookIndexStart())){
|
if(StringUtils.isNotBlank(ruleBean.getBookIndexStart())){
|
||||||
@ -189,7 +188,7 @@ public class CrawlParser {
|
|||||||
String contentUrl = ruleBean.getBookContentUrl().replace("{bookId}", sourceBookId).replace("{indexId}", indexIdMatch.group(1));
|
String contentUrl = ruleBean.getBookContentUrl().replace("{bookId}", sourceBookId).replace("{indexId}", indexIdMatch.group(1));
|
||||||
|
|
||||||
//查询章节内容
|
//查询章节内容
|
||||||
String contentHtml = getByHttpClient(contentUrl);
|
String contentHtml = getByHttpClientWithChrome(contentUrl);
|
||||||
if (contentHtml != null && !contentHtml.contains("正在手打中")) {
|
if (contentHtml != null && !contentHtml.contains("正在手打中")) {
|
||||||
String content = contentHtml.substring(contentHtml.indexOf(ruleBean.getContentStart()) + ruleBean.getContentStart().length());
|
String content = contentHtml.substring(contentHtml.indexOf(ruleBean.getContentStart()) + ruleBean.getContentStart().length());
|
||||||
content = content.substring(0, content.indexOf(ruleBean.getContentEnd()));
|
content = content.substring(0, content.indexOf(ruleBean.getContentEnd()));
|
||||||
@ -280,6 +279,22 @@ public class CrawlParser {
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private static String getByHttpClientWithChrome(String url) {
|
||||||
|
try {
|
||||||
|
|
||||||
|
String body = HttpUtil.getByHttpClientWithChrome(url);
|
||||||
|
if(body != null && body.length() < Constants.INVALID_HTML_LENGTH){
|
||||||
|
return processErrorHttpResult(url);
|
||||||
|
}
|
||||||
|
//成功获得html内容
|
||||||
|
return body;
|
||||||
|
} catch (Exception e) {
|
||||||
|
e.printStackTrace();
|
||||||
|
}
|
||||||
|
return processErrorHttpResult(url);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
private static String processErrorHttpResult(String url){
|
private static String processErrorHttpResult(String url){
|
||||||
Integer count = retryCount.get();
|
Integer count = retryCount.get();
|
||||||
|
@ -30,6 +30,7 @@ import java.util.regex.Matcher;
|
|||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
import static com.java2nb.novel.core.utils.HttpUtil.getByHttpClient;
|
import static com.java2nb.novel.core.utils.HttpUtil.getByHttpClient;
|
||||||
|
import static com.java2nb.novel.core.utils.HttpUtil.getByHttpClientWithChrome;
|
||||||
import static com.java2nb.novel.mapper.BookDynamicSqlSupport.crawlBookId;
|
import static com.java2nb.novel.mapper.BookDynamicSqlSupport.crawlBookId;
|
||||||
import static com.java2nb.novel.mapper.BookDynamicSqlSupport.crawlSourceId;
|
import static com.java2nb.novel.mapper.BookDynamicSqlSupport.crawlSourceId;
|
||||||
import static com.java2nb.novel.mapper.CrawlSourceDynamicSqlSupport.*;
|
import static com.java2nb.novel.mapper.CrawlSourceDynamicSqlSupport.*;
|
||||||
@ -217,7 +218,7 @@ public class CrawlServiceImpl implements CrawlService {
|
|||||||
.replace("{catId}", ruleBean.getCatIdRule().get("catId" + catId))
|
.replace("{catId}", ruleBean.getCatIdRule().get("catId" + catId))
|
||||||
.replace("{page}", page + "");
|
.replace("{page}", page + "");
|
||||||
|
|
||||||
String bookListHtml = getByHttpClient(catBookListUrl);
|
String bookListHtml = getByHttpClientWithChrome(catBookListUrl);
|
||||||
if (bookListHtml != null) {
|
if (bookListHtml != null) {
|
||||||
Pattern bookIdPatten = Pattern.compile(ruleBean.getBookIdPatten());
|
Pattern bookIdPatten = Pattern.compile(ruleBean.getBookIdPatten());
|
||||||
Matcher bookIdMatcher = bookIdPatten.matcher(bookListHtml);
|
Matcher bookIdMatcher = bookIdPatten.matcher(bookListHtml);
|
||||||
|
@ -117,7 +117,7 @@
|
|||||||
示例:<b></b>
|
示例:<b></b>
|
||||||
<li><input type="text" id="visitCountPatten" class="s_input icon_key"
|
<li><input type="text" id="visitCountPatten" class="s_input icon_key"
|
||||||
placeholder="小说点击量的正则表达式:"></li>
|
placeholder="小说点击量的正则表达式:"></li>
|
||||||
示例:<b><p class=\"review\"></b>
|
示例:<b><p class="review"></b>
|
||||||
<li><input type="text" id="descStart" class="s_input icon_key"
|
<li><input type="text" id="descStart" class="s_input icon_key"
|
||||||
placeholder="小说简介开始截取字符串:"></li>
|
placeholder="小说简介开始截取字符串:"></li>
|
||||||
示例:<b></p></b>
|
示例:<b></p></b>
|
||||||
|
Loading…
x
Reference in New Issue
Block a user