Crawler optimization: improve compatibility

xiongxiaoyang 2020-12-09 16:39:15 +08:00
parent 154210719f
commit f830600c3e
4 changed files with 43 additions and 11 deletions

View File

@@ -1,9 +1,6 @@
package com.java2nb.novel.core.utils;
import org.apache.http.client.HttpClient;
import org.apache.http.impl.client.DefaultHttpClient;
import org.springframework.http.HttpStatus;
import org.springframework.http.ResponseEntity;
import org.springframework.http.*;
import org.springframework.web.client.RestTemplate;
/**
@@ -28,4 +25,23 @@ public class HttpUtil {
return null;
}
}
public static String getByHttpClientWithChrome(String url) {
try {
HttpHeaders headers = new HttpHeaders();
headers.add("user-agent","Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.67 Safari/537.36");
HttpEntity<String> requestEntity = new HttpEntity<>(null, headers);
ResponseEntity<String> forEntity = restTemplate.exchange(url.toString(), HttpMethod.GET, requestEntity, String.class);
if (forEntity.getStatusCode() == HttpStatus.OK) {
return forEntity.getBody();
} else {
return null;
}
} catch (Exception e) {
e.printStackTrace();
return null;
}
}
}
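
For reference, a minimal usage sketch of the new helper added above. The class name HttpUtilDemo and the URL are placeholders and are not part of this commit; only HttpUtil.getByHttpClientWithChrome comes from the diff.

import com.java2nb.novel.core.utils.HttpUtil;

public class HttpUtilDemo {
    public static void main(String[] args) {
        // GET with a desktop Chrome User-Agent; the method returns the body on HTTP 200 and null otherwise.
        String html = HttpUtil.getByHttpClientWithChrome("https://example.com/book/123.html"); // placeholder URL
        if (html == null) {
            System.out.println("request failed or returned a non-200 status");
        } else {
            System.out.println("fetched " + html.length() + " characters");
        }
    }
}

Sending a realistic browser User-Agent is presumably the compatibility improvement the commit message refers to: some source sites block or serve error pages to clients that identify themselves with a default HTTP library agent.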

View File

@@ -11,8 +11,7 @@ import com.java2nb.novel.utils.Constants;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.springframework.http.HttpStatus;
import org.springframework.http.ResponseEntity;
import org.springframework.http.*;
import org.springframework.web.client.RestTemplate;
import java.text.SimpleDateFormat;
@@ -44,7 +43,7 @@ public class CrawlParser {
public static Book parseBook(RuleBean ruleBean, String bookId) {
Book book = new Book();
String bookDetailUrl = ruleBean.getBookDetailUrl().replace("{bookId}", bookId);
String bookDetailHtml = getByHttpClient(bookDetailUrl);
String bookDetailHtml = getByHttpClientWithChrome(bookDetailUrl);
if (bookDetailHtml != null) {
Pattern bookNamePatten = compile(ruleBean.getBookNamePatten());
Matcher bookNameMatch = bookNamePatten.matcher(bookDetailHtml);
@@ -157,7 +156,7 @@ public class CrawlParser {
List<BookContent> contentList = new ArrayList<>();
//read the chapter index
String indexListUrl = ruleBean.getBookIndexUrl().replace("{bookId}", sourceBookId);
String indexListHtml = getByHttpClient(indexListUrl);
String indexListHtml = getByHttpClientWithChrome(indexListUrl);
if (indexListHtml != null) {
if(StringUtils.isNotBlank(ruleBean.getBookIndexStart())){
@@ -189,7 +188,7 @@ public class CrawlParser {
String contentUrl = ruleBean.getBookContentUrl().replace("{bookId}", sourceBookId).replace("{indexId}", indexIdMatch.group(1));
//fetch the chapter content
String contentHtml = getByHttpClient(contentUrl);
String contentHtml = getByHttpClientWithChrome(contentUrl);
if (contentHtml != null && !contentHtml.contains("正在手打中")) {
String content = contentHtml.substring(contentHtml.indexOf(ruleBean.getContentStart()) + ruleBean.getContentStart().length());
content = content.substring(0, content.indexOf(ruleBean.getContentEnd()));
@@ -280,6 +279,22 @@ public class CrawlParser {
}
private static String getByHttpClientWithChrome(String url) {
try {
String body = HttpUtil.getByHttpClientWithChrome(url);
if(body != null && body.length() < Constants.INVALID_HTML_LENGTH){
return processErrorHttpResult(url);
}
//successfully fetched the html content
return body;
} catch (Exception e) {
e.printStackTrace();
}
return processErrorHttpResult(url);
}
@SneakyThrows
private static String processErrorHttpResult(String url){
Integer count = retryCount.get();
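
The new wrapper above treats an unusually short body (shorter than Constants.INVALID_HTML_LENGTH) as a failed fetch and falls back to processErrorHttpResult, whose body is only partly visible in this hunk. Below is a self-contained sketch of that retry pattern; the threshold, retry limit, backoff, and fakeHttpGet stand-in are assumptions for illustration, not code from this repository.

import java.util.concurrent.ThreadLocalRandom;

public class RetryFetchSketch {
    private static final int INVALID_HTML_LENGTH = 1024; // assumed threshold; the real value lives in Constants
    private static final int MAX_RETRY = 3;              // assumed retry limit
    private static final ThreadLocal<Integer> RETRY_COUNT = ThreadLocal.withInitial(() -> 0);

    static String fetchWithRetry(String url) throws InterruptedException {
        String body = fakeHttpGet(url); // stand-in for HttpUtil.getByHttpClientWithChrome
        if (body != null && body.length() >= INVALID_HTML_LENGTH) {
            RETRY_COUNT.set(0);         // success: reset the per-thread counter
            return body;
        }
        int count = RETRY_COUNT.get();
        if (count < MAX_RETRY) {
            RETRY_COUNT.set(count + 1);
            Thread.sleep(500L + ThreadLocalRandom.current().nextInt(500)); // brief backoff before retrying
            return fetchWithRetry(url);
        }
        RETRY_COUNT.set(0);             // give up after MAX_RETRY attempts
        return null;
    }

    private static String fakeHttpGet(String url) {
        return null;                    // placeholder so the sketch compiles without network access
    }
}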

View File

@@ -30,6 +30,7 @@ import java.util.regex.Matcher;
import java.util.regex.Pattern;
import static com.java2nb.novel.core.utils.HttpUtil.getByHttpClient;
import static com.java2nb.novel.core.utils.HttpUtil.getByHttpClientWithChrome;
import static com.java2nb.novel.mapper.BookDynamicSqlSupport.crawlBookId;
import static com.java2nb.novel.mapper.BookDynamicSqlSupport.crawlSourceId;
import static com.java2nb.novel.mapper.CrawlSourceDynamicSqlSupport.*;
@@ -217,7 +218,7 @@ public class CrawlServiceImpl implements CrawlService {
.replace("{catId}", ruleBean.getCatIdRule().get("catId" + catId))
.replace("{page}", page + "");
String bookListHtml = getByHttpClient(catBookListUrl);
String bookListHtml = getByHttpClientWithChrome(catBookListUrl);
if (bookListHtml != null) {
Pattern bookIdPatten = Pattern.compile(ruleBean.getBookIdPatten());
Matcher bookIdMatcher = bookIdPatten.matcher(bookListHtml);
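
The hunk above switches the category listing fetch to getByHttpClientWithChrome; the listing URL is built by substituting {catId} and {page} into the rule's template, and the result is scanned with the configured bookId regex. A minimal sketch of that substitute-and-match flow, using a hypothetical rule and inline HTML in place of a real request:

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class BookListParseSketch {
    public static void main(String[] args) {
        // Hypothetical rule values; the real ones come from the crawl source's RuleBean.
        String bookListUrlTemplate = "https://example.com/list/{catId}/{page}.html";
        String bookIdPatten = "href=\"/book/(\\d+)\\.html\"";

        String catBookListUrl = bookListUrlTemplate
                .replace("{catId}", "1")
                .replace("{page}", "1");
        System.out.println("would fetch: " + catBookListUrl);

        // Stand-in HTML instead of calling getByHttpClientWithChrome.
        String bookListHtml = "<a href=\"/book/42.html\">demo</a>";

        Matcher bookIdMatcher = Pattern.compile(bookIdPatten).matcher(bookListHtml);
        while (bookIdMatcher.find()) {
            System.out.println("found bookId: " + bookIdMatcher.group(1));
        }
    }
}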

View File

@@ -117,7 +117,7 @@
示例<b></b>
<li><input type="text" id="visitCountPatten" class="s_input icon_key"
placeholder="小说点击量的正则表达式:"></li>
示例<b>&lt;p class=\"review\"&gt;</b>
示例<b>&lt;p class="review"&gt;</b>
<li><input type="text" id="descStart" class="s_input icon_key"
placeholder="小说简介开始截取字符串:"></li>
示例<b>&lt;/p&gt;</b>