diff --git a/novel-common/src/main/java/com/java2nb/novel/core/utils/HttpUtil.java b/novel-common/src/main/java/com/java2nb/novel/core/utils/HttpUtil.java
index fa689ac..d1d65a1 100644
--- a/novel-common/src/main/java/com/java2nb/novel/core/utils/HttpUtil.java
+++ b/novel-common/src/main/java/com/java2nb/novel/core/utils/HttpUtil.java
@@ -1,9 +1,6 @@
 package com.java2nb.novel.core.utils;
 
-import org.apache.http.client.HttpClient;
-import org.apache.http.impl.client.DefaultHttpClient;
-import org.springframework.http.HttpStatus;
-import org.springframework.http.ResponseEntity;
+import org.springframework.http.*;
 import org.springframework.web.client.RestTemplate;
 
 /**
@@ -28,4 +25,30 @@ public class HttpUtil {
             return null;
         }
     }
+
+    /**
+     * Issues an HTTP GET for {@code url} carrying a desktop Chrome {@code user-agent} header.
+     *
+     * @param url address to request
+     * @return the response body when the status is 200 OK; {@code null} for any other
+     *         status or when the request throws
+     */
+    public static String getByHttpClientWithChrome(String url) {
+        try {
+            HttpHeaders headers = new HttpHeaders();
+            headers.add("user-agent","Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.67 Safari/537.36");
+            // Header-only entity: a GET request carries no body.
+            HttpEntity<String> requestEntity = new HttpEntity<>(headers);
+            ResponseEntity<String> forEntity = restTemplate.exchange(url, HttpMethod.GET, requestEntity, String.class);
+
+            if (forEntity.getStatusCode() == HttpStatus.OK) {
+                return forEntity.getBody();
+            }
+            return null;
+        } catch (Exception e) {
+            // Best effort: the failure is printed and reported to the caller as null.
+            e.printStackTrace();
+            return null;
+        }
+    }
 }
diff --git a/novel-crawl/src/main/java/com/java2nb/novel/core/crawl/CrawlParser.java b/novel-crawl/src/main/java/com/java2nb/novel/core/crawl/CrawlParser.java
index a643943..f9e144f 100644
--- a/novel-crawl/src/main/java/com/java2nb/novel/core/crawl/CrawlParser.java
+++ b/novel-crawl/src/main/java/com/java2nb/novel/core/crawl/CrawlParser.java
@@ -11,8 +11,7 @@ import com.java2nb.novel.utils.Constants;
 import lombok.SneakyThrows;
 import lombok.extern.slf4j.Slf4j;
 import org.apache.commons.lang3.StringUtils;
-import org.springframework.http.HttpStatus;
-import org.springframework.http.ResponseEntity;
+import 
org.springframework.http.*;
 import org.springframework.web.client.RestTemplate;
 
 import java.text.SimpleDateFormat;
@@ -44,7 +43,7 @@ public class CrawlParser {
     public static Book parseBook(RuleBean ruleBean, String bookId) {
         Book book = new Book();
         String bookDetailUrl = ruleBean.getBookDetailUrl().replace("{bookId}", bookId);
-        String bookDetailHtml = getByHttpClient(bookDetailUrl);
+        String bookDetailHtml = getByHttpClientWithChrome(bookDetailUrl);
         if (bookDetailHtml != null) {
             Pattern bookNamePatten = compile(ruleBean.getBookNamePatten());
             Matcher bookNameMatch = bookNamePatten.matcher(bookDetailHtml);
@@ -157,7 +156,7 @@ public class CrawlParser {
         List contentList = new ArrayList<>();
         //读取目录
         String indexListUrl = ruleBean.getBookIndexUrl().replace("{bookId}", sourceBookId);
-        String indexListHtml = getByHttpClient(indexListUrl);
+        String indexListHtml = getByHttpClientWithChrome(indexListUrl);
         if (indexListHtml != null) {
 
             if(StringUtils.isNotBlank(ruleBean.getBookIndexStart())){
@@ -189,7 +188,7 @@ public class CrawlParser {
                     String contentUrl = ruleBean.getBookContentUrl().replace("{bookId}", sourceBookId).replace("{indexId}", indexIdMatch.group(1));
 
                     //查询章节内容
-                    String contentHtml = getByHttpClient(contentUrl);
+                    String contentHtml = getByHttpClientWithChrome(contentUrl);
                     if (contentHtml != null && !contentHtml.contains("正在手打中")) {
                         String content = contentHtml.substring(contentHtml.indexOf(ruleBean.getContentStart()) + ruleBean.getContentStart().length());
                         content = content.substring(0, content.indexOf(ruleBean.getContentEnd()));
@@ -280,6 +279,29 @@ public class CrawlParser {
 
     }
 
+    /**
+     * Fetches {@code url} via {@link HttpUtil#getByHttpClientWithChrome(String)} and hands
+     * off to {@code processErrorHttpResult} when the page looks invalid.
+     *
+     * @param url address of the page to download
+     * @return the fetched body (null when HttpUtil returned null), or the result of
+     *         {@code processErrorHttpResult(url)} when the body is shorter than
+     *         {@code Constants.INVALID_HTML_LENGTH} or an exception was caught
+     */
+    private static String getByHttpClientWithChrome(String url) {
+        try {
+            String body = HttpUtil.getByHttpClientWithChrome(url);
+            if (body != null && body.length() < Constants.INVALID_HTML_LENGTH) {
+                return processErrorHttpResult(url);
+            }
+            // HTML content obtained successfully
+            return body;
+        } catch (Exception e) {
+            log.error(e.getMessage(), e);
+        }
+        return processErrorHttpResult(url);
+    }
+
     @SneakyThrows
private static String processErrorHttpResult(String url){ Integer count = retryCount.get(); diff --git a/novel-crawl/src/main/java/com/java2nb/novel/service/impl/CrawlServiceImpl.java b/novel-crawl/src/main/java/com/java2nb/novel/service/impl/CrawlServiceImpl.java index aba2c1d..efcfee0 100644 --- a/novel-crawl/src/main/java/com/java2nb/novel/service/impl/CrawlServiceImpl.java +++ b/novel-crawl/src/main/java/com/java2nb/novel/service/impl/CrawlServiceImpl.java @@ -30,6 +30,7 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; import static com.java2nb.novel.core.utils.HttpUtil.getByHttpClient; +import static com.java2nb.novel.core.utils.HttpUtil.getByHttpClientWithChrome; import static com.java2nb.novel.mapper.BookDynamicSqlSupport.crawlBookId; import static com.java2nb.novel.mapper.BookDynamicSqlSupport.crawlSourceId; import static com.java2nb.novel.mapper.CrawlSourceDynamicSqlSupport.*; @@ -217,7 +218,7 @@ public class CrawlServiceImpl implements CrawlService { .replace("{catId}", ruleBean.getCatIdRule().get("catId" + catId)) .replace("{page}", page + ""); - String bookListHtml = getByHttpClient(catBookListUrl); + String bookListHtml = getByHttpClientWithChrome(catBookListUrl); if (bookListHtml != null) { Pattern bookIdPatten = Pattern.compile(ruleBean.getBookIdPatten()); Matcher bookIdMatcher = bookIdPatten.matcher(bookListHtml); diff --git a/novel-crawl/src/main/resources/templates/crawl/crawlSource_add.html b/novel-crawl/src/main/resources/templates/crawl/crawlSource_add.html index ec89413..02833f4 100644 --- a/novel-crawl/src/main/resources/templates/crawl/crawlSource_add.html +++ b/novel-crawl/src/main/resources/templates/crawl/crawlSource_add.html @@ -117,7 +117,7 @@ 示例:
  • - 示例:<p class=\"review\"> + 示例:<p class="review">
  • 示例:</p>