From cca73526fb2cf124389f92b04ab25de4e54ce8d2 Mon Sep 17 00:00:00 2001 From: xiongxiaoyang <773861846@qq.com> Date: Sat, 14 Dec 2019 09:36:24 +0800 Subject: [PATCH] =?UTF-8?q?=E5=90=8E=E5=8F=B0=E7=AE=A1=E7=90=86=E7=88=AC?= =?UTF-8?q?=E8=99=AB=E5=8A=A0=E5=85=A5=E4=BA=8B=E7=89=A9=EF=BC=8C=E8=A7=A3?= =?UTF-8?q?=E5=86=B3=E9=83=A8=E5=88=86=E7=94=A8=E6=88=B7=E7=AB=A0=E8=8A=82?= =?UTF-8?q?=E5=92=8C=E5=86=85=E5=AE=B9=E4=B8=8D=E5=8C=B9=E9=85=8D=E7=9A=84?= =?UTF-8?q?=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 1 + .../service/impl/BookCrawlServiceImpl.java | 111 ++++++++---------- .../java2nb/books/util/RestTemplateUtil.java | 21 +++- novel-front/novel-front.iml | 110 ----------------- 4 files changed, 66 insertions(+), 177 deletions(-) create mode 100644 .gitignore delete mode 100644 novel-front/novel-front.iml diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..2b10a2b --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +/novel-front/novel-front.iml diff --git a/novel-admin/src/main/java/com/java2nb/books/service/impl/BookCrawlServiceImpl.java b/novel-admin/src/main/java/com/java2nb/books/service/impl/BookCrawlServiceImpl.java index 23f9e49..b128d7f 100644 --- a/novel-admin/src/main/java/com/java2nb/books/service/impl/BookCrawlServiceImpl.java +++ b/novel-admin/src/main/java/com/java2nb/books/service/impl/BookCrawlServiceImpl.java @@ -22,8 +22,11 @@ import java.util.regex.Pattern; import com.java2nb.books.dao.BookCrawlDao; import com.java2nb.books.domain.BookCrawlDO; import com.java2nb.books.service.BookCrawlService; +import org.springframework.transaction.annotation.Transactional; import org.springframework.web.client.RestTemplate; +import static java.util.regex.Pattern.*; + @Service public class BookCrawlServiceImpl implements BookCrawlService { @@ -181,7 +184,7 @@ public class BookCrawlServiceImpl implements BookCrawlService { String bookListUrl = "http://book.sfacg.com/List/default.aspx?&tid=" + catId + "&if=1&PageIndex=" + page; - String forObject = getByHttpClient(bookListUrl); + String forObject = getByTemplate(bookListUrl); if (forObject != null) { Pattern bookPatten = Pattern.compile("href=\"/Novel/(\\d+)/\""); @@ -195,7 +198,7 @@ public class BookCrawlServiceImpl implements BookCrawlService { } long bookNum = Long.parseLong(bookMatcher.group(1)); String bookUrl = "http://book.sfacg.com/Novel/" + bookNum; - String forObject1 = getByHttpClient(bookUrl); + String forObject1 = getByTemplate(bookUrl); if (forObject1 != null) { Pattern updateTimePatten = Pattern.compile("更新:(\\d+/\\d+/\\d+ \\d+:\\d+:\\d+)"); Matcher updateTimeMatch = updateTimePatten.matcher(forObject1); @@ -329,7 +332,7 @@ public class BookCrawlServiceImpl implements BookCrawlService { //读取目录 String indexUrl = "http://book.sfacg.com/Novel/" + bookNum + "/MainIndex/"; - String forObject2 = getByHttpClient(indexUrl); + String forObject2 = getByTemplate(indexUrl); if (forObject2 != null) { Pattern indexListPatten = Pattern.compile("href=\"(/Novel/\\d+/\\d+/\\d+/)\"\\s+title=\"([^\"]+)\\s*"); Matcher indexListMatch = indexListPatten.matcher(forObject2); @@ -352,7 +355,7 @@ public class BookCrawlServiceImpl implements BookCrawlService { //查询章节内容 - String forObject3 = getByHttpClient(contentUrl); + String forObject3 = getByTemplate(contentUrl); if (forObject3 != null && !forObject3.contains("内容整改中,请等待")) { String content = forObject3.substring(forObject3.indexOf("
") + 6); @@ -413,13 +416,13 @@ public class BookCrawlServiceImpl implements BookCrawlService { catBookListUrlBase = baseUrl + "/lhb/"; } //拼接分类URL - int page = 1;//起始页码 + int page = 1; int totalPage = page; String catBookListUrl = catBookListUrlBase + i + "/" + page + ".html"; - String forObject = getByHttpClient(catBookListUrl); + String forObject = getByTemplate(catBookListUrl); if (forObject != null) { //匹配分页数 - Pattern pattern = Pattern.compile("value=\"(\\d+)/(\\d+)\""); + Pattern pattern = compile("value=\"(\\d+)/(\\d+)\""); Matcher matcher = pattern.matcher(forObject); boolean isFind = matcher.find(); System.out.println("匹配分页数" + isFind); @@ -427,7 +430,7 @@ public class BookCrawlServiceImpl implements BookCrawlService { int currentPage = Integer.parseInt(matcher.group(1)); totalPage = Integer.parseInt(matcher.group(2)); //解析第一页书籍的数据 - Pattern bookPatten = Pattern.compile("href=\"/(\\d+_\\d+)/\""); + Pattern bookPatten = compile("href=\"/(\\d+_\\d+)/\""); parseBiquTaBook(bookPatten, forObject, i, baseUrl); while (currentPage < totalPage) { if (isInteruptBiquTaCrawl) { @@ -435,7 +438,7 @@ public class BookCrawlServiceImpl implements BookCrawlService { } catBookListUrl = catBookListUrlBase + i + "/" + (currentPage + 1) + ".html"; - forObject = getByHttpClient(catBookListUrl); + forObject = getByTemplate(catBookListUrl); if (forObject != null) { //匹配分页数 matcher = pattern.matcher(forObject); @@ -457,15 +460,15 @@ public class BookCrawlServiceImpl implements BookCrawlService { private void parseBiquTaBook(Pattern bookPatten, String forObject, int catNum, String baseUrl) { Matcher matcher2 = bookPatten.matcher(forObject); boolean isFind = matcher2.find(); - Pattern scorePatten = Pattern.compile("(\\d+\\.\\d+)分
"); + Pattern scorePatten = compile("(\\d+\\.\\d+)分"); Matcher scoreMatch = scorePatten.matcher(forObject); boolean scoreFind = scoreMatch.find(); - Pattern bookNamePatten = Pattern.compile("

([^/]+)

"); + Pattern bookNamePatten = compile("

([^/]+)

"); Matcher bookNameMatch = bookNamePatten.matcher(forObject); boolean isBookNameMatch = bookNameMatch.find(); - Pattern authorPatten = Pattern.compile(">作者:([^/]+)<"); + Pattern authorPatten = compile(">作者:([^/]+)<"); Matcher authoreMatch = authorPatten.matcher(forObject); boolean isFindAuthor = authoreMatch.find(); @@ -498,13 +501,13 @@ public class BookCrawlServiceImpl implements BookCrawlService { String bokNum = matcher2.group(1); String bookUrl = baseUrl + "/" + bokNum + "/"; - String body = getByHttpClient(bookUrl); + String body = getByTemplate(bookUrl); if (body != null) { - Pattern statusPatten = Pattern.compile("状态:([^/]+)"); + Pattern statusPatten = compile("状态:([^/]+)"); Matcher statusMatch = statusPatten.matcher(body); if (statusMatch.find()) { String status = statusMatch.group(1); - Pattern updateTimePatten = Pattern.compile("更新:(\\d+-\\d+-\\d+\\s\\d+:\\d+:\\d+)"); + Pattern updateTimePatten = compile("更新:(\\d+-\\d+-\\d+\\s\\d+:\\d+:\\d+)"); Matcher updateTimeMatch = updateTimePatten.matcher(body); if (updateTimeMatch.find()) { String updateTimeStr = updateTimeMatch.group(1); @@ -513,12 +516,12 @@ public class BookCrawlServiceImpl implements BookCrawlService { if (updateTime.getTime() < new SimpleDateFormat("yyyy-MM-dd").parse(crawlConfig.getMinUptTime()).getTime()) { continue; } - Pattern picPatten = Pattern.compile("]+)\"\\s+onerror=\"this.src="); + Pattern picPatten = compile("]+)\"\\s+onerror=\"this.src="); Matcher picMather = picPatten.matcher(body); if (picMather.find()) { String picSrc = picMather.group(1); - Pattern descPatten = Pattern.compile("class=\"review\">([^<]+)

"); + Pattern descPatten = compile("class=\"review\">([^<]+)

"); Matcher descMatch = descPatten.matcher(body); if (descMatch.find()) { String desc = descMatch.group(1); @@ -538,13 +541,13 @@ public class BookCrawlServiceImpl implements BookCrawlService { List contentList = new ArrayList<>(); //读取目录 - Pattern indexPatten = Pattern.compile("查看完整目录"); + Pattern indexPatten = compile("查看完整目录"); Matcher indexMatch = indexPatten.matcher(body); if (indexMatch.find()) { String indexUrl = baseUrl + indexMatch.group(1); - String body2 = getByHttpClient(indexUrl); + String body2 = getByTemplate(indexUrl); if (body2 != null) { - Pattern indexListPatten = Pattern.compile("([^/]+)"); + Pattern indexListPatten = compile("([^/]+)"); Matcher indexListMatch = indexListPatten.matcher(body2); boolean isFindIndex = indexListMatch.find(); @@ -565,7 +568,7 @@ public class BookCrawlServiceImpl implements BookCrawlService { //查询章节内容 - String body3 = getByHttpClient(contentUrl.replace("//m.","//www.")); + String body3 = getByTemplate(contentUrl.replace("//m.","//www.")); if (body3 != null) { String start = "id=\"content\">"; String end = "