mirror of
https://github.com/201206030/novel.git
synced 2025-04-27 07:30:50 +00:00
v2.3.0发布
This commit is contained in:
parent
56645720b3
commit
290522ef6d
@ -31,100 +31,58 @@ public class BiquCrawlSource extends BaseHtmlCrawlSource {
|
|||||||
@Override
|
@Override
|
||||||
public void parse() {
|
public void parse() {
|
||||||
|
|
||||||
Map<Integer,Date> cat2Date = bookService.queryLastUpdateTime();
|
for(int page = 1; page<= Constants.UPDATE_PAGES_ONCE; page++) {
|
||||||
Map<Integer,Date> newCat2Date = new HashMap<>();
|
String catBookListUrl = getListPageUrl().replace("{0}", "0").replace("{1}", page+"");
|
||||||
for(int i=1;i<=7;i++) {
|
String forObject = RestTemplateUtil.getBodyByUtf8(catBookListUrl);
|
||||||
Date lastUpdateTime = cat2Date.get(i);
|
if (forObject != null) {
|
||||||
Date updateTime = null;
|
//解析第一页书籍的数据
|
||||||
int page = 1;
|
Pattern bookPatten = compile(getBookUrlPattern());
|
||||||
do{
|
|
||||||
String catBookListUrl = getListPageUrl().replace("{0}", i+"").replace("{1}", page + "");
|
|
||||||
page++;
|
|
||||||
String forObject = RestTemplateUtil.getBodyByUtf8(catBookListUrl);
|
|
||||||
if (forObject != null) {
|
|
||||||
//解析第一页书籍的数据
|
|
||||||
Pattern bookPatten = compile(getBookUrlPattern());
|
|
||||||
|
|
||||||
Matcher bookMatcher = bookPatten.matcher(forObject);
|
Matcher bookMatcher = bookPatten.matcher(forObject);
|
||||||
|
|
||||||
boolean isFind = bookMatcher.find();
|
boolean isFind = bookMatcher.find();
|
||||||
Pattern scorePatten = compile(getScorePattern());
|
Pattern scorePatten = compile(getScorePattern());
|
||||||
Matcher scoreMatch = scorePatten.matcher(forObject);
|
Matcher scoreMatch = scorePatten.matcher(forObject);
|
||||||
boolean scoreFind = scoreMatch.find();
|
boolean scoreFind = scoreMatch.find();
|
||||||
|
|
||||||
Pattern bookNamePatten = compile(getBookNamePattern());
|
Pattern bookNamePatten = compile(getBookNamePattern());
|
||||||
|
|
||||||
Matcher bookNameMatch = bookNamePatten.matcher(forObject);
|
Matcher bookNameMatch = bookNamePatten.matcher(forObject);
|
||||||
|
|
||||||
Pattern authorPatten = compile(getAuthorPattern());
|
boolean isBookNameMatch = bookNameMatch.find();
|
||||||
|
|
||||||
Matcher authorMatch = authorPatten.matcher(forObject);
|
while (isFind && scoreFind && isBookNameMatch) {
|
||||||
|
|
||||||
boolean isBookNameMatch = bookNameMatch.find();
|
try {
|
||||||
|
Float score = Float.parseFloat(scoreMatch.group(1));
|
||||||
|
|
||||||
while (isFind && scoreFind && isBookNameMatch && authorMatch.find() && (updateTime==null || updateTime.getTime()>lastUpdateTime.getTime())) {
|
if (score < getLowestScore()) {
|
||||||
|
continue;
|
||||||
try {
|
|
||||||
Float score = Float.parseFloat(scoreMatch.group(1));
|
|
||||||
|
|
||||||
if (score < getLowestScore()) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
String bokNum = bookMatcher.group(1);
|
|
||||||
String bookUrl = getIndexUrl() + "/" + bokNum + "/";
|
|
||||||
|
|
||||||
String bookName = bookNameMatch.group(1);
|
|
||||||
|
|
||||||
String author = authorMatch.group(1);
|
|
||||||
|
|
||||||
Boolean hasBook = bookService.hasBook(bookName, author);
|
|
||||||
|
|
||||||
if (hasBook) {
|
|
||||||
|
|
||||||
bookService.addBookParseLog(bookUrl, bookName, score);
|
|
||||||
}
|
|
||||||
|
|
||||||
String body = RestTemplateUtil.getBodyByUtf8(bookUrl);
|
|
||||||
if (body != null) {
|
|
||||||
Pattern updateTimePatten = compile(getUpdateTimePattern());
|
|
||||||
Matcher updateTimeMatch = updateTimePatten.matcher(body);
|
|
||||||
if (updateTimeMatch.find()) {
|
|
||||||
String updateTimeStr = updateTimeMatch.group(1);
|
|
||||||
SimpleDateFormat format ;
|
|
||||||
if(updateTimeStr.length()>10){
|
|
||||||
|
|
||||||
format = new SimpleDateFormat("yy-MM-dd HH:mm:ss");
|
|
||||||
}else{
|
|
||||||
format = new SimpleDateFormat("yy-MM-dd");
|
|
||||||
}
|
|
||||||
updateTime = format.parse(updateTimeStr);
|
|
||||||
if(!newCat2Date.containsKey(i)) {
|
|
||||||
newCat2Date.put(i, updateTime);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
} catch (Exception e) {
|
|
||||||
|
|
||||||
log.error(e.getMessage(), e);
|
|
||||||
|
|
||||||
} finally {
|
|
||||||
bookMatcher.find();
|
|
||||||
isFind = bookMatcher.find();
|
|
||||||
scoreFind = scoreMatch.find();
|
|
||||||
isBookNameMatch = bookNameMatch.find();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
String bokNum = bookMatcher.group(1);
|
||||||
|
String bookUrl = getIndexUrl() + "/" + bokNum + "/";
|
||||||
|
|
||||||
|
String bookName = bookNameMatch.group(1);
|
||||||
|
|
||||||
|
bookService.addBookParseLog(bookUrl, bookName, score);
|
||||||
|
|
||||||
|
|
||||||
|
} catch (Exception e) {
|
||||||
|
|
||||||
|
log.error(e.getMessage(), e);
|
||||||
|
|
||||||
|
} finally {
|
||||||
|
bookMatcher.find();
|
||||||
|
isFind = bookMatcher.find();
|
||||||
|
scoreFind = scoreMatch.find();
|
||||||
|
isBookNameMatch = bookNameMatch.find();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
}while (updateTime == null || updateTime.getTime()>lastUpdateTime.getTime());
|
}
|
||||||
}
|
}
|
||||||
bookService.updateBookUpdateTimeLog(newCat2Date);
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -163,7 +121,7 @@ public class BiquCrawlSource extends BaseHtmlCrawlSource {
|
|||||||
|
|
||||||
Pattern updateTimePatten = compile(getUpdateTimePattern());
|
Pattern updateTimePatten = compile(getUpdateTimePattern());
|
||||||
Matcher updateTimeMatch = updateTimePatten.matcher(body);
|
Matcher updateTimeMatch = updateTimePatten.matcher(body);
|
||||||
if (updateTimeMatch.find()) {
|
/*if (updateTimeMatch.find()) {
|
||||||
String updateTimeStr = updateTimeMatch.group(1);
|
String updateTimeStr = updateTimeMatch.group(1);
|
||||||
SimpleDateFormat format ;
|
SimpleDateFormat format ;
|
||||||
if(updateTimeStr.length()>10){
|
if(updateTimeStr.length()>10){
|
||||||
@ -172,7 +130,7 @@ public class BiquCrawlSource extends BaseHtmlCrawlSource {
|
|||||||
}else{
|
}else{
|
||||||
format = new SimpleDateFormat("yy-MM-dd");
|
format = new SimpleDateFormat("yy-MM-dd");
|
||||||
}
|
}
|
||||||
Date updateTime = format.parse(updateTimeStr);
|
Date updateTime = format.parse(updateTimeStr);*/
|
||||||
Pattern picPatten = compile(getPicPattern());
|
Pattern picPatten = compile(getPicPattern());
|
||||||
Matcher picMather = picPatten.matcher(body);
|
Matcher picMather = picPatten.matcher(body);
|
||||||
if (picMather.find()) {
|
if (picMather.find()) {
|
||||||
@ -189,7 +147,7 @@ public class BiquCrawlSource extends BaseHtmlCrawlSource {
|
|||||||
book.setScore(score > 10 ? 8.0f : score);
|
book.setScore(score > 10 ? 8.0f : score);
|
||||||
book.setPicUrl(picSrc);
|
book.setPicUrl(picSrc);
|
||||||
book.setBookStatus(status);
|
book.setBookStatus(status);
|
||||||
book.setUpdateTime(updateTime);
|
book.setUpdateTime(new Date());
|
||||||
|
|
||||||
List<BookIndex> indexList = new ArrayList<>();
|
List<BookIndex> indexList = new ArrayList<>();
|
||||||
List<BookContent> contentList = new ArrayList<>();
|
List<BookContent> contentList = new ArrayList<>();
|
||||||
@ -261,7 +219,7 @@ public class BiquCrawlSource extends BaseHtmlCrawlSource {
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
//}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user