feat: 增加 HTTP 代理配置,助力爬虫采集

This commit is contained in:
xiongxiaoyang 2022-07-14 22:14:11 +08:00
parent 0b728b9fe5
commit ba272bd89a
6 changed files with 92 additions and 30 deletions

View File

@ -211,12 +211,13 @@ novel-plus -- 父工程
2. 使用`unzip novel-crawl.zip`命令解压 novel-crawl.zip
3. 修改 `config/application-common-prod.yml` 文件中的数据库配置
4. 修改 `config/application-common-prod.yml` 文件中的管理员账号密码
5. novel-crawl 目录下使用`bin/novel-crawl.sh start`命令启动爬虫程序
6. 打开浏览器默认`8083`端口访问
7. 选择已有或新增爬虫源支持自定义爬虫规则点击`开启`按钮开始采集小说数据
8. novel-crawl 目录下使用`bin/novel-crawl.sh stop`命令停止爬虫程序
9. novel-crawl 目录下使用`bin/novel-crawl.sh restart`命令重启爬虫程序
10. novel-crawl 目录下使用`bin/novel-crawl.sh status`命令查看爬虫程序的运行状态
5. 修改 `config/application-common-prod.yml` 文件中的 HTTP 代理配置
6. novel-crawl 目录下使用`bin/novel-crawl.sh start`命令启动爬虫程序
7. 打开浏览器默认`8083`端口访问
8. 选择已有或新增爬虫源(支持自定义爬虫规则),点击`开启`按钮开始采集小说数据
9. novel-crawl 目录下使用`bin/novel-crawl.sh stop`命令停止爬虫程序
10. novel-crawl 目录下使用`bin/novel-crawl.sh restart`命令重启爬虫程序
11. novel-crawl 目录下使用`bin/novel-crawl.sh status`命令查看爬虫程序的运行状态
- 前台安装

View File

@ -0,0 +1,22 @@
package com.java2nb.novel.core.config;
import lombok.Data;
import org.springframework.boot.context.properties.ConfigurationProperties;
import org.springframework.stereotype.Component;
/**
 * Holder for the HTTP proxy settings used by the crawler.
 *
 * <p>Registered as a Spring component and bound from configuration keys under the
 * {@code http.proxy} prefix ({@code http.proxy.enabled}, {@code http.proxy.ip},
 * {@code http.proxy.port}). Accessors are generated by Lombok's {@code @Data}.
 *
 * <p>All fields are wrapper types, so each one is {@code null} when the corresponding
 * key is absent from the configuration.
 *
 * @author xiongxiaoyang
 * @date 2022/7/14
 */
@Data
@Component
@ConfigurationProperties(prefix = "http.proxy")
public class HttpProxyProperties {
// Whether outgoing requests should go through the proxy; null when not configured.
private Boolean enabled;
// Proxy host. Despite the name, the sample configuration also supplies a host name here,
// not only a numeric IP address.
private String ip;
// Proxy port. NOTE(review): there is no default, so this is presumably required whenever
// the proxy is enabled; confirm against the code that reads it.
private Integer port;
}

View File

@ -1,6 +1,8 @@
package com.java2nb.novel.core.utils;
import com.java2nb.novel.core.config.HttpProxyProperties;
import lombok.SneakyThrows;
import org.apache.http.HttpHost;
import org.apache.http.config.Registry;
import org.apache.http.config.RegistryBuilder;
import org.apache.http.conn.socket.ConnectionSocketFactory;
@ -8,37 +10,46 @@ import org.apache.http.conn.socket.PlainConnectionSocketFactory;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.conn.ssl.TrustStrategy;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.springframework.http.client.HttpComponentsClientHttpRequestFactory;
import org.springframework.http.converter.HttpMessageConverter;
import org.springframework.http.converter.StringHttpMessageConverter;
import org.springframework.stereotype.Component;
import org.springframework.web.client.RestTemplate;
import javax.net.ssl.SSLContext;
import java.nio.charset.Charset;
import java.security.cert.X509Certificate;
import java.util.List;
import java.util.Objects;
@Component
public class RestTemplateUtil {
private static HttpProxyProperties httpProxyProperties;
RestTemplateUtil(HttpProxyProperties properties) {
httpProxyProperties = properties;
}
@SneakyThrows
public static RestTemplate getInstance(String charset) {
TrustStrategy acceptingTrustStrategy = (X509Certificate[] chain, String authType) -> true;
//忽略证书
SSLContext sslContext = org.apache.http.ssl.SSLContexts.custom()
.loadTrustMaterial(null, acceptingTrustStrategy)
.build();
.loadTrustMaterial(null, acceptingTrustStrategy)
.build();
SSLConnectionSocketFactory csf = new SSLConnectionSocketFactory(sslContext);
Registry<ConnectionSocketFactory> registry = RegistryBuilder.<ConnectionSocketFactory>create()
.register("http", PlainConnectionSocketFactory.getSocketFactory())
.register("https", csf)
.build();
.register("http", PlainConnectionSocketFactory.getSocketFactory())
.register("https", csf)
.build();
PoolingHttpClientConnectionManager connectionManager = new PoolingHttpClientConnectionManager(registry);
//连接池的最大连接数0代表不限如果取0需要考虑连接泄露导致系统崩溃的后果
@ -46,22 +57,26 @@ public class RestTemplateUtil {
//每个路由的最大连接数,如果只调用一个地址,可以将其设置为最大连接数
connectionManager.setDefaultMaxPerRoute(300);
CloseableHttpClient httpClient = HttpClients.custom()
.setConnectionManager(connectionManager)
.build();
HttpClientBuilder clientBuilder = HttpClients.custom();
if (Objects.nonNull(httpProxyProperties) && Boolean.TRUE.equals(httpProxyProperties.getEnabled())) {
HttpHost proxy = new HttpHost(httpProxyProperties.getIp(), httpProxyProperties.getPort());
clientBuilder.setProxy(proxy);
}
CloseableHttpClient httpClient = clientBuilder.setConnectionManager(connectionManager)
.build();
HttpComponentsClientHttpRequestFactory requestFactory =
new HttpComponentsClientHttpRequestFactory();
new HttpComponentsClientHttpRequestFactory();
requestFactory.setHttpClient(httpClient);
requestFactory.setConnectionRequestTimeout(3000);
requestFactory.setConnectTimeout(3000);
requestFactory.setReadTimeout(30000);
RestTemplate restTemplate = new RestTemplate(requestFactory);
List<HttpMessageConverter<?>> list = restTemplate.getMessageConverters();
for (HttpMessageConverter<?> httpMessageConverter : list) {
if(httpMessageConverter instanceof StringHttpMessageConverter) {
if (httpMessageConverter instanceof StringHttpMessageConverter) {
((StringHttpMessageConverter) httpMessageConverter).setDefaultCharset(Charset.forName(charset));
break;
}

View File

@ -1,6 +1,6 @@
spring:
profiles:
include: [common]
include: [ common ]
main:
allow-bean-definition-overriding: true
#Redis服务器IP
@ -54,24 +54,30 @@ sharding:
props:
sql.show: true
tables:
book_content: #book_content表
book_content: #book_content表
key-generator-column-name: id #主键
actual-data-nodes: ds${0}.book_content${0..9} #数据节点
# database-strategy: #分库策略
# inline:
# sharding-column: book_id
# algorithm-expression: ds${book_id % 10}
table-strategy: #分表策略
table-strategy: #分表策略
inline:
shardingColumn: index_id
algorithm-expression: book_content${index_id % 10}
content:
save:
storage: db #存储介质(db:数据库,file:txt文本)
path: /Users/xiongxiaoyang/books #txt小说文本保存路径
path: /Users/xiongxiaoyang/books #txt小说文本保存路径
# HTTP 代理配置
http:
proxy:
# 是否开启 HTTP 代理,true-开启,false-不开启
enabled: false
# 代理 IP
ip: u493.kdltps.com
# 代理端口号
port: 15818

View File

@ -1,6 +1,6 @@
spring:
profiles:
include: [common]
include: [ common ]
main:
allow-bean-definition-overriding: true
#Redis服务器IP
@ -54,14 +54,14 @@ sharding:
props:
sql.show: true
tables:
book_content: #book_content表
book_content: #book_content表
key-generator-column-name: id #主键
actual-data-nodes: ds${0}.book_content${0..9} #数据节点
# database-strategy: #分库策略
# inline:
# sharding-column: book_id
# algorithm-expression: ds${book_id % 10}
table-strategy: #分表策略
table-strategy: #分表策略
inline:
shardingColumn: index_id
algorithm-expression: book_content${index_id % 10}
@ -79,7 +79,15 @@ content:
storage: db #存储介质(db:数据库,file:txt文本)
path: /Users/xiongxiaoyang/books #txt小说文本保存路径
# HTTP 代理配置
http:
proxy:
# 是否开启 HTTP 代理,true-开启,false-不开启
enabled: false
# 代理 IP
ip: 40.83.102.86
# 代理端口号
port: 80

View File

@ -36,4 +36,14 @@ crawl:
content:
save:
storage: db #存储介质(db:数据库,file:txt文本)
path: /Users/xiongxiaoyang/books #txt小说文本保存路径
path: /Users/xiongxiaoyang/books #txt小说文本保存路径
# HTTP 代理配置
http:
proxy:
# 是否开启 HTTP 代理,true-开启,false-不开启
enabled: false
# 代理 IP
ip: u493.kdltps.com
# 代理端口号
port: 15818