implement vpn pool

This commit is contained in:
2025-12-11 23:18:04 +01:00
parent 470f0922ed
commit 1bda78897b
14 changed files with 703 additions and 2680 deletions

View File

@@ -11,426 +11,258 @@ use tokio::process::{Child, Command};
use tokio::task::JoinHandle;
use tokio::sync::{Mutex, Semaphore};
use tokio::time::{sleep, timeout, Duration};
use super::vpn_manager::{VpnInstance, VpnPool};
#[cfg(target_os = "windows")]
use super::forcebindip::ForceBindIpManager;
use crate::scraper::docker_vpn_proxy::{DockerVpnProxyPool};
/// Manages a pool of ChromeDriver instances for parallel scraping with optional VPN binding.
pub struct ChromeDriverPool {
instances: Vec<Arc<Mutex<ChromeInstance>>>,
semaphore: Arc<Semaphore>,
vpn_pool: Option<Arc<VpnPool>>,
#[cfg(target_os = "windows")]
forcebindip: Option<Arc<ForceBindIpManager>>,
/// Optional Docker-based proxy pool (one proxy per Chrome instance)
proxy_pool: Option<Arc<DockerVpnProxyPool>>,
}
impl ChromeDriverPool {
/// Creates a new pool with the specified number of ChromeDriver instances (no VPN).
/// Creates a new pool without any proxy (direct connection).
pub async fn new(pool_size: usize) -> Result<Self> {
Self::new_with_vpn_and_task_limit(pool_size, None, 0).await
Self::new_with_proxy_and_task_limit(pool_size, None, 0).await
}
/// Creates a new ChromeDriver pool with task-per-instance tracking.
pub async fn new_with_task_limit(
/// Creates a new pool with task-per-instance limit but no proxy.
pub async fn new_with_task_limit(pool_size: usize, max_tasks_per_instance: usize) -> Result<Self> {
Self::new_with_proxy_and_task_limit(pool_size, None, max_tasks_per_instance).await
}
/// Creates a new pool where each Chrome instance uses a different SOCKS5 proxy from the Docker pool.
pub async fn new_with_proxy(
pool_size: usize,
max_tasks_per_instance: usize,
proxy_pool: Option<Arc<DockerVpnProxyPool>>,
) -> Result<Self> {
Self::new_with_vpn_and_task_limit(pool_size, None, max_tasks_per_instance).await
Self::new_with_proxy_and_task_limit(pool_size, proxy_pool, 0).await
}
/// Creates a new pool with VPN support.
pub async fn new_with_vpn(
/// Full constructor: supports proxy + task limiting.
pub async fn new_with_proxy_and_task_limit(
pool_size: usize,
vpn_pool: Option<Arc<VpnPool>>,
) -> Result<Self> {
Self::new_with_vpn_and_task_limit(pool_size, vpn_pool, 0).await
}
/// Creates a new pool with VPN support and task-per-instance limits.
pub async fn new_with_vpn_and_task_limit(
pool_size: usize,
vpn_pool: Option<Arc<VpnPool>>,
proxy_pool: Option<Arc<DockerVpnProxyPool>>,
max_tasks_per_instance: usize,
) -> Result<Self> {
let mut instances = Vec::with_capacity(pool_size);
#[cfg(target_os = "windows")]
let forcebindip = if vpn_pool.is_some() {
match ForceBindIpManager::new() {
Ok(manager) => {
crate::util::logger::log_info("✓ ForceBindIP manager initialized").await;
Some(Arc::new(manager))
}
Err(e) => {
crate::util::logger::log_warn(&format!(
"⚠ ForceBindIP not available: {}. Proceeding without IP binding.",
e
)).await;
None
}
}
} else {
None
};
crate::util::logger::log_info(&format!(
"Initializing ChromeDriver pool with {} instances{}{}...",
"Initializing ChromeDriver pool with {} instances{}...",
pool_size,
if vpn_pool.is_some() { " (VPN-enabled)" } else { "" },
if max_tasks_per_instance > 0 { &format!(" (max {} tasks/instance)", max_tasks_per_instance) } else { "" }
)).await;
if proxy_pool.is_some() { " (each using a unique Docker SOCKS5 proxy)" } else { "" }
))
.await;
for i in 0..pool_size {
// If VPN pool exists, acquire a VPN instance for this ChromeDriver
let vpn_instance = if let Some(ref vp) = vpn_pool {
Some(vp.acquire().await?)
} else {
None
};
let proxy_url = proxy_pool
.as_ref()
.map(|pp| pp.get_proxy_url(i));
#[cfg(target_os = "windows")]
let instance = ChromeInstance::new_with_task_limit(vpn_instance, forcebindip.clone(), max_tasks_per_instance).await?;
#[cfg(not(target_os = "windows"))]
let instance = ChromeInstance::new_with_task_limit(vpn_instance, max_tasks_per_instance).await?;
let instance = ChromeInstance::new(proxy_url, max_tasks_per_instance).await?;
crate::util::logger::log_info(&format!(" Instance {} ready", i + 1)).await;
crate::util::logger::log_info(&format!(" Instance {} ready", i + 1)).await;
instances.push(Arc::new(Mutex::new(instance)));
}
Ok(Self {
instances,
semaphore: Arc::new(Semaphore::new(pool_size)),
vpn_pool,
#[cfg(target_os = "windows")]
forcebindip,
proxy_pool,
})
}
/// Executes a scrape task using an available instance from the pool.
/// Execute a scraping task using an available instance from the pool.
pub async fn execute<T, F, Fut>(&self, url: String, parse: F) -> Result<T>
where
T: Send + 'static,
F: FnOnce(Client) -> Fut + Send + 'static,
Fut: std::future::Future<Output = Result<T>> + Send + 'static,
Fut: std::future::Future<Output = Result<T>> + Send,
{
// Acquire semaphore permit
let _permit = self
.semaphore
.acquire()
.await
.map_err(|_| anyhow!("Semaphore closed"))?;
let _permit = self.semaphore.acquire().await.map_err(|_| anyhow!("Pool closed"))?;
// Find an available instance (round-robin or first available)
let instance = self.instances[0].clone();
// Round-robin selection
let index = rand::random_range(..self.instances.len());
let instance = self.instances[index].clone();
let mut guard = instance.lock().await;
// Track task count
guard.increment_task_count();
// Get VPN info before creating session
let vpn_info = if let Some(ref vpn) = guard.vpn_instance {
let vpn_guard = vpn.lock().await;
Some(format!("{} ({})",
vpn_guard.hostname(),
vpn_guard.external_ip().unwrap_or("unknown")))
} else {
None
};
// Log task count if limit is set
if guard.max_tasks_per_instance > 0 {
crate::util::logger::log_info(&format!(
"Instance task count: {}/{}",
guard.get_task_count(),
guard.max_tasks_per_instance
)).await;
))
.await;
}
// Create a new session for this task
let client = guard.new_session().await?;
// Release lock while we do the actual scraping
drop(guard);
drop(guard); // release lock early
// Navigate and parse
if let Some(ref info) = vpn_info {
crate::util::logger::log_info(&format!("Scraping {} via VPN: {}", url, info)).await;
}
crate::util::logger::log_info(&format!("Scraping {} ...", url)).await;
client.goto(&url).await.context("Navigation failed")?;
client.goto(&url).await.context("Failed to navigate")?;
let result = timeout(Duration::from_secs(60), parse(client))
let result = timeout(Duration::from_secs(90), parse(client))
.await
.context("Parse function timed out after 60s")??;
// Handle VPN rotation if needed
if let Some(ref vpn_pool) = self.vpn_pool {
let mut guard = instance.lock().await;
if let Some(ref vpn) = guard.vpn_instance {
vpn_pool.rotate_if_needed(vpn.clone()).await?;
guard.reset_task_count(); // Reset task count on VPN rotation
}
}
.context("Parse timeout")??;
Ok(result)
}
/// Gracefully shut down all ChromeDriver processes and Docker proxy containers.
pub async fn shutdown(&self) -> Result<()> {
for inst in &self.instances {
let mut guard = inst.lock().await;
guard.shutdown().await?;
}
if let Some(pp) = &self.proxy_pool {
pp.shutdown().await?;
crate::util::logger::log_info("All Docker VPN proxy containers stopped").await;
}
Ok(())
}
pub fn get_number_of_instances(&self) -> usize {
self.instances.len()
}
/// Returns whether VPN is enabled for this pool
pub fn is_vpn_enabled(&self) -> bool {
self.vpn_pool.is_some()
}
/// Gracefully shutdown all ChromeDriver instances in the pool.
pub async fn shutdown(&self) -> Result<()> {
crate::util::logger::log_info("Shutting down ChromeDriverPool instances...").await;
for inst in &self.instances {
crate::util::logger::log_info("Shutting down a ChromeDriver instance...").await;
let mut guard = inst.lock().await;
if let Err(e) = guard.shutdown().await {
crate::util::logger::log_warn(&format!("Error shutting down instance: {}", e)).await;
}
}
crate::util::logger::log_info("All ChromeDriver instances shut down").await;
Ok(())
}
}
/// Represents a single instance of chromedriver process, optionally bound to a VPN.
pub struct ChromeInstance {
process: Child,
base_url: String,
vpn_instance: Option<Arc<Mutex<VpnInstance>>>,
process: Child,
stderr_log: Option<JoinHandle<()>>,
task_count: usize,
max_tasks_per_instance: usize,
// Optional join handle for background stderr logging task
stderr_log: Option<JoinHandle<()>>,
proxy_url: Option<String>,
}
impl ChromeInstance {
/// Creates a new ChromeInstance, optionally bound to a VPN IP.
#[cfg(target_os = "windows")]
pub async fn new(
vpn_instance: Option<Arc<Mutex<VpnInstance>>>,
forcebindip: Option<Arc<ForceBindIpManager>>,
) -> Result<Self> {
Self::new_with_task_limit(vpn_instance, forcebindip, 0).await
}
/// Creates a new ChromeInstance with task-per-instance limit, bound to a VPN IP if provided.
#[cfg(target_os = "windows")]
pub async fn new_with_task_limit(
vpn_instance: Option<Arc<Mutex<VpnInstance>>>,
forcebindip: Option<Arc<ForceBindIpManager>>,
max_tasks_per_instance: usize,
) -> Result<Self> {
let bind_ip = if let Some(ref vpn) = vpn_instance {
let vpn_guard = vpn.lock().await;
vpn_guard.external_ip().map(|s| s.to_string())
} else {
None
};
let mut command = if let (Some(ip), Some(fb)) = (&bind_ip, &forcebindip) {
// Use ForceBindIP to bind ChromeDriver to specific VPN IP
crate::util::logger::log_info(&format!("Binding ChromeDriver to VPN IP: {}", ip)).await;
let mut std_cmd = fb.create_bound_command(
ip,
std::path::Path::new("chromedriver-win64/chromedriver.exe"),
&["--port=0"],
);
Command::from(std_cmd)
} else {
let mut cmd = Command::new("chromedriver-win64/chromedriver.exe");
cmd.arg("--port=0");
cmd
};
command.stdout(Stdio::piped()).stderr(Stdio::piped());
let mut process = command
.spawn()
.context("Failed to spawn chromedriver. Ensure it's installed and in PATH.")?;
let (base_url, stderr_handle) = Self::wait_for_chromedriver_start(&mut process).await?;
pub async fn new(proxy_url: Option<String>, max_tasks_per_instance: usize) -> Result<Self> {
let (base_url, process, stderr_handle) = Self::spawn_chromedriver().await?;
Ok(Self {
process,
base_url,
vpn_instance,
process,
stderr_log: Some(stderr_handle),
task_count: 0,
max_tasks_per_instance,
stderr_log: stderr_handle,
proxy_url,
})
}
/// Creates a new ChromeInstance on non-Windows platforms (no ForceBindIP support).
#[cfg(not(target_os = "windows"))]
pub async fn new(vpn_instance: Option<Arc<Mutex<VpnInstance>>>) -> Result<Self> {
Self::new_with_task_limit(vpn_instance, 0).await
}
/// Creates a new ChromeInstance on non-Windows platforms with task-per-instance limit.
#[cfg(not(target_os = "windows"))]
pub async fn new_with_task_limit(vpn_instance: Option<Arc<Mutex<VpnInstance>>>, max_tasks_per_instance: usize) -> Result<Self> {
if vpn_instance.is_some() {
crate::util::logger::log_warn(
"⚠ VPN binding requested but ForceBindIP is not available on this platform"
).await;
}
let mut command = Command::new("chromedriver");
command
.arg("--port=0")
.stdout(Stdio::piped())
.stderr(Stdio::piped());
let mut process = command
.spawn()
.context("Failed to spawn chromedriver. Ensure it's installed and in PATH.")?;
let (base_url, stderr_handle) = Self::wait_for_chromedriver_start(&mut process).await?;
Ok(Self {
process,
base_url,
vpn_instance,
task_count: 0,
max_tasks_per_instance,
stderr_log: stderr_handle,
})
}
/// Waits for ChromeDriver to start and extracts the listening address.
async fn wait_for_chromedriver_start(process: &mut Child) -> Result<(String, Option<JoinHandle<()>>)> {
let mut stdout =
BufReader::new(process.stdout.take().context("Failed to capture stdout")?).lines();
let stderr_reader = process.stderr.take().context("Failed to capture stderr")?;
let start_time = std::time::Instant::now();
let mut address: Option<String> = None;
let mut success = false;
// Log stderr in background for debugging and return the JoinHandle so we can
// abort/await it during shutdown.
let stderr_handle: JoinHandle<()> = tokio::spawn(async move {
let mut stderr_lines = BufReader::new(stderr_reader).lines();
while let Ok(Some(line)) = stderr_lines.next_line().await {
let trimmed = line.trim();
if !trimmed.is_empty() {
crate::util::logger::log_info(&format!("ChromeDriver stderr: {}", trimmed)).await;
}
}
});
// Wait for address and success (up to 30s)
while start_time.elapsed() < Duration::from_secs(30) {
if let Ok(Ok(Some(line))) = timeout(Duration::from_secs(1), stdout.next_line()).await {
if let Some(addr) = parse_chromedriver_address(&line) {
address = Some(addr.to_string());
}
if line.contains("ChromeDriver was started successfully") {
success = true;
}
if let (Some(addr), true) = (&address, success) {
return Ok((addr.clone(), Some(stderr_handle)));
}
}
sleep(Duration::from_millis(100)).await;
}
// Cleanup on failure
let _ = process.kill().await;
// If we timed out, abort stderr logging task
stderr_handle.abort();
let _ = stderr_handle.await;
Err(anyhow!("Timeout: ChromeDriver did not start within 30 seconds"))
}
/// Creates a new browser session (client) from this ChromeDriver instance.
pub async fn new_session(&self) -> Result<Client> {
ClientBuilder::native()
.capabilities(Self::chrome_args())
.capabilities(self.chrome_args())
.connect(&self.base_url)
.await
.context("Failed to create new session")
.context("Failed to connect to ChromeDriver")
}
/// Increments task counter and returns whether limit has been reached
pub fn increment_task_count(&mut self) -> bool {
if self.max_tasks_per_instance > 0 {
self.task_count += 1;
self.task_count >= self.max_tasks_per_instance
} else {
false
}
pub fn increment_task_count(&mut self) {
self.task_count += 1;
}
/// Resets task counter (called when VPN is rotated)
pub fn reset_task_count(&mut self) {
self.task_count = 0;
}
/// Returns current task count for this instance
pub fn get_task_count(&self) -> usize {
self.task_count
}
/// Gracefully shutdown the chromedriver process and background log tasks.
pub async fn shutdown(&mut self) -> Result<()> {
// Abort and await stderr logging task if present
if let Some(handle) = self.stderr_log.take() {
handle.abort();
let _ = handle.await;
}
// Try to terminate the child process
let _ = self.process.start_kill();
// Await the process to ensure resources are released
let _ = self.process.wait().await;
Ok(())
}
fn chrome_args() -> Map<String, Value> {
let args = serde_json::json!({
/// Spawns the actual `chromedriver` binary and waits for it to become ready.
async fn spawn_chromedriver() -> Result<(String, Child, JoinHandle<()>)> {
let mut process = Command::new("chromedriver-win64/chromedriver.exe")
.arg("--port=0") // let OS choose free port
.stdout(Stdio::piped())
.stderr(Stdio::piped())
.spawn()
.context("Failed to start chromedriver. Is it in PATH?")?;
let stdout = process.stdout.take().unwrap();
let stderr = process.stderr.take().unwrap();
let stdout_reader = BufReader::new(stdout);
let mut stdout_lines = stdout_reader.lines();
let stderr_reader = BufReader::new(stderr);
let stderr_handle = tokio::spawn(async move {
let mut lines = stderr_reader.lines();
while let Ok(Some(line)) = lines.next_line().await {
let t = line.trim();
if !t.is_empty() {
let _ = crate::util::logger::log_info(&format!("ChromeDriver: {}", t)).await;
}
}
});
let start = tokio::time::Instant::now();
let mut address: Option<String> = None;
while start.elapsed() < Duration::from_secs(30) {
if let Ok(Ok(Some(line))) = timeout(Duration::from_secs(1), stdout_lines.next_line()).await {
if let Some(addr) = parse_chromedriver_address(&line) {
address = Some(addr);
}
if line.contains("ChromeDriver was started successfully") && address.is_some() {
return Ok((address.unwrap(), process, stderr_handle));
}
}
sleep(Duration::from_millis(100)).await;
}
let _ = process.kill().await;
stderr_handle.abort();
Err(anyhow!("ChromeDriver failed to start within 30s"))
}
fn chrome_args(&self) -> Map<String, Value> {
let mut args = vec![
"--headless=new".to_string(),
"--disable-gpu".to_string(),
"--no-sandbox".to_string(),
"--disable-dev-shm-usage".to_string(),
"--disable-infobars".to_string(),
"--disable-extensions".to_string(),
"--disable-popup-blocking".to_string(),
"--disable-notifications".to_string(),
"--disable-logging".to_string(),
"--disable-autofill".to_string(),
"--disable-sync".to_string(),
"--disable-default-apps".to_string(),
"--disable-translate".to_string(),
"--window-size=1920,1080".to_string(),
"--disable-blink-features=AutomationControlled".to_string(),
"--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36".to_string()
];
if let Some(ref proxy) = self.proxy_url {
let proxy = proxy.clone();
let proxy_formatted = format!("--proxy-server={}", proxy);
args.push(proxy_formatted);
}
let caps = serde_json::json!({
"goog:chromeOptions": {
"args": [
"--headless",
"--disable-gpu",
"--no-sandbox",
"--disable-dev-shm-usage",
"--disable-infobars",
"--disable-extensions",
"--disable-popup-blocking",
"--disable-notifications",
"--disable-logging",
"--disable-autofill",
"--disable-sync",
"--disable-default-apps",
"--disable-translate",
"--window-size=1920,1080",
"--disable-blink-features=AutomationControlled",
"--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
],
"args": args,
"excludeSwitches": ["enable-logging", "enable-automation"],
"prefs": {
"profile.default_content_setting_values.notifications": 2
}
}
});
args.as_object()
.expect("Capabilities should be a JSON object")
.clone()
caps.as_object().cloned().unwrap()
}
}