// src/scraper/webdriver.rs use super::helpers::*; use super::hard_reset::HardResetController; use super::docker_vpn_proxy::DockerVpnProxyPool; use crate::Config; use crate::logger; use anyhow::{anyhow, Context, Result}; use fantoccini::{Client, ClientBuilder}; use serde_json::{Map, Value}; use std::pin::Pin; use std::process::Stdio; use std::sync::Arc; use std::time::Instant; use tokio::io::{AsyncBufReadExt, BufReader}; use tokio::process::{Child, Command}; use tokio::task::JoinHandle; use tokio::sync::{Mutex, Semaphore}; use tokio::time::{sleep, timeout, Duration}; /// Manages a pool of ChromeDriver instances for parallel scraping with optional VPN binding. pub struct ChromeDriverPool { instances: Vec>>, semaphore: Arc, /// Optional Docker-based proxy pool (one proxy per Chrome instance) proxy_pool: Option>, /// Whether rotation is enabled (uses half of instances at a time) rotation_enabled: bool, /// Index for round-robin instance selection (when rotation is enabled) next_instance: Arc>, last_request_time: Arc>, min_request_interval_ms: u64, monitoring: Option, hard_reset_controller: Arc, config: Arc, } impl ChromeDriverPool { /// When consecutive errors reach this value, execute() will return a special error /// that signals the caller to trigger a hard reset const HARD_RESET_ERROR_THRESHOLD: usize = 12; /// Creates a new pool without any proxy (direct connection). pub async fn _new(config: &Config, monitoring: Option,) -> Result { Self::new_with_proxy_and_task_limit(None, config, monitoring).await } /// Creates a new pool with task-per-instance limit but no proxy. pub async fn _new_with_task_limit(config: &Config, monitoring: Option,) -> Result { Self::new_with_proxy_and_task_limit(None, config, monitoring).await } /// Creates a new pool where each Chrome instance uses a different SOCKS5 proxy from the Docker pool. pub async fn new_with_proxy( proxy_pool: Option>, config: &Config, monitoring: Option, ) -> Result { Self::new_with_proxy_and_task_limit(proxy_pool, config, monitoring).await } /// Full constructor: supports proxy + task limiting + rotation. /// /// When rotation is enabled, only half of the instances are used at once, /// rotating to the other half when task limits are reached. /// /// The actual pool_size is constrained by: /// - max_parallel_instances from config (pool_size_limit parameter) /// - Available proxies from proxy_pool (if provided) /// /// Uses the minimum of these constraints to determine actual pool size. pub async fn new_with_proxy_and_task_limit( proxy_pool: Option>, config: &Config, monitoring: Option, ) -> Result { let pool_size_limit = config.max_parallel_instances; let task_per_instance_limit = config.max_tasks_per_instance; // Determine actual pool size based on available resources let actual_pool_size = if let Some(ref pp) = proxy_pool { let available_proxies = pp.num_proxies(); pool_size_limit.min(available_proxies) } else { pool_size_limit }; if actual_pool_size == 0 { return Err(anyhow!("Pool size must be at least 1")); } // Rotation is enabled when task limiting is active let rotation_enabled = task_per_instance_limit > 0; let half_size = if rotation_enabled { (actual_pool_size + 1) / 2 // Round up for odd numbers } else { actual_pool_size }; let mut instances = Vec::with_capacity(actual_pool_size); crate::util::logger::log_info(&format!( "Initializing ChromeDriver pool with {} instances{}{}...", actual_pool_size, if proxy_pool.is_some() { " (each using a unique Docker SOCKS5 proxy)" } else { "" }, if rotation_enabled { " with rotation enabled" } else { "" } )) .await; if rotation_enabled && actual_pool_size < 2 { crate::util::logger::log_warn( "Rotation enabled but pool has < 2 instances - rotation will be limited" ).await; } for i in 0..actual_pool_size { // Pass the entire proxy_pool and the index let instance = ChromeInstance::new( proxy_pool.clone(), i, config, monitoring.clone(), ).await?; crate::util::logger::log_info(&format!(" Instance {} ready", i + 1)).await; instances.push(Arc::new(Mutex::new(instance))); } // Emit instance created events for (i, instance) in instances.iter().enumerate() { if let Some(ref mon) = monitoring { let guard = instance.lock().await; // Extract proxy info if available let proxy_info = if let Some(ref pp) = proxy_pool { pp.get_proxy_info(i % pp.num_proxies()) } else { guard.proxy_url.as_ref().and_then(|url| { // Parse proxy URL manually if no pool // Format: socks5://localhost:10801 if let Some(port_str) = url.split(':').last() { if let Ok(port) = port_str.parse::() { return Some(crate::monitoring::ProxyInfo { container_name: format!("proxy-{}", i), ip_address: "127.0.0.1".to_string(), port, status: crate::monitoring::ProxyStatus::Connected, }); } } None }) }; mon.emit(crate::monitoring::MonitoringEvent::InstanceCreated { instance_id: i, max_tasks: guard.max_tasks_per_instance, proxy: proxy_info.clone(), }); // Also emit ProxyConnected event if proxy exists if let Some(ref proxy) = proxy_info { mon.emit(crate::monitoring::MonitoringEvent::ProxyConnected { container_name: proxy.container_name.clone(), ip_address: proxy.ip_address.clone(), port: proxy.port, }); } drop(guard); } } let min_request_interval_ms = config.min_request_interval_ms; let hard_reset_controller = Arc::new(HardResetController::new()); let config_clone = Arc::new(config.clone()); Ok(Self { instances, semaphore: Arc::new(Semaphore::new(half_size)), proxy_pool, rotation_enabled, next_instance: Arc::new(Mutex::new(0)), last_request_time: Arc::new(Mutex::new(Instant::now())), min_request_interval_ms, monitoring, hard_reset_controller, config: config_clone, }) } pub async fn execute(&self, url: String, parse: F) -> Result where T: Send + 'static, F: FnOnce(Client) -> Fut + Send + 'static, Fut: std::future::Future> + Send, { let _permit = self.semaphore.acquire().await.map_err(|_| anyhow!("Pool closed"))?; { let mut last_time = self.last_request_time.lock().await; let elapsed = last_time.elapsed().as_millis() as u64; if elapsed < self.min_request_interval_ms { let wait_ms = self.min_request_interval_ms - elapsed; drop(last_time); sleep(Duration::from_millis(wait_ms)).await; let mut last_time = self.last_request_time.lock().await; *last_time = Instant::now(); } else { *last_time = Instant::now(); } } let instance = if self.rotation_enabled { self.select_instance_with_rotation().await? } else { self.select_instance_round_robin().await }; { let mut inst = instance.lock().await; inst.increment_task_count(); } let index: usize = { let instances = &self.instances; instances.iter().position(|inst| Arc::ptr_eq(inst, &instance)).unwrap_or(0) }; if let Some(ref mon) = self.monitoring { mon.emit(crate::monitoring::MonitoringEvent::TaskStarted { instance_id: index, url: url.clone(), }); mon.emit(crate::monitoring::MonitoringEvent::InstanceStatusChanged { instance_id: index, status: crate::monitoring::InstanceStatusChange::Active, }); }; let mut guard = instance.lock().await; let client = guard.get_or_renew_session().await?; let (task_count, session_requests) = guard.get_session_stats().await; crate::util::logger::log_info(&format!( "Instance {} executing task (tasks: {}/{}, session requests: {})", index, task_count, guard.max_tasks_per_instance, session_requests )).await; drop(guard); let start_time = Instant::now(); // Navigation with timeout let navigation_result = timeout( Duration::from_secs(60), client.goto(&url) ).await; let result = match navigation_result { Ok(Ok(_)) => { if let Some(ref mon) = self.monitoring { mon.emit(crate::monitoring::MonitoringEvent::TaskCompleted { instance_id: index, success: navigation_result.is_ok(), duration_ms: start_time.elapsed().as_millis() as u64, error: navigation_result.as_ref().err().map(|e| e.to_string()), }); mon.emit(crate::monitoring::MonitoringEvent::InstanceStatusChanged { instance_id: index, status: crate::monitoring::InstanceStatusChange::Idle, }); } crate::util::logger::log_info(&format!("✓ Navigated to {}", url)).await; // Execute parse function match parse(client).await { Ok(data) => { // SUCCESS: Record and log let prev_count = self.hard_reset_controller.get_count(); self.hard_reset_controller.record_success(); if prev_count > 0 { logger::log_info(&format!( "✓ Success - reset counter cleared (was: {}/{})", prev_count, Self::HARD_RESET_ERROR_THRESHOLD )).await; } Ok(data) } Err(e) => { // PARSE ERROR: Record, check threshold, invalidate session let error_count = self.hard_reset_controller.record_error(); { let mut inst = instance.lock().await; inst.invalidate_current_session().await; } // Enhanced logging with threshold status let threshold_pct = (error_count as f64 / Self::HARD_RESET_ERROR_THRESHOLD as f64) * 100.0; logger::log_warn(&format!( "Parse error. Reset counter: {}/{} ({:.0}%)", error_count, Self::HARD_RESET_ERROR_THRESHOLD, threshold_pct )).await; // Check if threshold reached if error_count >= Self::HARD_RESET_ERROR_THRESHOLD { logger::log_error(&format!( "🔴 HARD RESET THRESHOLD REACHED ({}/{})", error_count, Self::HARD_RESET_ERROR_THRESHOLD )).await; return Err(anyhow!( "HARD_RESET_REQUIRED: Parse failed: {}. Threshold reached ({}/{})", e, error_count, Self::HARD_RESET_ERROR_THRESHOLD )); } Err(anyhow!( "Parse failed: {}. Hard reset at {}/{}", e, error_count, Self::HARD_RESET_ERROR_THRESHOLD )) } } } Ok(Err(e)) => { // ❌ NAVIGATION ERROR: Record, check threshold, invalidate session crate::util::logger::log_error(&format!("Navigation failed: {}", e)).await; { let mut inst = instance.lock().await; inst.invalidate_current_session().await; } let error_count = self.hard_reset_controller.record_error(); // Enhanced logging let threshold_pct = (error_count as f64 / Self::HARD_RESET_ERROR_THRESHOLD as f64) * 100.0; logger::log_warn(&format!( "Navigation error. Reset counter: {}/{} ({:.0}%)", error_count, Self::HARD_RESET_ERROR_THRESHOLD, threshold_pct )).await; // Check if threshold reached if error_count >= Self::HARD_RESET_ERROR_THRESHOLD { logger::log_error(&format!( "🔴 HARD RESET THRESHOLD REACHED ({}/{})", error_count, Self::HARD_RESET_ERROR_THRESHOLD )).await; return Err(anyhow!( "HARD_RESET_REQUIRED: Navigation failed: {}. Threshold reached ({}/{})", e, error_count, Self::HARD_RESET_ERROR_THRESHOLD )); } Err(anyhow!( "Navigation failed: {}. Hard reset at {}/{}", e, error_count, Self::HARD_RESET_ERROR_THRESHOLD )) } Err(_) => { // ❌ TIMEOUT ERROR: Record, check threshold, invalidate session if let Some(ref mon) = self.monitoring { mon.emit(crate::monitoring::MonitoringEvent::NavigationTimeout { instance_id: index, url: url.clone(), }); } let error_count = self.hard_reset_controller.record_error(); crate::util::logger::log_error("Navigation timeout (60s)").await; { let mut inst = instance.lock().await; inst.invalidate_current_session().await; } // Enhanced logging let threshold_pct = (error_count as f64 / Self::HARD_RESET_ERROR_THRESHOLD as f64) * 100.0; logger::log_warn(&format!( "Timeout error. Reset counter: {}/{} ({:.0}%)", error_count, Self::HARD_RESET_ERROR_THRESHOLD, threshold_pct )).await; // Check if threshold reached if error_count >= Self::HARD_RESET_ERROR_THRESHOLD { logger::log_error(&format!( "🔴 HARD RESET THRESHOLD REACHED ({}/{})", error_count, Self::HARD_RESET_ERROR_THRESHOLD )).await; return Err(anyhow!( "HARD_RESET_REQUIRED: Navigation timeout. Threshold reached ({}/{})", error_count, Self::HARD_RESET_ERROR_THRESHOLD )); } Err(anyhow!( "Navigation timeout. Hard reset at {}/{}", error_count, Self::HARD_RESET_ERROR_THRESHOLD )) } }; { let mut inst = instance.lock().await; inst.task_count = inst.task_count.saturating_sub(1); } result } /// Simple round-robin instance selection (no rotation) async fn select_instance_round_robin(&self) -> Arc> { let mut next = self.next_instance.lock().await; let index = *next; *next = (*next + 1) % self.instances.len(); drop(next); Arc::clone(&self.instances[index]) } /// Round-robin with half-pool rotation async fn select_instance_with_rotation(&self) -> Result>> { let pool_size = self.instances.len(); let half_size = pool_size / 2; if half_size == 0 { // Pool too small for rotation, fall back to simple round-robin return Ok(self.select_instance_round_robin().await); } let mut next = self.next_instance.lock().await; let current_half_start = (*next / half_size) * half_size; let current_half_end = (current_half_start + half_size).min(pool_size); // Try to find available instance in current half let mut attempts = 0; let max_attempts = half_size * 2; // Try both halves while attempts < max_attempts { let index = current_half_start + (*next % half_size); let instance = &self.instances[index]; // Check if instance can accept more tasks let mut inst = instance.lock().await; let can_accept = inst.get_task_count() < inst.max_tasks_per_instance; drop(inst); if can_accept { *next = (*next + 1) % pool_size; drop(next); if let Some(ref mon) = self.monitoring { mon.emit(crate::monitoring::MonitoringEvent::InstanceSelected { instance_id: index, half: if index < half_size { 1 } else { 2 }, }); } return Ok(Arc::clone(instance)); } // Current half saturated, try other half if attempts == half_size - 1 { logger::log_info("Current half saturated, rotating to other half").await; *next = if current_half_start == 0 { half_size } else { 0 }; } else { *next = (*next + 1) % pool_size; } attempts += 1; } drop(next); // All instances saturated Err(anyhow!("All instances at task capacity")) } pub fn get_reset_controller(&self) -> Arc { Arc::clone(&self.hard_reset_controller) } /// Check if hard reset threshold has been reached pub fn should_perform_hard_reset(&self) -> bool { self.hard_reset_controller.get_count() >= Self::HARD_RESET_ERROR_THRESHOLD } /// Get current error count and threshold for monitoring pub fn get_reset_status(&self) -> (usize, usize) { ( self.hard_reset_controller.get_count(), Self::HARD_RESET_ERROR_THRESHOLD ) } /// Gracefully shut down all ChromeDriver processes and Docker proxy containers. /// ✅ FIXED: Now with proper error propagation and Chrome process cleanup pub async fn shutdown(&self) -> Result<()> { logger::log_info(&format!("Shutting down {} ChromeDriver instances...", self.instances.len())).await; let mut shutdown_errors = Vec::new(); for (i, inst) in self.instances.iter().enumerate() { logger::log_info(&format!(" Shutting down instance {}...", i)).await; let mut guard = inst.lock().await; if let Err(e) = guard.shutdown().await { logger::log_error(&format!(" ✗ Instance {} shutdown error: {}", i, e)).await; shutdown_errors.push(format!("Instance {}: {}", i, e)); } else { logger::log_info(&format!(" ✓ Instance {} shut down", i)).await; } } if let Some(pp) = &self.proxy_pool { logger::log_info("Shutting down proxy pool...").await; if let Err(e) = pp.shutdown().await { logger::log_error(&format!("Proxy pool shutdown error: {}", e)).await; shutdown_errors.push(format!("Proxy pool: {}", e)); } else { logger::log_info("✓ Proxy pool shut down").await; } } if !shutdown_errors.is_empty() { return Err(anyhow!( "Pool shutdown completed with {} error(s): {}", shutdown_errors.len(), shutdown_errors.join("; ") )); } logger::log_info("✓ All ChromeDriver instances shut down successfully").await; Ok(()) } pub fn get_number_of_instances(&self) -> usize { self.instances.len() } /// Returns whether rotation is enabled pub fn is_rotation_enabled(&self) -> bool { self.rotation_enabled } /// Returns the size of each half when rotation is enabled pub fn get_rotation_half_size(&self) -> usize { if self.rotation_enabled { (self.instances.len() + 1) / 2 } else { self.instances.len() } } pub fn get_proxy_pool(&self) -> Option> { self.proxy_pool.clone() } } /// Represents a single instance of chromedriver process, optionally bound to a VPN. pub struct ChromeInstance { base_url: String, process: Child, stderr_log: Option>, task_count: usize, max_tasks_per_instance: usize, proxy_url: Option, current_session: Arc>>, // Current active session session_request_count: Arc>, max_requests_per_session: usize, proxy_pool: Option>, // Reference to the proxy pool current_proxy_index: Arc>, // Current proxy index in use instance_id: usize, monitoring: Option, } impl ChromeInstance { pub async fn new(proxy_pool: Option>, instance_id: usize, config: &Config, monitoring: Option) -> Result { let (base_url, process, stderr_handle) = Self::spawn_chromedriver().await?; // Get proxy URL if proxy pool is provided let proxy_url = proxy_pool.as_ref().map(|pp| pp.get_proxy_url(instance_id)); let max_tasks_per_instance = config.max_tasks_per_instance; let max_requests_per_session = config.max_requests_per_session; Ok(Self { base_url, process, stderr_log: Some(stderr_handle), task_count: 0, max_tasks_per_instance, proxy_url, current_session: Arc::new(Mutex::new(None)), session_request_count: Arc::new(Mutex::new(0)), max_requests_per_session, proxy_pool, current_proxy_index: Arc::new(Mutex::new(instance_id)), instance_id, monitoring, }) } pub async fn get_or_renew_session(&mut self) -> Result { let mut session_opt = self.current_session.lock().await; let mut request_count = self.session_request_count.lock().await; // Session renewal conditions: // 1. No session exists // 2. Request limit reached let needs_renewal = session_opt.is_none() || *request_count >= self.max_requests_per_session; if needs_renewal { if let Some(ref mon) = self.monitoring { mon.emit(crate::monitoring::MonitoringEvent::InstanceStatusChanged { instance_id: self.instance_id, status: crate::monitoring::InstanceStatusChange::Renewing, }); } // ✅ FIXED: Close old session with proper error handling if let Some(old_session) = session_opt.take() { crate::util::logger::log_info("Closing old session").await; // Try to close gracefully first if let Err(e) = old_session.close().await { logger::log_warn(&format!("Session close failed (may leave Chrome tabs open): {}", e)).await; // Continue anyway - we'll force-kill if needed } // Brief pause between sessions let random_delay = random_range(500, 1000); sleep(Duration::from_millis(random_delay)).await; } // Create new session with fresh User-Agent crate::util::logger::log_info(&format!( "Creating new session (requests in last session: {})", *request_count )).await; let new_session = self.create_fresh_session().await?; *session_opt = Some(new_session.clone()); *request_count = 0; if let Some(ref mon) = self.monitoring { let reason = if *request_count >= self.max_requests_per_session { crate::monitoring::RenewalReason::RequestLimit } else { crate::monitoring::RenewalReason::TaskLimit }; // Get updated proxy info let new_proxy_info = if let Some(ref pp) = self.proxy_pool { let proxy_idx = *self.current_proxy_index.lock().await; pp.get_proxy_info(proxy_idx) } else { self.proxy_url.as_ref().and_then(|url| { if let Some(port_str) = url.split(':').last() { if let Ok(port) = port_str.parse::() { return Some(crate::monitoring::ProxyInfo { container_name: format!("proxy-{}", self.instance_id), ip_address: "127.0.0.1".to_string(), port, status: crate::monitoring::ProxyStatus::Connected, }); } } None }) }; mon.emit(crate::monitoring::MonitoringEvent::SessionRenewed { instance_id: self.instance_id, old_request_count: *request_count, reason: reason, new_proxy: new_proxy_info, }); } Ok(new_session) } else { // Use existing session *request_count += 1; Ok(session_opt.as_ref().unwrap().clone()) } } async fn create_fresh_session(&self) -> Result { let proxy_url = if let Some(ref pool) = self.proxy_pool { let mut proxy_idx = self.current_proxy_index.lock().await; let num_proxies = pool.num_proxies(); // Round-robin through all proxies let selected_proxy = *proxy_idx % num_proxies; *proxy_idx = (*proxy_idx + 1) % num_proxies; let url = pool.get_proxy_url(selected_proxy); logger::log_info(&format!( "Instance {} creating session with proxy {}/{} (rotation)", self.instance_id, selected_proxy, num_proxies )).await; Some(url) } else { self.proxy_url.clone() }; let user_agent = Self::chrome_user_agent(); let capabilities = self.chrome_args_with_ua(user_agent, &proxy_url); let client = ClientBuilder::native() .capabilities(capabilities) .connect(&self.base_url) .await .context("Failed to connect to ChromeDriver")?; // ✅ NEW: Extract and store Chrome PID for cleanup // Chrome process info can be extracted from session info if needed // For now, we rely on killing the process tree Ok(client) } pub async fn invalidate_current_session(&self) { let mut session_opt = self.current_session.lock().await; if let Some(old_session) = session_opt.take() { crate::util::logger::log_info(&format!( "Invalidating broken session for instance {}", self.instance_id )).await; // ✅ FIXED: Proper error handling instead of silent failure if let Err(e) = old_session.close().await { logger::log_warn(&format!( "Failed to close broken session (Chrome tabs may remain): {}", e )).await; } } let mut request_count = self.session_request_count.lock().await; *request_count = 0; } pub fn reset_task_count(&mut self) { self.task_count = 0; } pub async fn get_session_stats(&self) -> (usize, usize) { let request_count = *self.session_request_count.lock().await; (self.task_count, request_count) } pub fn increment_task_count(&mut self) { self.task_count += 1; } pub fn get_task_count(&self) -> usize { self.task_count } /// ✅ FIXED: Proper Chrome + ChromeDriver shutdown with process tree killing pub async fn shutdown(&mut self) -> Result<()> { logger::log_info(&format!("Shutting down ChromeInstance {}...", self.instance_id)).await; // Step 1: Close any active session to signal Chrome to close { let mut session_opt = self.current_session.lock().await; if let Some(session) = session_opt.take() { logger::log_info(" Closing active session...").await; if let Err(e) = session.close().await { logger::log_warn(&format!(" Session close failed: {}", e)).await; } } } // Step 2: Abort stderr logging task if let Some(handle) = self.stderr_log.take() { handle.abort(); let _ = handle.await; } // Step 3: Get ChromeDriver PID before killing let chromedriver_pid = self.process.id(); logger::log_info(&format!(" ChromeDriver PID: {:?}", chromedriver_pid)).await; // Step 4: Kill ChromeDriver and wait if let Err(e) = self.process.start_kill() { logger::log_warn(&format!(" Failed to kill ChromeDriver: {}", e)).await; } // Wait for ChromeDriver to exit (with timeout) match timeout(Duration::from_secs(5), self.process.wait()).await { Ok(Ok(status)) => { logger::log_info(&format!(" ChromeDriver exited with status: {:?}", status)).await; } Ok(Err(e)) => { logger::log_warn(&format!(" Error waiting for ChromeDriver: {}", e)).await; } Err(_) => { logger::log_warn(" ChromeDriver didn't exit within 5s").await; } } // Step 5: ✅ CRITICAL FIX: Force-kill Chrome process tree // On Windows, Chrome doesn't die when ChromeDriver dies if let Some(pid) = chromedriver_pid { logger::log_info(&format!(" Force-killing Chrome process tree for PID {}...", pid)).await; #[cfg(target_os = "windows")] { // Kill entire process tree on Windows let _ = Command::new("taskkill") .args(["/F", "/T", "/PID", &pid.to_string()]) .output() .await; // Also kill any remaining chrome.exe processes let _ = Command::new("taskkill") .args(["/F", "/IM", "chrome.exe"]) .output() .await; } #[cfg(not(target_os = "windows"))] { // Kill process group on Unix let _ = Command::new("pkill") .args(["-P", &pid.to_string()]) .output() .await; } logger::log_info(" ✓ Chrome process tree killed").await; } // Step 6: Wait a moment for processes to fully terminate sleep(Duration::from_millis(500)).await; logger::log_info(&format!("✓ ChromeInstance {} shut down", self.instance_id)).await; Ok(()) } pub fn is_available(&self) -> bool { if self.max_tasks_per_instance == 0 { return true; // No limit } self.task_count < self.max_tasks_per_instance } pub fn tasks_remaining(&self) -> usize { if self.max_tasks_per_instance == 0 { return usize::MAX; } self.max_tasks_per_instance.saturating_sub(self.task_count) } /// Spawns the actual `chromedriver` binary and waits for it to become ready. async fn spawn_chromedriver() -> Result<(String, Child, JoinHandle<()>)> { let mut process = Command::new("chromedriver-win64/chromedriver.exe") .arg("--port=0") // let OS choose free port .stdout(Stdio::piped()) .stderr(Stdio::piped()) .spawn() .context("Failed to start chromedriver. Is it in PATH?")?; let stdout = process.stdout.take().unwrap(); let stderr = process.stderr.take().unwrap(); let stdout_reader = BufReader::new(stdout); let mut stdout_lines = stdout_reader.lines(); let stderr_reader = BufReader::new(stderr); let stderr_handle = tokio::spawn(async move { let mut lines = stderr_reader.lines(); while let Ok(Some(line)) = lines.next_line().await { let t = line.trim(); if !t.is_empty() { let _ = crate::util::logger::log_info(&format!("ChromeDriver: {}", t)).await; } } }); let start = tokio::time::Instant::now(); let mut address: Option = None; while start.elapsed() < Duration::from_secs(30) { if let Ok(Ok(Some(line))) = timeout(Duration::from_secs(1), stdout_lines.next_line()).await { if let Some(addr) = parse_chromedriver_address(&line) { address = Some(addr); } if line.contains("ChromeDriver was started successfully") && address.is_some() { return Ok((address.unwrap(), process, stderr_handle)); } } sleep(Duration::from_millis(100)).await; } let _ = process.kill().await; stderr_handle.abort(); Err(anyhow!("ChromeDriver failed to start within 30s")) } fn chrome_args_with_ua(&self, user_agent: &str, proxy_url: &Option) -> Map { let mut args = vec![ "--headless=new".to_string(), "--disable-gpu".to_string(), "--no-sandbox".to_string(), "--disable-dev-shm-usage".to_string(), "--disable-infobars".to_string(), "--disable-extensions".to_string(), "--disable-popup-blocking".to_string(), "--disable-notifications".to_string(), "--disable-autofill".to_string(), "--disable-sync".to_string(), "--disable-default-apps".to_string(), "--disable-translate".to_string(), "--disable-blink-features=AutomationControlled".to_string(), format!("--user-agent={}", user_agent), ]; if let Some(proxy) = proxy_url { args.push(format!("--proxy-server={}", proxy)); } let caps = serde_json::json!({ "goog:chromeOptions": { "args": args, "excludeSwitches": ["enable-logging", "enable-automation"], "prefs": { "profile.default_content_setting_values.notifications": 2 } } }); caps.as_object().cloned().unwrap() } pub fn chrome_user_agent() -> &'static str { static UAS: &[&str] = &[ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.91 Safari/537.36", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.6312.122 Safari/537.36", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.6261.129 Safari/537.36", "Mozilla/5.0 (Windows NT 11.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.91 Safari/537.36", ]; let random_user_agent = choose_random(UAS); random_user_agent } } impl Drop for ChromeInstance { fn drop(&mut self) { // Signal both ChromeDriver and Chrome to terminate let _ = self.process.start_kill(); // Also try to kill Chrome if we know the PID if let Some(pid) = self.process.id() { #[cfg(target_os = "windows")] { // Fire and forget - this is best-effort cleanup let _ = std::process::Command::new("taskkill") .args(["/F", "/T", "/PID", &pid.to_string()]) .output(); } } } } fn parse_chromedriver_address(line: &str) -> Option { if line.contains("Starting ChromeDriver") { if let Some(port_str) = line.split("on port ").nth(1) { if let Some(port) = port_str.split_whitespace().next() { if port.parse::().is_ok() { return Some(format!("http://localhost:{}", port)); } } } } for word in line.split_whitespace() { if let Ok(port) = word.trim_matches(|c: char| !c.is_numeric()).parse::() { if port > 1024 && port < 65535 && line.to_lowercase().contains("port") { return Some(format!("http://localhost:{}", port)); } } } None } /// Simplified task execution - uses the pool pattern. pub struct ScrapeTask { url: String, parse: Box< dyn FnOnce(Client) -> Pin> + Send>> + Send, >, } impl ScrapeTask { pub fn new(url: String, parse: F) -> Self where F: FnOnce(Client) -> Fut + Send + 'static, Fut: std::future::Future> + Send + 'static, { Self { url, parse: Box::new(move |client| Box::pin(parse(client))), } } pub async fn execute_with_pool(self, pool: &ChromeDriverPool) -> Result { let url = self.url; let parse = self.parse; pool.execute(url, move |client| async move { (parse)(client).await }) .await } }