// src/scraper/webdriver.rs
use super::helpers::*;
use super::hard_reset::HardResetController;
use super::docker_vpn_proxy::DockerVpnProxyPool;
use crate::Config;
use crate::logger;
use anyhow::{anyhow, Context, Result};
use fantoccini::{Client, ClientBuilder};
use serde_json::{Map, Value};
use std::pin::Pin;
use std::process::Stdio;
use std::sync::Arc;
use std::time::Instant;
use tokio::io::{AsyncBufReadExt, BufReader};
use tokio::process::{Child, Command};
use tokio::task::JoinHandle;
use tokio::sync::{Mutex, Semaphore};
use tokio::time::{sleep, timeout, Duration};
/// Manages a pool of ChromeDriver instances for parallel scraping with optional VPN binding.
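/// # Example
///
/// A minimal usage sketch (marked `ignore`, not compiled as a doc-test); it assumes a
/// `Config` value is already loaded and runs without a Docker proxy pool or monitoring:
///
/// ```ignore
/// // Build a direct-connection pool from the loaded config.
/// let pool = ChromeDriverPool::new_with_proxy(None, &config, None).await?;
///
/// // Navigate to a page and extract its title inside the parse closure.
/// let title = pool
///     .execute("https://example.com".to_string(), |client| async move {
///         Ok(client.title().await?)
///     })
///     .await?;
///
/// // Always shut the pool down so ChromeDriver/Chrome processes are reaped.
/// pool.shutdown().await?;
/// ```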
pub struct ChromeDriverPool {
instances: Vec<Arc<Mutex<ChromeInstance>>>,
semaphore: Arc<Semaphore>,
/// Optional Docker-based proxy pool (one proxy per Chrome instance)
proxy_pool: Option<Arc<DockerVpnProxyPool>>,
/// Whether rotation is enabled (uses half of instances at a time)
rotation_enabled: bool,
/// Index for round-robin instance selection (when rotation is enabled)
next_instance: Arc<Mutex<usize>>,
last_request_time: Arc<Mutex<Instant>>,
min_request_interval_ms: u64,
monitoring: Option<crate::monitoring::MonitoringHandle>,
hard_reset_controller: Arc<HardResetController>,
config: Arc<Config>,
}
impl ChromeDriverPool {
/// When consecutive errors reach this value, execute() will return a special error
/// that signals the caller to trigger a hard reset
const HARD_RESET_ERROR_THRESHOLD: usize = 12;
/// Creates a new pool without any proxy (direct connection).
    pub async fn _new(config: &Config, monitoring: Option<crate::monitoring::MonitoringHandle>) -> Result<Self> {
Self::new_with_proxy_and_task_limit(None, config, monitoring).await
}
/// Creates a new pool with task-per-instance limit but no proxy.
    pub async fn _new_with_task_limit(config: &Config, monitoring: Option<crate::monitoring::MonitoringHandle>) -> Result<Self> {
Self::new_with_proxy_and_task_limit(None, config, monitoring).await
}
/// Creates a new pool where each Chrome instance uses a different SOCKS5 proxy from the Docker pool.
pub async fn new_with_proxy(
proxy_pool: Option<Arc<DockerVpnProxyPool>>,
config: &Config,
monitoring: Option<crate::monitoring::MonitoringHandle>,
) -> Result<Self> {
Self::new_with_proxy_and_task_limit(proxy_pool, config, monitoring).await
}
/// Full constructor: supports proxy + task limiting + rotation.
///
/// When rotation is enabled, only half of the instances are used at once,
/// rotating to the other half when task limits are reached.
///
/// The actual pool_size is constrained by:
/// - max_parallel_instances from config (pool_size_limit parameter)
/// - Available proxies from proxy_pool (if provided)
///
/// Uses the minimum of these constraints to determine actual pool size.
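    ///
    /// # Sizing example
    ///
    /// A sketch of the constraint logic described above (illustrative values only, not
    /// taken from a live configuration):
    ///
    /// ```ignore
    /// // config.max_parallel_instances = 8, proxy pool exposes 5 proxies
    /// //   -> actual pool size = min(8, 5) = 5
    /// // With max_tasks_per_instance > 0, rotation is enabled and the semaphore
    /// // admits (5 + 1) / 2 = 3 concurrent tasks (one half of the pool).
    /// ```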
pub async fn new_with_proxy_and_task_limit(
proxy_pool: Option<Arc<DockerVpnProxyPool>>,
config: &Config,
monitoring: Option<crate::monitoring::MonitoringHandle>,
) -> Result<Self> {
let pool_size_limit = config.max_parallel_instances;
let task_per_instance_limit = config.max_tasks_per_instance;
// Determine actual pool size based on available resources
let actual_pool_size = if let Some(ref pp) = proxy_pool {
let available_proxies = pp.num_proxies();
pool_size_limit.min(available_proxies)
} else {
pool_size_limit
};
if actual_pool_size == 0 {
return Err(anyhow!("Pool size must be at least 1"));
}
// Rotation is enabled when task limiting is active
let rotation_enabled = task_per_instance_limit > 0;
let half_size = if rotation_enabled {
(actual_pool_size + 1) / 2 // Round up for odd numbers
} else {
actual_pool_size
};
let mut instances = Vec::with_capacity(actual_pool_size);
crate::util::logger::log_info(&format!(
"Initializing ChromeDriver pool with {} instances{}{}...",
actual_pool_size,
if proxy_pool.is_some() { " (each using a unique Docker SOCKS5 proxy)" } else { "" },
if rotation_enabled { " with rotation enabled" } else { "" }
))
.await;
if rotation_enabled && actual_pool_size < 2 {
crate::util::logger::log_warn(
"Rotation enabled but pool has < 2 instances - rotation will be limited"
).await;
}
for i in 0..actual_pool_size {
// Pass the entire proxy_pool and the index
let instance = ChromeInstance::new(
proxy_pool.clone(),
i,
config,
monitoring.clone(),
).await?;
crate::util::logger::log_info(&format!(" Instance {} ready", i + 1)).await;
instances.push(Arc::new(Mutex::new(instance)));
}
// Emit instance created events
for (i, instance) in instances.iter().enumerate() {
if let Some(ref mon) = monitoring {
let guard = instance.lock().await;
// Extract proxy info if available
let proxy_info = if let Some(ref pp) = proxy_pool {
pp.get_proxy_info(i % pp.num_proxies())
} else {
guard.proxy_url.as_ref().and_then(|url| {
// Parse proxy URL manually if no pool
// Format: socks5://localhost:10801
if let Some(port_str) = url.split(':').last() {
if let Ok(port) = port_str.parse::<u16>() {
return Some(crate::monitoring::ProxyInfo {
container_name: format!("proxy-{}", i),
ip_address: "127.0.0.1".to_string(),
port,
status: crate::monitoring::ProxyStatus::Connected,
});
}
}
None
})
};
mon.emit(crate::monitoring::MonitoringEvent::InstanceCreated {
instance_id: i,
max_tasks: guard.max_tasks_per_instance,
proxy: proxy_info.clone(),
});
// Also emit ProxyConnected event if proxy exists
if let Some(ref proxy) = proxy_info {
mon.emit(crate::monitoring::MonitoringEvent::ProxyConnected {
container_name: proxy.container_name.clone(),
ip_address: proxy.ip_address.clone(),
port: proxy.port,
});
}
drop(guard);
}
}
let min_request_interval_ms = config.min_request_interval_ms;
let hard_reset_controller = Arc::new(HardResetController::new());
let config_clone = Arc::new(config.clone());
Ok(Self {
instances,
semaphore: Arc::new(Semaphore::new(half_size)),
proxy_pool,
rotation_enabled,
next_instance: Arc::new(Mutex::new(0)),
last_request_time: Arc::new(Mutex::new(Instant::now())),
min_request_interval_ms,
monitoring,
hard_reset_controller,
config: config_clone,
})
}
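    /// Navigates to `url` on a pooled instance and runs `parse` against the live session.
    ///
    /// When the consecutive-error counter reaches `HARD_RESET_ERROR_THRESHOLD`, the returned
    /// error message is prefixed with `HARD_RESET_REQUIRED:` so the caller can trigger a hard
    /// reset. A sketch of how a caller might detect that marker (`handle`, `perform_hard_reset`,
    /// and `log_and_skip` are placeholder names, not part of this module):
    ///
    /// ```ignore
    /// match pool.execute(url, parse).await {
    ///     Ok(data) => handle(data),
    ///     Err(e) if e.to_string().starts_with("HARD_RESET_REQUIRED") => {
    ///         // Tear everything down and rebuild the pool before retrying.
    ///         perform_hard_reset().await?;
    ///     }
    ///     Err(e) => log_and_skip(e),
    /// }
    /// ```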
pub async fn execute<T, F, Fut>(&self, url: String, parse: F) -> Result<T>
where
T: Send + 'static,
F: FnOnce(Client) -> Fut + Send + 'static,
Fut: std::future::Future<Output = Result<T>> + Send,
{
let _permit = self.semaphore.acquire().await.map_err(|_| anyhow!("Pool closed"))?;
{
let mut last_time = self.last_request_time.lock().await;
let elapsed = last_time.elapsed().as_millis() as u64;
if elapsed < self.min_request_interval_ms {
let wait_ms = self.min_request_interval_ms - elapsed;
drop(last_time);
sleep(Duration::from_millis(wait_ms)).await;
let mut last_time = self.last_request_time.lock().await;
*last_time = Instant::now();
} else {
*last_time = Instant::now();
}
}
let instance = if self.rotation_enabled {
self.select_instance_with_rotation().await?
} else {
self.select_instance_round_robin().await
};
{
let mut inst = instance.lock().await;
inst.increment_task_count();
}
        let index = self
            .instances
            .iter()
            .position(|inst| Arc::ptr_eq(inst, &instance))
            .unwrap_or(0);
if let Some(ref mon) = self.monitoring {
mon.emit(crate::monitoring::MonitoringEvent::TaskStarted {
instance_id: index,
url: url.clone(),
});
mon.emit(crate::monitoring::MonitoringEvent::InstanceStatusChanged {
instance_id: index,
status: crate::monitoring::InstanceStatusChange::Active,
});
        }
let mut guard = instance.lock().await;
let client = guard.get_or_renew_session().await?;
let (task_count, session_requests) = guard.get_session_stats().await;
crate::util::logger::log_info(&format!(
"Instance {} executing task (tasks: {}/{}, session requests: {})",
index, task_count, guard.max_tasks_per_instance, session_requests
)).await;
drop(guard);
let start_time = Instant::now();
// Navigation with timeout
let navigation_result = timeout(
Duration::from_secs(60),
client.goto(&url)
).await;
let result = match navigation_result {
            Ok(Ok(_)) => {
                if let Some(ref mon) = self.monitoring {
                    // Navigation succeeded in this arm, so report an unconditional success
                    mon.emit(crate::monitoring::MonitoringEvent::TaskCompleted {
                        instance_id: index,
                        success: true,
                        duration_ms: start_time.elapsed().as_millis() as u64,
                        error: None,
                    });
mon.emit(crate::monitoring::MonitoringEvent::InstanceStatusChanged {
instance_id: index,
status: crate::monitoring::InstanceStatusChange::Idle,
});
}
crate::util::logger::log_info(&format!("✓ Navigated to {}", url)).await;
// Execute parse function
match parse(client).await {
Ok(data) => {
// ✅ SUCCESS: Record and log
let prev_count = self.hard_reset_controller.get_count();
self.hard_reset_controller.record_success();
if prev_count > 0 {
logger::log_info(&format!(
"✓ Success - reset counter cleared (was: {}/{})",
prev_count,
Self::HARD_RESET_ERROR_THRESHOLD
)).await;
}
Ok(data)
}
Err(e) => {
// ❌ PARSE ERROR: Record, check threshold, invalidate session
let error_count = self.hard_reset_controller.record_error();
{
let mut inst = instance.lock().await;
inst.invalidate_current_session().await;
}
// Enhanced logging with threshold status
let threshold_pct = (error_count as f64 / Self::HARD_RESET_ERROR_THRESHOLD as f64) * 100.0;
logger::log_warn(&format!(
"Parse error. Reset counter: {}/{} ({:.0}%)",
error_count,
Self::HARD_RESET_ERROR_THRESHOLD,
threshold_pct
)).await;
// Check if threshold reached
if error_count >= Self::HARD_RESET_ERROR_THRESHOLD {
logger::log_error(&format!(
"🔴 HARD RESET THRESHOLD REACHED ({}/{})",
error_count,
Self::HARD_RESET_ERROR_THRESHOLD
)).await;
return Err(anyhow!(
"HARD_RESET_REQUIRED: Parse failed: {}. Threshold reached ({}/{})",
e,
error_count,
Self::HARD_RESET_ERROR_THRESHOLD
));
}
Err(anyhow!(
"Parse failed: {}. Hard reset at {}/{}",
e,
error_count,
Self::HARD_RESET_ERROR_THRESHOLD
))
}
}
}
Ok(Err(e)) => {
// ❌ NAVIGATION ERROR: Record, check threshold, invalidate session
crate::util::logger::log_error(&format!("Navigation failed: {}", e)).await;
{
let mut inst = instance.lock().await;
inst.invalidate_current_session().await;
}
let error_count = self.hard_reset_controller.record_error();
// Enhanced logging
let threshold_pct = (error_count as f64 / Self::HARD_RESET_ERROR_THRESHOLD as f64) * 100.0;
logger::log_warn(&format!(
"Navigation error. Reset counter: {}/{} ({:.0}%)",
error_count,
Self::HARD_RESET_ERROR_THRESHOLD,
threshold_pct
)).await;
// Check if threshold reached
if error_count >= Self::HARD_RESET_ERROR_THRESHOLD {
logger::log_error(&format!(
"🔴 HARD RESET THRESHOLD REACHED ({}/{})",
error_count,
Self::HARD_RESET_ERROR_THRESHOLD
)).await;
return Err(anyhow!(
"HARD_RESET_REQUIRED: Navigation failed: {}. Threshold reached ({}/{})",
e,
error_count,
Self::HARD_RESET_ERROR_THRESHOLD
));
}
Err(anyhow!(
"Navigation failed: {}. Hard reset at {}/{}",
e,
error_count,
Self::HARD_RESET_ERROR_THRESHOLD
))
}
Err(_) => {
// ❌ TIMEOUT ERROR: Record, check threshold, invalidate session
if let Some(ref mon) = self.monitoring {
mon.emit(crate::monitoring::MonitoringEvent::NavigationTimeout {
instance_id: index,
url: url.clone(),
});
}
let error_count = self.hard_reset_controller.record_error();
crate::util::logger::log_error("Navigation timeout (60s)").await;
{
let mut inst = instance.lock().await;
inst.invalidate_current_session().await;
}
// Enhanced logging
let threshold_pct = (error_count as f64 / Self::HARD_RESET_ERROR_THRESHOLD as f64) * 100.0;
logger::log_warn(&format!(
"Timeout error. Reset counter: {}/{} ({:.0}%)",
error_count,
Self::HARD_RESET_ERROR_THRESHOLD,
threshold_pct
)).await;
// Check if threshold reached
if error_count >= Self::HARD_RESET_ERROR_THRESHOLD {
logger::log_error(&format!(
"🔴 HARD RESET THRESHOLD REACHED ({}/{})",
error_count,
Self::HARD_RESET_ERROR_THRESHOLD
)).await;
return Err(anyhow!(
"HARD_RESET_REQUIRED: Navigation timeout. Threshold reached ({}/{})",
error_count,
Self::HARD_RESET_ERROR_THRESHOLD
));
}
Err(anyhow!(
"Navigation timeout. Hard reset at {}/{}",
error_count,
Self::HARD_RESET_ERROR_THRESHOLD
))
}
};
{
let mut inst = instance.lock().await;
inst.task_count = inst.task_count.saturating_sub(1);
}
result
}
/// Simple round-robin instance selection (no rotation)
async fn select_instance_round_robin(&self) -> Arc<Mutex<ChromeInstance>> {
let mut next = self.next_instance.lock().await;
let index = *next;
*next = (*next + 1) % self.instances.len();
drop(next);
Arc::clone(&self.instances[index])
}
/// Round-robin with half-pool rotation
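    ///
    /// With a pool of 6 instances the halves are indices {0, 1, 2} and {3, 4, 5}; selection
    /// walks the current half round-robin and only jumps to the other half once every
    /// instance in the current half has reached its task limit. A small illustration of the
    /// index arithmetic used below (values are examples, not taken from a live run):
    ///
    /// ```ignore
    /// // pool_size = 6, half_size = 3
    /// // next = 4  -> current_half_start = (4 / 3) * 3 = 3   (second half)
    /// // next = 1  -> current_half_start = (1 / 3) * 3 = 0   (first half)
    /// ```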
async fn select_instance_with_rotation(&self) -> Result<Arc<Mutex<ChromeInstance>>> {
let pool_size = self.instances.len();
let half_size = pool_size / 2;
if half_size == 0 {
// Pool too small for rotation, fall back to simple round-robin
return Ok(self.select_instance_round_robin().await);
}
let mut next = self.next_instance.lock().await;
        let current_half_start = (*next / half_size) * half_size;
// Try to find available instance in current half
let mut attempts = 0;
let max_attempts = half_size * 2; // Try both halves
while attempts < max_attempts {
let index = current_half_start + (*next % half_size);
let instance = &self.instances[index];
// Check if instance can accept more tasks
let mut inst = instance.lock().await;
let can_accept = inst.get_task_count() < inst.max_tasks_per_instance;
drop(inst);
if can_accept {
*next = (*next + 1) % pool_size;
drop(next);
if let Some(ref mon) = self.monitoring {
mon.emit(crate::monitoring::MonitoringEvent::InstanceSelected {
instance_id: index,
half: if index < half_size { 1 } else { 2 },
});
}
return Ok(Arc::clone(instance));
}
// Current half saturated, try other half
if attempts == half_size - 1 {
logger::log_info("Current half saturated, rotating to other half").await;
*next = if current_half_start == 0 { half_size } else { 0 };
} else {
*next = (*next + 1) % pool_size;
}
attempts += 1;
}
drop(next);
// All instances saturated
Err(anyhow!("All instances at task capacity"))
}
pub fn get_reset_controller(&self) -> Arc<HardResetController> {
Arc::clone(&self.hard_reset_controller)
}
/// Check if hard reset threshold has been reached
pub fn should_perform_hard_reset(&self) -> bool {
self.hard_reset_controller.get_count() >= Self::HARD_RESET_ERROR_THRESHOLD
}
/// Get current error count and threshold for monitoring
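    ///
    /// A sketch of how a monitoring loop might report it (the surrounding loop and output
    /// format are placeholders):
    ///
    /// ```ignore
    /// let (errors, threshold) = pool.get_reset_status();
    /// println!("hard-reset counter: {}/{}", errors, threshold);
    /// ```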
pub fn get_reset_status(&self) -> (usize, usize) {
(
self.hard_reset_controller.get_count(),
Self::HARD_RESET_ERROR_THRESHOLD
)
}
/// Gracefully shut down all ChromeDriver processes and Docker proxy containers.
/// ✅ FIXED: Now with proper error propagation and Chrome process cleanup
pub async fn shutdown(&self) -> Result<()> {
logger::log_info(&format!("Shutting down {} ChromeDriver instances...", self.instances.len())).await;
let mut shutdown_errors = Vec::new();
for (i, inst) in self.instances.iter().enumerate() {
logger::log_info(&format!(" Shutting down instance {}...", i)).await;
let mut guard = inst.lock().await;
if let Err(e) = guard.shutdown().await {
logger::log_error(&format!(" ✗ Instance {} shutdown error: {}", i, e)).await;
shutdown_errors.push(format!("Instance {}: {}", i, e));
} else {
logger::log_info(&format!(" ✓ Instance {} shut down", i)).await;
}
}
if let Some(pp) = &self.proxy_pool {
logger::log_info("Shutting down proxy pool...").await;
if let Err(e) = pp.shutdown().await {
logger::log_error(&format!("Proxy pool shutdown error: {}", e)).await;
shutdown_errors.push(format!("Proxy pool: {}", e));
} else {
logger::log_info("✓ Proxy pool shut down").await;
}
}
if !shutdown_errors.is_empty() {
return Err(anyhow!(
"Pool shutdown completed with {} error(s): {}",
shutdown_errors.len(),
shutdown_errors.join("; ")
));
}
logger::log_info("✓ All ChromeDriver instances shut down successfully").await;
Ok(())
}
pub fn get_number_of_instances(&self) -> usize {
self.instances.len()
}
/// Returns whether rotation is enabled
pub fn is_rotation_enabled(&self) -> bool {
self.rotation_enabled
}
/// Returns the size of each half when rotation is enabled
pub fn get_rotation_half_size(&self) -> usize {
if self.rotation_enabled {
(self.instances.len() + 1) / 2
} else {
self.instances.len()
}
}
}
/// Represents a single instance of chromedriver process, optionally bound to a VPN.
pub struct ChromeInstance {
base_url: String,
process: Child,
stderr_log: Option<JoinHandle<()>>,
task_count: usize,
max_tasks_per_instance: usize,
proxy_url: Option<String>,
current_session: Arc<Mutex<Option<Client>>>, // Current active session
session_request_count: Arc<Mutex<usize>>,
max_requests_per_session: usize,
proxy_pool: Option<Arc<DockerVpnProxyPool>>, // Reference to the proxy pool
current_proxy_index: Arc<Mutex<usize>>, // Current proxy index in use
instance_id: usize,
monitoring: Option<crate::monitoring::MonitoringHandle>,
    // ✅ NEW: Reserved for the Chrome browser PID; currently not populated, so shutdown
    // relies on killing the ChromeDriver process tree instead
    chrome_pid: Arc<Mutex<Option<u32>>>,
}
impl ChromeInstance {
pub async fn new(proxy_pool: Option<Arc<DockerVpnProxyPool>>, instance_id: usize, config: &Config, monitoring: Option<crate::monitoring::MonitoringHandle>) -> Result<Self> {
let (base_url, process, stderr_handle) = Self::spawn_chromedriver().await?;
// Get proxy URL if proxy pool is provided
let proxy_url = proxy_pool.as_ref().map(|pp| pp.get_proxy_url(instance_id));
let max_tasks_per_instance = config.max_tasks_per_instance;
let max_requests_per_session = config.max_requests_per_session;
Ok(Self {
base_url,
process,
stderr_log: Some(stderr_handle),
task_count: 0,
max_tasks_per_instance,
proxy_url,
current_session: Arc::new(Mutex::new(None)),
session_request_count: Arc::new(Mutex::new(0)),
max_requests_per_session,
proxy_pool,
current_proxy_index: Arc::new(Mutex::new(instance_id)),
instance_id,
monitoring,
chrome_pid: Arc::new(Mutex::new(None)),
})
}
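    /// Returns the current WebDriver session, creating a fresh one when none exists or when
    /// the per-session request limit has been reached. Renewal closes the old session, waits
    /// a short randomized delay, and (when a proxy pool is present) rotates to the next proxy.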
pub async fn get_or_renew_session(&mut self) -> Result<Client> {
let mut session_opt = self.current_session.lock().await;
let mut request_count = self.session_request_count.lock().await;
// Session renewal conditions:
// 1. No session exists
// 2. Request limit reached
let needs_renewal = session_opt.is_none() || *request_count >= self.max_requests_per_session;
if needs_renewal {
if let Some(ref mon) = self.monitoring {
mon.emit(crate::monitoring::MonitoringEvent::InstanceStatusChanged {
instance_id: self.instance_id,
status: crate::monitoring::InstanceStatusChange::Renewing,
});
}
// ✅ FIXED: Close old session with proper error handling
if let Some(old_session) = session_opt.take() {
crate::util::logger::log_info("Closing old session").await;
// Try to close gracefully first
if let Err(e) = old_session.close().await {
logger::log_warn(&format!("Session close failed (may leave Chrome tabs open): {}", e)).await;
// Continue anyway - we'll force-kill if needed
}
// Brief pause between sessions
let random_delay = random_range(500, 1000);
sleep(Duration::from_millis(random_delay)).await;
}
            // Capture the old count before resetting so the log and the monitoring event
            // report the number of requests served by the session being replaced
            let old_request_count = *request_count;
            // Create a new session with a fresh User-Agent
            crate::util::logger::log_info(&format!(
                "Creating new session (requests in last session: {})",
                old_request_count
            )).await;
            let new_session = self.create_fresh_session().await?;
            *session_opt = Some(new_session.clone());
            *request_count = 0;
            if let Some(ref mon) = self.monitoring {
                let reason = if old_request_count >= self.max_requests_per_session {
                    crate::monitoring::RenewalReason::RequestLimit
                } else {
                    crate::monitoring::RenewalReason::TaskLimit
                };
// Get updated proxy info
let new_proxy_info = if let Some(ref pp) = self.proxy_pool {
let proxy_idx = *self.current_proxy_index.lock().await;
pp.get_proxy_info(proxy_idx)
} else {
self.proxy_url.as_ref().and_then(|url| {
if let Some(port_str) = url.split(':').last() {
if let Ok(port) = port_str.parse::<u16>() {
return Some(crate::monitoring::ProxyInfo {
container_name: format!("proxy-{}", self.instance_id),
ip_address: "127.0.0.1".to_string(),
port,
status: crate::monitoring::ProxyStatus::Connected,
});
}
}
None
})
};
                mon.emit(crate::monitoring::MonitoringEvent::SessionRenewed {
                    instance_id: self.instance_id,
                    old_request_count,
                    reason,
                    new_proxy: new_proxy_info,
});
}
Ok(new_session)
} else {
// Use existing session
*request_count += 1;
Ok(session_opt.as_ref().unwrap().clone())
}
}
async fn create_fresh_session(&self) -> Result<Client> {
let proxy_url = if let Some(ref pool) = self.proxy_pool {
let mut proxy_idx = self.current_proxy_index.lock().await;
let num_proxies = pool.num_proxies();
// Round-robin through all proxies
let selected_proxy = *proxy_idx % num_proxies;
*proxy_idx = (*proxy_idx + 1) % num_proxies;
let url = pool.get_proxy_url(selected_proxy);
logger::log_info(&format!(
"Instance {} creating session with proxy {}/{} (rotation)",
self.instance_id,
selected_proxy,
num_proxies
)).await;
Some(url)
} else {
self.proxy_url.clone()
};
let user_agent = Self::chrome_user_agent();
let capabilities = self.chrome_args_with_ua(user_agent, &proxy_url);
let client = ClientBuilder::native()
.capabilities(capabilities)
.connect(&self.base_url)
.await
.context("Failed to connect to ChromeDriver")?;
        // ✅ NOTE: The Chrome browser PID is not extracted here; it could be obtained from
        // the session info if needed. For now, cleanup relies on killing the ChromeDriver
        // process tree during shutdown.
Ok(client)
}
pub async fn invalidate_current_session(&self) {
let mut session_opt = self.current_session.lock().await;
if let Some(old_session) = session_opt.take() {
crate::util::logger::log_info(&format!(
"Invalidating broken session for instance {}",
self.instance_id
)).await;
// ✅ FIXED: Proper error handling instead of silent failure
if let Err(e) = old_session.close().await {
logger::log_warn(&format!(
"Failed to close broken session (Chrome tabs may remain): {}",
e
)).await;
}
}
let mut request_count = self.session_request_count.lock().await;
*request_count = 0;
}
pub fn reset_task_count(&mut self) {
self.task_count = 0;
}
pub async fn get_session_stats(&self) -> (usize, usize) {
let request_count = *self.session_request_count.lock().await;
(self.task_count, request_count)
}
pub fn increment_task_count(&mut self) {
self.task_count += 1;
}
pub fn get_task_count(&self) -> usize {
self.task_count
}
/// ✅ FIXED: Proper Chrome + ChromeDriver shutdown with process tree killing
pub async fn shutdown(&mut self) -> Result<()> {
logger::log_info(&format!("Shutting down ChromeInstance {}...", self.instance_id)).await;
// Step 1: Close any active session to signal Chrome to close
{
let mut session_opt = self.current_session.lock().await;
if let Some(session) = session_opt.take() {
logger::log_info(" Closing active session...").await;
if let Err(e) = session.close().await {
logger::log_warn(&format!(" Session close failed: {}", e)).await;
}
}
}
// Step 2: Abort stderr logging task
if let Some(handle) = self.stderr_log.take() {
handle.abort();
let _ = handle.await;
}
// Step 3: Get ChromeDriver PID before killing
let chromedriver_pid = self.process.id();
logger::log_info(&format!(" ChromeDriver PID: {:?}", chromedriver_pid)).await;
// Step 4: Kill ChromeDriver and wait
if let Err(e) = self.process.start_kill() {
logger::log_warn(&format!(" Failed to kill ChromeDriver: {}", e)).await;
}
// Wait for ChromeDriver to exit (with timeout)
match timeout(Duration::from_secs(5), self.process.wait()).await {
Ok(Ok(status)) => {
logger::log_info(&format!(" ChromeDriver exited with status: {:?}", status)).await;
}
Ok(Err(e)) => {
logger::log_warn(&format!(" Error waiting for ChromeDriver: {}", e)).await;
}
Err(_) => {
logger::log_warn(" ChromeDriver didn't exit within 5s").await;
}
}
// Step 5: ✅ CRITICAL FIX: Force-kill Chrome process tree
// On Windows, Chrome doesn't die when ChromeDriver dies
if let Some(pid) = chromedriver_pid {
logger::log_info(&format!(" Force-killing Chrome process tree for PID {}...", pid)).await;
#[cfg(target_os = "windows")]
{
// Kill entire process tree on Windows
let _ = Command::new("taskkill")
.args(["/F", "/T", "/PID", &pid.to_string()])
.output()
.await;
// Also kill any remaining chrome.exe processes
let _ = Command::new("taskkill")
.args(["/F", "/IM", "chrome.exe"])
.output()
.await;
}
#[cfg(not(target_os = "windows"))]
{
// Kill process group on Unix
let _ = Command::new("pkill")
.args(["-P", &pid.to_string()])
.output()
.await;
}
logger::log_info(" ✓ Chrome process tree killed").await;
}
// Step 6: Wait a moment for processes to fully terminate
sleep(Duration::from_millis(500)).await;
logger::log_info(&format!("✓ ChromeInstance {} shut down", self.instance_id)).await;
Ok(())
}
pub fn is_available(&self) -> bool {
if self.max_tasks_per_instance == 0 {
return true; // No limit
}
self.task_count < self.max_tasks_per_instance
}
pub fn tasks_remaining(&self) -> usize {
if self.max_tasks_per_instance == 0 {
return usize::MAX;
}
self.max_tasks_per_instance.saturating_sub(self.task_count)
}
/// Spawns the actual `chromedriver` binary and waits for it to become ready.
async fn spawn_chromedriver() -> Result<(String, Child, JoinHandle<()>)> {
        let mut process = Command::new("chromedriver-win64/chromedriver.exe")
            .arg("--port=0") // let the OS choose a free port
            .stdout(Stdio::piped())
            .stderr(Stdio::piped())
            .spawn()
            .context("Failed to start chromedriver-win64/chromedriver.exe. Is the binary present at that path?")?;
let stdout = process.stdout.take().unwrap();
let stderr = process.stderr.take().unwrap();
let stdout_reader = BufReader::new(stdout);
let mut stdout_lines = stdout_reader.lines();
let stderr_reader = BufReader::new(stderr);
let stderr_handle = tokio::spawn(async move {
let mut lines = stderr_reader.lines();
while let Ok(Some(line)) = lines.next_line().await {
let t = line.trim();
if !t.is_empty() {
let _ = crate::util::logger::log_info(&format!("ChromeDriver: {}", t)).await;
}
}
});
let start = tokio::time::Instant::now();
let mut address: Option<String> = None;
while start.elapsed() < Duration::from_secs(30) {
if let Ok(Ok(Some(line))) = timeout(Duration::from_secs(1), stdout_lines.next_line()).await {
if let Some(addr) = parse_chromedriver_address(&line) {
address = Some(addr);
}
if line.contains("ChromeDriver was started successfully") && address.is_some() {
return Ok((address.unwrap(), process, stderr_handle));
}
}
sleep(Duration::from_millis(100)).await;
}
let _ = process.kill().await;
stderr_handle.abort();
Err(anyhow!("ChromeDriver failed to start within 30s"))
}
fn chrome_args_with_ua(&self, user_agent: &str, proxy_url: &Option<String>) -> Map<String, Value> {
let mut args = vec![
"--headless=new".to_string(),
"--disable-gpu".to_string(),
"--no-sandbox".to_string(),
"--disable-dev-shm-usage".to_string(),
"--disable-infobars".to_string(),
"--disable-extensions".to_string(),
"--disable-popup-blocking".to_string(),
"--disable-notifications".to_string(),
"--disable-autofill".to_string(),
"--disable-sync".to_string(),
"--disable-default-apps".to_string(),
"--disable-translate".to_string(),
"--disable-blink-features=AutomationControlled".to_string(),
format!("--user-agent={}", user_agent),
];
if let Some(proxy) = proxy_url {
args.push(format!("--proxy-server={}", proxy));
}
let caps = serde_json::json!({
"goog:chromeOptions": {
"args": args,
"excludeSwitches": ["enable-logging", "enable-automation"],
"prefs": {
"profile.default_content_setting_values.notifications": 2
}
}
});
caps.as_object().cloned().unwrap()
}
pub fn chrome_user_agent() -> &'static str {
static UAS: &[&str] = &[
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.91 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.6312.122 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.6261.129 Safari/537.36",
"Mozilla/5.0 (Windows NT 11.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.91 Safari/537.36",
];
        choose_random(UAS)
}
}
impl Drop for ChromeInstance {
fn drop(&mut self) {
// Signal both ChromeDriver and Chrome to terminate
let _ = self.process.start_kill();
// Also try to kill Chrome if we know the PID
if let Some(pid) = self.process.id() {
#[cfg(target_os = "windows")]
{
// Fire and forget - this is best-effort cleanup
let _ = std::process::Command::new("taskkill")
.args(["/F", "/T", "/PID", &pid.to_string()])
.output();
}
}
}
}
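/// Extracts the listening address from a ChromeDriver startup line.
///
/// ChromeDriver typically announces itself with a line like the one below (version, exact
/// wording, and port vary); the fallback branch scans any other line mentioning "port" for a
/// plausible port number:
///
/// ```text
/// Starting ChromeDriver 124.0.6367.91 on port 9515
/// ```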
fn parse_chromedriver_address(line: &str) -> Option<String> {
if line.contains("Starting ChromeDriver") {
if let Some(port_str) = line.split("on port ").nth(1) {
if let Some(port) = port_str.split_whitespace().next() {
if port.parse::<u16>().is_ok() {
return Some(format!("http://localhost:{}", port));
}
}
}
}
for word in line.split_whitespace() {
if let Ok(port) = word.trim_matches(|c: char| !c.is_numeric()).parse::<u16>() {
if port > 1024 && port < 65535 && line.to_lowercase().contains("port") {
return Some(format!("http://localhost:{}", port));
}
}
}
None
}
/// Simplified task execution - uses the pool pattern.
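/// # Example
///
/// A sketch of wrapping a parse closure in a `ScrapeTask` and running it against an existing
/// pool (marked `ignore`; assumes `pool` was built elsewhere):
///
/// ```ignore
/// let task = ScrapeTask::new("https://example.com".to_string(), |client| async move {
///     // Extract whatever the caller needs from the page; here, the raw HTML source.
///     Ok(client.source().await?)
/// });
/// let html: String = task.execute_with_pool(&pool).await?;
/// ```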
pub struct ScrapeTask<T> {
url: String,
parse: Box<
dyn FnOnce(Client) -> Pin<Box<dyn std::future::Future<Output = Result<T>> + Send>> + Send,
>,
}
impl<T: Send + 'static> ScrapeTask<T> {
pub fn new<F, Fut>(url: String, parse: F) -> Self
where
F: FnOnce(Client) -> Fut + Send + 'static,
Fut: std::future::Future<Output = Result<T>> + Send + 'static,
{
Self {
url,
parse: Box::new(move |client| Box::pin(parse(client))),
}
}
pub async fn execute_with_pool(self, pool: &ChromeDriverPool) -> Result<T> {
let url = self.url;
let parse = self.parse;
pool.execute(url, move |client| async move { (parse)(client).await })
.await
}
}