// src/scraper/webdriver.rs

use super::helpers::*;
use super::hard_reset::HardResetController;
use super::docker_vpn_proxy::DockerVpnProxyPool;
use crate::Config;
use crate::logger;

use anyhow::{anyhow, Context, Result};
use fantoccini::{Client, ClientBuilder};
use serde_json::{Map, Value};
use std::pin::Pin;
use std::process::Stdio;
use std::sync::Arc;
use std::time::Instant;
use tokio::io::{AsyncBufReadExt, BufReader};
use tokio::process::{Child, Command};
use tokio::sync::{Mutex, Semaphore};
use tokio::task::JoinHandle;
use tokio::time::{sleep, timeout, Duration};

/// Manages a pool of ChromeDriver instances for parallel scraping with optional VPN binding.
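///
/// # Example (sketch)
///
/// A rough end-to-end sketch, assuming a `Config` value is already available from
/// elsewhere in the crate and running without a Docker proxy pool or monitoring
/// handle (both `None`); the closure body is illustrative only:
///
/// ```ignore
/// let pool = ChromeDriverPool::new_with_proxy(None, &config, None).await?;
/// let html = pool
///     .execute("https://example.com".to_string(), |client| async move {
///         Ok(client.source().await?)
///     })
///     .await?;
/// pool.shutdown().await?;
/// ```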
pub struct ChromeDriverPool {
    instances: Vec<Arc<Mutex<ChromeInstance>>>,
    semaphore: Arc<Semaphore>,
    /// Optional Docker-based proxy pool (one proxy per Chrome instance).
    proxy_pool: Option<Arc<DockerVpnProxyPool>>,
    /// Whether rotation is enabled (uses half of the instances at a time).
    rotation_enabled: bool,
    /// Index for round-robin instance selection (when rotation is enabled).
    next_instance: Arc<Mutex<usize>>,

    last_request_time: Arc<Mutex<Instant>>,
    min_request_interval_ms: u64,

    monitoring: Option<crate::monitoring::MonitoringHandle>,
    hard_reset_controller: Arc<HardResetController>,
    config: Arc<Config>,
}

impl ChromeDriverPool {
    /// When consecutive errors reach this value, `execute()` returns a special error
    /// that signals the caller to trigger a hard reset.
    const HARD_RESET_ERROR_THRESHOLD: usize = 12;

    /// Creates a new pool without any proxy (direct connection).
    pub async fn _new(
        config: &Config,
        monitoring: Option<crate::monitoring::MonitoringHandle>,
    ) -> Result<Self> {
        Self::new_with_proxy_and_task_limit(None, config, monitoring).await
    }

    /// Creates a new pool with a task-per-instance limit but no proxy.
    pub async fn _new_with_task_limit(
        config: &Config,
        monitoring: Option<crate::monitoring::MonitoringHandle>,
    ) -> Result<Self> {
        Self::new_with_proxy_and_task_limit(None, config, monitoring).await
    }

    /// Creates a new pool where each Chrome instance uses a different SOCKS5 proxy from the Docker pool.
    pub async fn new_with_proxy(
        proxy_pool: Option<Arc<DockerVpnProxyPool>>,
        config: &Config,
        monitoring: Option<crate::monitoring::MonitoringHandle>,
    ) -> Result<Self> {
        Self::new_with_proxy_and_task_limit(proxy_pool, config, monitoring).await
    }

    /// Full constructor: supports proxy binding, task limiting, and rotation.
    ///
    /// When rotation is enabled, only half of the instances are used at once,
    /// rotating to the other half when task limits are reached.
    ///
    /// The actual pool size is constrained by:
    /// - `max_parallel_instances` from the config
    /// - the number of proxies available in `proxy_pool` (if provided)
    ///
    /// The minimum of these constraints determines the actual pool size.
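    ///
    /// Illustrative example (numbers are hypothetical): with `max_parallel_instances = 4`
    /// and a 3-proxy Docker pool, 3 instances are created; if `max_tasks_per_instance > 0`,
    /// rotation is enabled and the semaphore admits `(3 + 1) / 2 = 2` concurrent tasks.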
    pub async fn new_with_proxy_and_task_limit(
        proxy_pool: Option<Arc<DockerVpnProxyPool>>,
        config: &Config,
        monitoring: Option<crate::monitoring::MonitoringHandle>,
    ) -> Result<Self> {
        let pool_size_limit = config.max_parallel_instances;
        let task_per_instance_limit = config.max_tasks_per_instance;

        // Determine actual pool size based on available resources
        let actual_pool_size = if let Some(ref pp) = proxy_pool {
            let available_proxies = pp.num_proxies();
            pool_size_limit.min(available_proxies)
        } else {
            pool_size_limit
        };

        if actual_pool_size == 0 {
            return Err(anyhow!("Pool size must be at least 1"));
        }

        // Rotation is enabled when task limiting is active
        let rotation_enabled = task_per_instance_limit > 0;
        let half_size = if rotation_enabled {
            (actual_pool_size + 1) / 2 // Round up for odd numbers
        } else {
            actual_pool_size
        };

        let mut instances = Vec::with_capacity(actual_pool_size);

        crate::util::logger::log_info(&format!(
            "Initializing ChromeDriver pool with {} instances{}{}...",
            actual_pool_size,
            if proxy_pool.is_some() { " (each using a unique Docker SOCKS5 proxy)" } else { "" },
            if rotation_enabled { " with rotation enabled" } else { "" }
        ))
        .await;

        if rotation_enabled && actual_pool_size < 2 {
            crate::util::logger::log_warn(
                "Rotation enabled but pool has < 2 instances - rotation will be limited"
            ).await;
        }

        for i in 0..actual_pool_size {
            // Pass the entire proxy_pool and the index
            let instance = ChromeInstance::new(
                proxy_pool.clone(),
                i,
                config,
                monitoring.clone(),
            ).await?;

            crate::util::logger::log_info(&format!("  Instance {} ready", i + 1)).await;
            instances.push(Arc::new(Mutex::new(instance)));
        }

        // Emit instance-created events
        for (i, instance) in instances.iter().enumerate() {
            if let Some(ref mon) = monitoring {
                let guard = instance.lock().await;

                // Extract proxy info if available
                let proxy_info = if let Some(ref pp) = proxy_pool {
                    pp.get_proxy_info(i % pp.num_proxies())
                } else {
                    guard.proxy_url.as_ref().and_then(|url| {
                        // Parse the proxy URL manually if there is no pool.
                        // Format: socks5://localhost:10801
                        if let Some(port_str) = url.split(':').last() {
                            if let Ok(port) = port_str.parse::<u16>() {
                                return Some(crate::monitoring::ProxyInfo {
                                    container_name: format!("proxy-{}", i),
                                    ip_address: "127.0.0.1".to_string(),
                                    port,
                                    status: crate::monitoring::ProxyStatus::Connected,
                                });
                            }
                        }
                        None
                    })
                };

                mon.emit(crate::monitoring::MonitoringEvent::InstanceCreated {
                    instance_id: i,
                    max_tasks: guard.max_tasks_per_instance,
                    proxy: proxy_info.clone(),
                });

                // Also emit a ProxyConnected event if a proxy exists
                if let Some(ref proxy) = proxy_info {
                    mon.emit(crate::monitoring::MonitoringEvent::ProxyConnected {
                        container_name: proxy.container_name.clone(),
                        ip_address: proxy.ip_address.clone(),
                        port: proxy.port,
                    });
                }

                drop(guard);
            }
        }

        let min_request_interval_ms = config.min_request_interval_ms;
        let hard_reset_controller = Arc::new(HardResetController::new());
        let config_clone = Arc::new(config.clone());

        Ok(Self {
            instances,
            semaphore: Arc::new(Semaphore::new(half_size)),
            proxy_pool,
            rotation_enabled,
            next_instance: Arc::new(Mutex::new(0)),
            last_request_time: Arc::new(Mutex::new(Instant::now())),
            min_request_interval_ms,
            monitoring,
            hard_reset_controller,
            config: config_clone,
        })
    }

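    /// Runs one scraping task: acquires a semaphore permit, throttles by
    /// `min_request_interval_ms`, selects an instance, navigates to `url` (60s timeout),
    /// and then hands the session to `parse`.
    ///
    /// Once the shared error counter reaches `HARD_RESET_ERROR_THRESHOLD`, the returned
    /// error message is prefixed with `HARD_RESET_REQUIRED`, which callers can match on.
    /// Caller-side sketch (the `handle_*` names are illustrative, not part of this crate):
    ///
    /// ```ignore
    /// match pool.execute(url, parse).await {
    ///     Ok(data) => handle_data(data),
    ///     Err(e) if e.to_string().contains("HARD_RESET_REQUIRED") => handle_hard_reset().await,
    ///     Err(e) => eprintln!("scrape failed: {e}"),
    /// }
    /// ```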
    pub async fn execute<T, F, Fut>(&self, url: String, parse: F) -> Result<T>
    where
        T: Send + 'static,
        F: FnOnce(Client) -> Fut + Send + 'static,
        Fut: std::future::Future<Output = Result<T>> + Send,
    {
        let _permit = self.semaphore.acquire().await.map_err(|_| anyhow!("Pool closed"))?;

        {
            let mut last_time = self.last_request_time.lock().await;
            let elapsed = last_time.elapsed().as_millis() as u64;

            if elapsed < self.min_request_interval_ms {
                let wait_ms = self.min_request_interval_ms - elapsed;
                drop(last_time);
                sleep(Duration::from_millis(wait_ms)).await;
                let mut last_time = self.last_request_time.lock().await;
                *last_time = Instant::now();
            } else {
                *last_time = Instant::now();
            }
        }

        let instance = if self.rotation_enabled {
            self.select_instance_with_rotation().await?
        } else {
            self.select_instance_round_robin().await
        };

        {
            let mut inst = instance.lock().await;
            inst.increment_task_count();
        }

        let index: usize = {
            let instances = &self.instances;
            instances.iter().position(|inst| Arc::ptr_eq(inst, &instance)).unwrap_or(0)
        };

        if let Some(ref mon) = self.monitoring {
            mon.emit(crate::monitoring::MonitoringEvent::TaskStarted {
                instance_id: index,
                url: url.clone(),
            });
            mon.emit(crate::monitoring::MonitoringEvent::InstanceStatusChanged {
                instance_id: index,
                status: crate::monitoring::InstanceStatusChange::Active,
            });
        }

        let mut guard = instance.lock().await;
        let client = guard.get_or_renew_session().await?;
        let (task_count, session_requests) = guard.get_session_stats().await;

        crate::util::logger::log_info(&format!(
            "Instance {} executing task (tasks: {}/{}, session requests: {})",
            index, task_count, guard.max_tasks_per_instance, session_requests
        )).await;

        drop(guard);

        let start_time = Instant::now();

        // Navigation with timeout
        let navigation_result = timeout(
            Duration::from_secs(60),
            client.goto(&url)
        ).await;

        let result = match navigation_result {
            Ok(Ok(_)) => {
                // Navigation succeeded on this path, so report it as such.
                if let Some(ref mon) = self.monitoring {
                    mon.emit(crate::monitoring::MonitoringEvent::TaskCompleted {
                        instance_id: index,
                        success: true,
                        duration_ms: start_time.elapsed().as_millis() as u64,
                        error: None,
                    });
                    mon.emit(crate::monitoring::MonitoringEvent::InstanceStatusChanged {
                        instance_id: index,
                        status: crate::monitoring::InstanceStatusChange::Idle,
                    });
                }
                crate::util::logger::log_info(&format!("✓ Navigated to {}", url)).await;

                // Execute the parse function
                match parse(client).await {
                    Ok(data) => {
                        // ✅ SUCCESS: Record and log
                        let prev_count = self.hard_reset_controller.get_count();
                        self.hard_reset_controller.record_success();

                        if prev_count > 0 {
                            logger::log_info(&format!(
                                "✓ Success - reset counter cleared (was: {}/{})",
                                prev_count,
                                Self::HARD_RESET_ERROR_THRESHOLD
                            )).await;
                        }

                        Ok(data)
                    }
                    Err(e) => {
                        // ❌ PARSE ERROR: Record, check threshold, invalidate session
                        let error_count = self.hard_reset_controller.record_error();

                        {
                            let inst = instance.lock().await;
                            inst.invalidate_current_session().await;
                        }

                        // Enhanced logging with threshold status
                        let threshold_pct = (error_count as f64 / Self::HARD_RESET_ERROR_THRESHOLD as f64) * 100.0;
                        logger::log_warn(&format!(
                            "Parse error. Reset counter: {}/{} ({:.0}%)",
                            error_count,
                            Self::HARD_RESET_ERROR_THRESHOLD,
                            threshold_pct
                        )).await;

                        // Check if threshold reached
                        if error_count >= Self::HARD_RESET_ERROR_THRESHOLD {
                            logger::log_error(&format!(
                                "🔴 HARD RESET THRESHOLD REACHED ({}/{})",
                                error_count,
                                Self::HARD_RESET_ERROR_THRESHOLD
                            )).await;

                            return Err(anyhow!(
                                "HARD_RESET_REQUIRED: Parse failed: {}. Threshold reached ({}/{})",
                                e,
                                error_count,
                                Self::HARD_RESET_ERROR_THRESHOLD
                            ));
                        }

                        Err(anyhow!(
                            "Parse failed: {}. Hard reset at {}/{}",
                            e,
                            error_count,
                            Self::HARD_RESET_ERROR_THRESHOLD
                        ))
                    }
                }
            }
            Ok(Err(e)) => {
                // ❌ NAVIGATION ERROR: Record, check threshold, invalidate session
                crate::util::logger::log_error(&format!("Navigation failed: {}", e)).await;

                {
                    let inst = instance.lock().await;
                    inst.invalidate_current_session().await;
                }

                let error_count = self.hard_reset_controller.record_error();

                // Enhanced logging
                let threshold_pct = (error_count as f64 / Self::HARD_RESET_ERROR_THRESHOLD as f64) * 100.0;
                logger::log_warn(&format!(
                    "Navigation error. Reset counter: {}/{} ({:.0}%)",
                    error_count,
                    Self::HARD_RESET_ERROR_THRESHOLD,
                    threshold_pct
                )).await;

                // Check if threshold reached
                if error_count >= Self::HARD_RESET_ERROR_THRESHOLD {
                    logger::log_error(&format!(
                        "🔴 HARD RESET THRESHOLD REACHED ({}/{})",
                        error_count,
                        Self::HARD_RESET_ERROR_THRESHOLD
                    )).await;

                    return Err(anyhow!(
                        "HARD_RESET_REQUIRED: Navigation failed: {}. Threshold reached ({}/{})",
                        e,
                        error_count,
                        Self::HARD_RESET_ERROR_THRESHOLD
                    ));
                }

                Err(anyhow!(
                    "Navigation failed: {}. Hard reset at {}/{}",
                    e,
                    error_count,
                    Self::HARD_RESET_ERROR_THRESHOLD
                ))
            }
            Err(_) => {
                // ❌ TIMEOUT ERROR: Record, check threshold, invalidate session
                if let Some(ref mon) = self.monitoring {
                    mon.emit(crate::monitoring::MonitoringEvent::NavigationTimeout {
                        instance_id: index,
                        url: url.clone(),
                    });
                }

                let error_count = self.hard_reset_controller.record_error();

                crate::util::logger::log_error("Navigation timeout (60s)").await;

                {
                    let inst = instance.lock().await;
                    inst.invalidate_current_session().await;
                }

                // Enhanced logging
                let threshold_pct = (error_count as f64 / Self::HARD_RESET_ERROR_THRESHOLD as f64) * 100.0;
                logger::log_warn(&format!(
                    "Timeout error. Reset counter: {}/{} ({:.0}%)",
                    error_count,
                    Self::HARD_RESET_ERROR_THRESHOLD,
                    threshold_pct
                )).await;

                // Check if threshold reached
                if error_count >= Self::HARD_RESET_ERROR_THRESHOLD {
                    logger::log_error(&format!(
                        "🔴 HARD RESET THRESHOLD REACHED ({}/{})",
                        error_count,
                        Self::HARD_RESET_ERROR_THRESHOLD
                    )).await;

                    return Err(anyhow!(
                        "HARD_RESET_REQUIRED: Navigation timeout. Threshold reached ({}/{})",
                        error_count,
                        Self::HARD_RESET_ERROR_THRESHOLD
                    ));
                }

                Err(anyhow!(
                    "Navigation timeout. Hard reset at {}/{}",
                    error_count,
                    Self::HARD_RESET_ERROR_THRESHOLD
                ))
            }
        };

        {
            let mut inst = instance.lock().await;
            inst.task_count = inst.task_count.saturating_sub(1);
        }

        result
    }

    /// Simple round-robin instance selection (no rotation).
    async fn select_instance_round_robin(&self) -> Arc<Mutex<ChromeInstance>> {
        let mut next = self.next_instance.lock().await;
        let index = *next;
        *next = (*next + 1) % self.instances.len();
        drop(next);

        Arc::clone(&self.instances[index])
    }

    /// Round-robin with half-pool rotation.
    async fn select_instance_with_rotation(&self) -> Result<Arc<Mutex<ChromeInstance>>> {
        let pool_size = self.instances.len();
        let half_size = pool_size / 2;

        if half_size == 0 {
            // Pool too small for rotation, fall back to simple round-robin
            return Ok(self.select_instance_round_robin().await);
        }

        let mut next = self.next_instance.lock().await;
        // The pool is split into the halves [0, half_size) and [half_size, pool_size).
        let mut current_half_start = if *next < half_size { 0 } else { half_size };

        // Try to find an available instance in the current half first
        let mut attempts = 0;
        let max_attempts = half_size * 2; // Try both halves

        while attempts < max_attempts {
            let index = current_half_start + (*next % half_size);
            let instance = &self.instances[index];

            // Check if the instance can accept more tasks
            let inst = instance.lock().await;
            let can_accept = inst.get_task_count() < inst.max_tasks_per_instance;
            drop(inst);

            if can_accept {
                *next = (*next + 1) % pool_size;
                drop(next);

                if let Some(ref mon) = self.monitoring {
                    mon.emit(crate::monitoring::MonitoringEvent::InstanceSelected {
                        instance_id: index,
                        half: if index < half_size { 1 } else { 2 },
                    });
                }

                return Ok(Arc::clone(instance));
            }

            // Current half saturated: move both the cursor and the half boundary to
            // the other half so the remaining attempts actually probe it.
            if attempts == half_size - 1 {
                logger::log_info("Current half saturated, rotating to other half").await;
                current_half_start = if current_half_start == 0 { half_size } else { 0 };
                *next = current_half_start;
            } else {
                *next = (*next + 1) % pool_size;
            }

            attempts += 1;
        }

        drop(next);

        // All instances saturated
        Err(anyhow!("All instances at task capacity"))
    }

    pub fn get_reset_controller(&self) -> Arc<HardResetController> {
        Arc::clone(&self.hard_reset_controller)
    }

    /// Check whether the hard-reset threshold has been reached.
    pub fn should_perform_hard_reset(&self) -> bool {
        self.hard_reset_controller.get_count() >= Self::HARD_RESET_ERROR_THRESHOLD
    }

    /// Get the current error count and the threshold, for monitoring.
    pub fn get_reset_status(&self) -> (usize, usize) {
        (
            self.hard_reset_controller.get_count(),
            Self::HARD_RESET_ERROR_THRESHOLD,
        )
    }

    /// Gracefully shut down all ChromeDriver processes and Docker proxy containers.
    /// ✅ FIXED: Now with proper error propagation and Chrome process cleanup
    pub async fn shutdown(&self) -> Result<()> {
        logger::log_info(&format!("Shutting down {} ChromeDriver instances...", self.instances.len())).await;

        let mut shutdown_errors = Vec::new();

        for (i, inst) in self.instances.iter().enumerate() {
            logger::log_info(&format!("  Shutting down instance {}...", i)).await;

            let mut guard = inst.lock().await;
            if let Err(e) = guard.shutdown().await {
                logger::log_error(&format!("  ✗ Instance {} shutdown error: {}", i, e)).await;
                shutdown_errors.push(format!("Instance {}: {}", i, e));
            } else {
                logger::log_info(&format!("  ✓ Instance {} shut down", i)).await;
            }
        }

        if let Some(pp) = &self.proxy_pool {
            logger::log_info("Shutting down proxy pool...").await;
            if let Err(e) = pp.shutdown().await {
                logger::log_error(&format!("Proxy pool shutdown error: {}", e)).await;
                shutdown_errors.push(format!("Proxy pool: {}", e));
            } else {
                logger::log_info("✓ Proxy pool shut down").await;
            }
        }

        if !shutdown_errors.is_empty() {
            return Err(anyhow!(
                "Pool shutdown completed with {} error(s): {}",
                shutdown_errors.len(),
                shutdown_errors.join("; ")
            ));
        }

        logger::log_info("✓ All ChromeDriver instances shut down successfully").await;
        Ok(())
    }

    pub fn get_number_of_instances(&self) -> usize {
        self.instances.len()
    }

    /// Returns whether rotation is enabled.
    pub fn is_rotation_enabled(&self) -> bool {
        self.rotation_enabled
    }

    /// Returns the size of each half when rotation is enabled.
    pub fn get_rotation_half_size(&self) -> usize {
        if self.rotation_enabled {
            (self.instances.len() + 1) / 2
        } else {
            self.instances.len()
        }
    }

    pub fn get_proxy_pool(&self) -> Option<Arc<DockerVpnProxyPool>> {
        self.proxy_pool.clone()
    }
}

/// Represents a single chromedriver process, optionally bound to a VPN proxy.
pub struct ChromeInstance {
    base_url: String,
    process: Child,
    stderr_log: Option<JoinHandle<()>>,
    task_count: usize,
    max_tasks_per_instance: usize,
    proxy_url: Option<String>,

    current_session: Arc<Mutex<Option<Client>>>, // Current active session
    session_request_count: Arc<Mutex<usize>>,
    max_requests_per_session: usize,

    proxy_pool: Option<Arc<DockerVpnProxyPool>>, // Reference to the proxy pool
    current_proxy_index: Arc<Mutex<usize>>,      // Current proxy index in use

    instance_id: usize,
    monitoring: Option<crate::monitoring::MonitoringHandle>,
}

impl ChromeInstance {
    pub async fn new(
        proxy_pool: Option<Arc<DockerVpnProxyPool>>,
        instance_id: usize,
        config: &Config,
        monitoring: Option<crate::monitoring::MonitoringHandle>,
    ) -> Result<Self> {
        let (base_url, process, stderr_handle) = Self::spawn_chromedriver().await?;

        // Get the proxy URL if a proxy pool is provided
        let proxy_url = proxy_pool.as_ref().map(|pp| pp.get_proxy_url(instance_id));

        let max_tasks_per_instance = config.max_tasks_per_instance;
        let max_requests_per_session = config.max_requests_per_session;

        Ok(Self {
            base_url,
            process,
            stderr_log: Some(stderr_handle),
            task_count: 0,
            max_tasks_per_instance,
            proxy_url,

            current_session: Arc::new(Mutex::new(None)),
            session_request_count: Arc::new(Mutex::new(0)),
            max_requests_per_session,

            proxy_pool,
            current_proxy_index: Arc::new(Mutex::new(instance_id)),

            instance_id,
            monitoring,
        })
    }

    pub async fn get_or_renew_session(&mut self) -> Result<Client> {
        let mut session_opt = self.current_session.lock().await;
        let mut request_count = self.session_request_count.lock().await;

        // Session renewal conditions:
        // 1. No session exists
        // 2. The per-session request limit has been reached
        let needs_renewal = session_opt.is_none() || *request_count >= self.max_requests_per_session;

        if needs_renewal {
            if let Some(ref mon) = self.monitoring {
                mon.emit(crate::monitoring::MonitoringEvent::InstanceStatusChanged {
                    instance_id: self.instance_id,
                    status: crate::monitoring::InstanceStatusChange::Renewing,
                });
            }

            // ✅ FIXED: Close the old session with proper error handling
            if let Some(old_session) = session_opt.take() {
                crate::util::logger::log_info("Closing old session").await;

                // Try to close gracefully first
                if let Err(e) = old_session.close().await {
                    logger::log_warn(&format!("Session close failed (may leave Chrome tabs open): {}", e)).await;
                    // Continue anyway - we'll force-kill if needed
                }

                // Brief pause between sessions
                let random_delay = random_range(500, 1000);
                sleep(Duration::from_millis(random_delay)).await;
            }

            // Remember the old session's request count before resetting it, so the
            // monitoring event below reports the real value and the right reason.
            let old_request_count = *request_count;

            // Create a new session with a fresh User-Agent
            crate::util::logger::log_info(&format!(
                "Creating new session (requests in last session: {})",
                old_request_count
            )).await;

            let new_session = self.create_fresh_session().await?;
            *session_opt = Some(new_session.clone());
            *request_count = 0;

            if let Some(ref mon) = self.monitoring {
                let reason = if old_request_count >= self.max_requests_per_session {
                    crate::monitoring::RenewalReason::RequestLimit
                } else {
                    crate::monitoring::RenewalReason::TaskLimit
                };

                // Get updated proxy info
                let new_proxy_info = if let Some(ref pp) = self.proxy_pool {
                    let proxy_idx = *self.current_proxy_index.lock().await;
                    pp.get_proxy_info(proxy_idx)
                } else {
                    self.proxy_url.as_ref().and_then(|url| {
                        if let Some(port_str) = url.split(':').last() {
                            if let Ok(port) = port_str.parse::<u16>() {
                                return Some(crate::monitoring::ProxyInfo {
                                    container_name: format!("proxy-{}", self.instance_id),
                                    ip_address: "127.0.0.1".to_string(),
                                    port,
                                    status: crate::monitoring::ProxyStatus::Connected,
                                });
                            }
                        }
                        None
                    })
                };

                mon.emit(crate::monitoring::MonitoringEvent::SessionRenewed {
                    instance_id: self.instance_id,
                    old_request_count,
                    reason,
                    new_proxy: new_proxy_info,
                });
            }

            Ok(new_session)
        } else {
            // Use the existing session
            *request_count += 1;
            Ok(session_opt.as_ref().unwrap().clone())
        }
    }

    async fn create_fresh_session(&self) -> Result<Client> {
        let proxy_url = if let Some(ref pool) = self.proxy_pool {
            let mut proxy_idx = self.current_proxy_index.lock().await;
            let num_proxies = pool.num_proxies();

            // Round-robin through all proxies
            let selected_proxy = *proxy_idx % num_proxies;
            *proxy_idx = (*proxy_idx + 1) % num_proxies;

            let url = pool.get_proxy_url(selected_proxy);

            logger::log_info(&format!(
                "Instance {} creating session with proxy {}/{} (rotation)",
                self.instance_id,
                selected_proxy,
                num_proxies
            )).await;

            Some(url)
        } else {
            self.proxy_url.clone()
        };

        let user_agent = Self::chrome_user_agent();
        let capabilities = self.chrome_args_with_ua(user_agent, &proxy_url);

        let client = ClientBuilder::native()
            .capabilities(capabilities)
            .connect(&self.base_url)
            .await
            .context("Failed to connect to ChromeDriver")?;

        // ✅ NEW: Extract and store Chrome PID for cleanup
        // Chrome process info can be extracted from session info if needed
        // For now, we rely on killing the process tree
        Ok(client)
    }

    pub async fn invalidate_current_session(&self) {
        let mut session_opt = self.current_session.lock().await;

        if let Some(old_session) = session_opt.take() {
            crate::util::logger::log_info(&format!(
                "Invalidating broken session for instance {}",
                self.instance_id
            )).await;

            // ✅ FIXED: Proper error handling instead of silent failure
            if let Err(e) = old_session.close().await {
                logger::log_warn(&format!(
                    "Failed to close broken session (Chrome tabs may remain): {}",
                    e
                )).await;
            }
        }

        let mut request_count = self.session_request_count.lock().await;
        *request_count = 0;
    }

    pub fn reset_task_count(&mut self) {
        self.task_count = 0;
    }

    pub async fn get_session_stats(&self) -> (usize, usize) {
        let request_count = *self.session_request_count.lock().await;
        (self.task_count, request_count)
    }

    pub fn increment_task_count(&mut self) {
        self.task_count += 1;
    }

    pub fn get_task_count(&self) -> usize {
        self.task_count
    }

    /// ✅ FIXED: Proper Chrome + ChromeDriver shutdown with process-tree killing
    pub async fn shutdown(&mut self) -> Result<()> {
        logger::log_info(&format!("Shutting down ChromeInstance {}...", self.instance_id)).await;

        // Step 1: Close any active session to signal Chrome to close
        {
            let mut session_opt = self.current_session.lock().await;
            if let Some(session) = session_opt.take() {
                logger::log_info("  Closing active session...").await;
                if let Err(e) = session.close().await {
                    logger::log_warn(&format!("  Session close failed: {}", e)).await;
                }
            }
        }

        // Step 2: Abort the stderr logging task
        if let Some(handle) = self.stderr_log.take() {
            handle.abort();
            let _ = handle.await;
        }

        // Step 3: Get the ChromeDriver PID before killing it
        let chromedriver_pid = self.process.id();
        logger::log_info(&format!("  ChromeDriver PID: {:?}", chromedriver_pid)).await;

        // Step 4: Kill ChromeDriver and wait
        if let Err(e) = self.process.start_kill() {
            logger::log_warn(&format!("  Failed to kill ChromeDriver: {}", e)).await;
        }

        // Wait for ChromeDriver to exit (with timeout)
        match timeout(Duration::from_secs(5), self.process.wait()).await {
            Ok(Ok(status)) => {
                logger::log_info(&format!("  ChromeDriver exited with status: {:?}", status)).await;
            }
            Ok(Err(e)) => {
                logger::log_warn(&format!("  Error waiting for ChromeDriver: {}", e)).await;
            }
            Err(_) => {
                logger::log_warn("  ChromeDriver didn't exit within 5s").await;
            }
        }

        // Step 5: ✅ CRITICAL FIX: Force-kill the Chrome process tree.
        // On Windows, Chrome doesn't die when ChromeDriver dies.
        if let Some(pid) = chromedriver_pid {
            logger::log_info(&format!("  Force-killing Chrome process tree for PID {}...", pid)).await;

            #[cfg(target_os = "windows")]
            {
                // Kill the entire process tree on Windows
                let _ = Command::new("taskkill")
                    .args(["/F", "/T", "/PID", &pid.to_string()])
                    .output()
                    .await;

                // Also kill any remaining chrome.exe processes
                let _ = Command::new("taskkill")
                    .args(["/F", "/IM", "chrome.exe"])
                    .output()
                    .await;
            }

            #[cfg(not(target_os = "windows"))]
            {
                // Kill the process group on Unix
                let _ = Command::new("pkill")
                    .args(["-P", &pid.to_string()])
                    .output()
                    .await;
            }

            logger::log_info("  ✓ Chrome process tree killed").await;
        }

        // Step 6: Wait a moment for processes to fully terminate
        sleep(Duration::from_millis(500)).await;

        logger::log_info(&format!("✓ ChromeInstance {} shut down", self.instance_id)).await;
        Ok(())
    }

    pub fn is_available(&self) -> bool {
        if self.max_tasks_per_instance == 0 {
            return true; // No limit
        }
        self.task_count < self.max_tasks_per_instance
    }

    pub fn tasks_remaining(&self) -> usize {
        if self.max_tasks_per_instance == 0 {
            return usize::MAX;
        }
        self.max_tasks_per_instance.saturating_sub(self.task_count)
    }

    /// Spawns the actual `chromedriver` binary and waits for it to become ready.
    async fn spawn_chromedriver() -> Result<(String, Child, JoinHandle<()>)> {
        let mut process = Command::new("chromedriver-win64/chromedriver.exe")
            .arg("--port=0") // let the OS choose a free port
            .stdout(Stdio::piped())
            .stderr(Stdio::piped())
            .spawn()
            .context("Failed to start chromedriver-win64/chromedriver.exe. Is the binary present?")?;

        let stdout = process.stdout.take().unwrap();
        let stderr = process.stderr.take().unwrap();

        let stdout_reader = BufReader::new(stdout);
        let mut stdout_lines = stdout_reader.lines();

        // Forward ChromeDriver's stderr to the application log in the background.
        let stderr_reader = BufReader::new(stderr);
        let stderr_handle = tokio::spawn(async move {
            let mut lines = stderr_reader.lines();
            while let Ok(Some(line)) = lines.next_line().await {
                let t = line.trim();
                if !t.is_empty() {
                    let _ = crate::util::logger::log_info(&format!("ChromeDriver: {}", t)).await;
                }
            }
        });

        // Read stdout until the listening address and the readiness message appear.
        let start = tokio::time::Instant::now();
        let mut address: Option<String> = None;

        while start.elapsed() < Duration::from_secs(30) {
            if let Ok(Ok(Some(line))) = timeout(Duration::from_secs(1), stdout_lines.next_line()).await {
                if let Some(addr) = parse_chromedriver_address(&line) {
                    address = Some(addr);
                }
                if line.contains("ChromeDriver was started successfully") && address.is_some() {
                    return Ok((address.unwrap(), process, stderr_handle));
                }
            }
            sleep(Duration::from_millis(100)).await;
        }

        let _ = process.kill().await;
        stderr_handle.abort();
        Err(anyhow!("ChromeDriver failed to start within 30s"))
    }

    fn chrome_args_with_ua(&self, user_agent: &str, proxy_url: &Option<String>) -> Map<String, Value> {
        let mut args = vec![
            "--headless=new".to_string(),
            "--disable-gpu".to_string(),
            "--no-sandbox".to_string(),
            "--disable-dev-shm-usage".to_string(),
            "--disable-infobars".to_string(),
            "--disable-extensions".to_string(),
            "--disable-popup-blocking".to_string(),
            "--disable-notifications".to_string(),
            "--disable-autofill".to_string(),
            "--disable-sync".to_string(),
            "--disable-default-apps".to_string(),
            "--disable-translate".to_string(),
            "--disable-blink-features=AutomationControlled".to_string(),
            format!("--user-agent={}", user_agent),
        ];

        if let Some(proxy) = proxy_url {
            args.push(format!("--proxy-server={}", proxy));
        }

        let caps = serde_json::json!({
            "goog:chromeOptions": {
                "args": args,
                "excludeSwitches": ["enable-logging", "enable-automation"],
                "prefs": {
                    "profile.default_content_setting_values.notifications": 2
                }
            }
        });
        caps.as_object().cloned().unwrap()
    }

    pub fn chrome_user_agent() -> &'static str {
        static UAS: &[&str] = &[
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.91 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.6312.122 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.6261.129 Safari/537.36",
            "Mozilla/5.0 (Windows NT 11.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.91 Safari/537.36",
        ];
        choose_random(UAS)
    }
}

impl Drop for ChromeInstance {
    fn drop(&mut self) {
        // Signal both ChromeDriver and Chrome to terminate
        let _ = self.process.start_kill();

        // Also try to kill Chrome if we know the PID (fire and forget - best-effort cleanup)
        #[cfg(target_os = "windows")]
        {
            if let Some(pid) = self.process.id() {
                let _ = std::process::Command::new("taskkill")
                    .args(["/F", "/T", "/PID", &pid.to_string()])
                    .output();
            }
        }
    }
}

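/// Extracts the ChromeDriver listening address from a line of its startup output.
///
/// The primary pattern looked for is the standard startup banner of the form
/// `Starting ChromeDriver <version> (<revision>) on port <port>`; a looser
/// fallback accepts any line that mentions "port" together with a plausible
/// port number.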
fn parse_chromedriver_address(line: &str) -> Option<String> {
    if line.contains("Starting ChromeDriver") {
        if let Some(port_str) = line.split("on port ").nth(1) {
            if let Some(port) = port_str.split_whitespace().next() {
                if port.parse::<u16>().is_ok() {
                    return Some(format!("http://localhost:{}", port));
                }
            }
        }
    }
    for word in line.split_whitespace() {
        if let Ok(port) = word.trim_matches(|c: char| !c.is_numeric()).parse::<u16>() {
            if port > 1024 && port < 65535 && line.to_lowercase().contains("port") {
                return Some(format!("http://localhost:{}", port));
            }
        }
    }
    None
}

/// Simplified task execution - uses the pool pattern.
pub struct ScrapeTask<T> {
    url: String,
    parse: Box<
        dyn FnOnce(Client) -> Pin<Box<dyn std::future::Future<Output = Result<T>> + Send>> + Send,
    >,
}

impl<T: Send + 'static> ScrapeTask<T> {
    pub fn new<F, Fut>(url: String, parse: F) -> Self
    where
        F: FnOnce(Client) -> Fut + Send + 'static,
        Fut: std::future::Future<Output = Result<T>> + Send + 'static,
    {
        Self {
            url,
            parse: Box::new(move |client| Box::pin(parse(client))),
        }
    }
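
    /// Runs this task on the given pool. Rough usage sketch (URL and closure body
    /// are illustrative; the pool still performs the navigation itself before the
    /// closure runs):
    ///
    /// ```ignore
    /// let task = ScrapeTask::new("https://example.com".to_string(), |client| async move {
    ///     Ok(client.source().await?)
    /// });
    /// let html: String = task.execute_with_pool(&pool).await?;
    /// ```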
    pub async fn execute_with_pool(self, pool: &ChromeDriverPool) -> Result<T> {
        let url = self.url;
        let parse = self.parse;

        pool.execute(url, move |client| async move { (parse)(client).await })
            .await
    }
}