added hard reset for navigation timeout after 3 hours

This commit is contained in:
2025-12-22 00:31:28 +01:00
parent c01b47000f
commit fb0876309f
12 changed files with 1036 additions and 264 deletions

View File

@@ -1,5 +1,9 @@
// src/scraper/webdriver.rs
use super::helpers::*;
use super::hard_reset::HardResetController;
use super::docker_vpn_proxy::DockerVpnProxyPool;
use crate::Config;
use crate::logger;
use anyhow::{anyhow, Context, Result};
use fantoccini::{Client, ClientBuilder};
@@ -13,8 +17,6 @@ use tokio::process::{Child, Command};
use tokio::task::JoinHandle;
use tokio::sync::{Mutex, Semaphore};
use tokio::time::{sleep, timeout, Duration};
use crate::scraper::docker_vpn_proxy::{DockerVpnProxyPool};
use crate::Config;
/// Manages a pool of ChromeDriver instances for parallel scraping with optional VPN binding.
pub struct ChromeDriverPool {
@@ -31,10 +33,16 @@ pub struct ChromeDriverPool {
min_request_interval_ms: u64,
monitoring: Option<crate::monitoring::MonitoringHandle>,
hard_reset_controller: Arc<HardResetController>,
config: Arc<Config>,
}
impl ChromeDriverPool {
/// Creates a new pool without any proxy (direct connection).
/// When consecutive errors reach this value, execute() will return a special error
/// that signals the caller to trigger a hard reset
const HARD_RESET_ERROR_THRESHOLD: usize = 12;
/// Creates a new pool without any proxy (direct connection).
///
/// Convenience wrapper that delegates to `new_with_proxy_and_task_limit`
/// with no proxy pool, so every instance connects directly.
pub async fn _new(config: &Config, monitoring: Option<crate::monitoring::MonitoringHandle>,) -> Result<Self> {
    Self::new_with_proxy_and_task_limit(None, config, monitoring).await
}
@@ -85,6 +93,11 @@ impl ChromeDriverPool {
// Rotation is enabled when task limiting is active
let rotation_enabled = task_per_instance_limit > 0;
let half_size = if rotation_enabled {
(actual_pool_size + 1) / 2 // Runde auf bei ungerader Zahl
} else {
actual_pool_size
};
let mut instances = Vec::with_capacity(actual_pool_size);
@@ -105,8 +118,8 @@ impl ChromeDriverPool {
for i in 0..actual_pool_size {
// Pass the entire proxy_pool and the index
let instance = ChromeInstance::new(
proxy_pool.clone(), // Clone the Arc
i, // This instance's proxy index
proxy_pool.clone(),
i,
config,
monitoring.clone(),
).await?;
@@ -162,15 +175,21 @@ impl ChromeDriverPool {
let min_request_interval_ms = config.min_request_interval_ms;
let hard_reset_controller = Arc::new(HardResetController::new());
let config_clone = Arc::new(config.clone());
Ok(Self {
instances,
semaphore: Arc::new(Semaphore::new(actual_pool_size)),
semaphore: Arc::new(Semaphore::new(half_size)),
proxy_pool,
rotation_enabled,
next_instance: Arc::new(Mutex::new(0)),
last_request_time: Arc::new(Mutex::new(Instant::now())),
min_request_interval_ms,
monitoring,
hard_reset_controller,
config: config_clone,
})
}
@@ -188,10 +207,8 @@ impl ChromeDriverPool {
if elapsed < self.min_request_interval_ms {
let wait_ms = self.min_request_interval_ms - elapsed;
drop(last_time); // Lock vor Sleep freigeben!
drop(last_time);
sleep(Duration::from_millis(wait_ms)).await;
let mut last_time = self.last_request_time.lock().await;
*last_time = Instant::now();
} else {
@@ -199,12 +216,20 @@ impl ChromeDriverPool {
}
}
let random_index = random_range(0, self.instances.len() as u64) as usize;
// Index-Auswahl (vereinfacht, siehe unten für vollständige Rotation)
let index = if self.rotation_enabled {
self.get_rotated_index().await?
let instance = if self.rotation_enabled {
self.select_instance_with_rotation().await?
} else {
random_index
self.select_instance_round_robin().await
};
{
let mut inst = instance.lock().await;
inst.increment_task_count();
}
let index: usize = {
let instances = &self.instances;
instances.iter().position(|inst| Arc::ptr_eq(inst, &instance)).unwrap_or(0)
};
if let Some(ref mon) = self.monitoring {
@@ -216,15 +241,10 @@ impl ChromeDriverPool {
instance_id: index,
status: crate::monitoring::InstanceStatusChange::Active,
});
}
};
let instance = &self.instances[index];
let mut guard = instance.lock().await;
// NEU: Session mit automatischer Erneuerung holen!
let client = guard.get_or_renew_session().await?;
guard.increment_task_count();
let (task_count, session_requests) = guard.get_session_stats().await;
crate::util::logger::log_info(&format!(
@@ -232,17 +252,17 @@ impl ChromeDriverPool {
index, task_count, guard.max_tasks_per_instance, session_requests
)).await;
drop(guard); // Lock freigeben vor Navigation
drop(guard);
let start_time = Instant::now();
// Navigation mit Timeout
// Navigation with timeout
let navigation_result = timeout(
Duration::from_secs(60),
client.goto(&url)
).await;
match navigation_result {
let result = match navigation_result {
Ok(Ok(_)) => {
if let Some(ref mon) = self.monitoring {
mon.emit(crate::monitoring::MonitoringEvent::TaskCompleted {
@@ -258,14 +278,111 @@ impl ChromeDriverPool {
}
crate::util::logger::log_info(&format!("✓ Navigated to {}", url)).await;
// Parse-Funktion ausführen
parse(client).await
// Execute parse function
match parse(client).await {
Ok(data) => {
// ✅ SUCCESS: Record and log
let prev_count = self.hard_reset_controller.get_count();
self.hard_reset_controller.record_success();
if prev_count > 0 {
logger::log_info(&format!(
"✓ Success - reset counter cleared (was: {}/{})",
prev_count,
Self::HARD_RESET_ERROR_THRESHOLD
)).await;
}
Ok(data)
}
Err(e) => {
// ❌ PARSE ERROR: Record, check threshold, invalidate session
let error_count = self.hard_reset_controller.record_error();
{
let mut inst = instance.lock().await;
inst.invalidate_current_session().await;
}
// Enhanced logging with threshold status
let threshold_pct = (error_count as f64 / Self::HARD_RESET_ERROR_THRESHOLD as f64) * 100.0;
logger::log_warn(&format!(
"Parse error. Reset counter: {}/{} ({:.0}%)",
error_count,
Self::HARD_RESET_ERROR_THRESHOLD,
threshold_pct
)).await;
// Check if threshold reached
if error_count >= Self::HARD_RESET_ERROR_THRESHOLD {
logger::log_error(&format!(
"🔴 HARD RESET THRESHOLD REACHED ({}/{})",
error_count,
Self::HARD_RESET_ERROR_THRESHOLD
)).await;
return Err(anyhow!(
"HARD_RESET_REQUIRED: Parse failed: {}. Threshold reached ({}/{})",
e,
error_count,
Self::HARD_RESET_ERROR_THRESHOLD
));
}
Err(anyhow!(
"Parse failed: {}. Hard reset at {}/{}",
e,
error_count,
Self::HARD_RESET_ERROR_THRESHOLD
))
}
}
}
Ok(Err(e)) => {
// ❌ NAVIGATION ERROR: Record, check threshold, invalidate session
crate::util::logger::log_error(&format!("Navigation failed: {}", e)).await;
Err(anyhow!("Navigation failed: {}", e))
{
let mut inst = instance.lock().await;
inst.invalidate_current_session().await;
}
let error_count = self.hard_reset_controller.record_error();
// Enhanced logging
let threshold_pct = (error_count as f64 / Self::HARD_RESET_ERROR_THRESHOLD as f64) * 100.0;
logger::log_warn(&format!(
"Navigation error. Reset counter: {}/{} ({:.0}%)",
error_count,
Self::HARD_RESET_ERROR_THRESHOLD,
threshold_pct
)).await;
// Check if threshold reached
if error_count >= Self::HARD_RESET_ERROR_THRESHOLD {
logger::log_error(&format!(
"🔴 HARD RESET THRESHOLD REACHED ({}/{})",
error_count,
Self::HARD_RESET_ERROR_THRESHOLD
)).await;
return Err(anyhow!(
"HARD_RESET_REQUIRED: Navigation failed: {}. Threshold reached ({}/{})",
e,
error_count,
Self::HARD_RESET_ERROR_THRESHOLD
));
}
Err(anyhow!(
"Navigation failed: {}. Hard reset at {}/{}",
e,
error_count,
Self::HARD_RESET_ERROR_THRESHOLD
))
}
Err(_) => {
// ❌ TIMEOUT ERROR: Record, check threshold, invalidate session
if let Some(ref mon) = self.monitoring {
mon.emit(crate::monitoring::MonitoringEvent::NavigationTimeout {
instance_id: index,
@@ -273,55 +390,138 @@ impl ChromeDriverPool {
});
}
let error_count = self.hard_reset_controller.record_error();
crate::util::logger::log_error("Navigation timeout (60s)").await;
Err(anyhow!("Navigation timeout"))
{
let mut inst = instance.lock().await;
inst.invalidate_current_session().await;
}
// Enhanced logging
let threshold_pct = (error_count as f64 / Self::HARD_RESET_ERROR_THRESHOLD as f64) * 100.0;
logger::log_warn(&format!(
"Timeout error. Reset counter: {}/{} ({:.0}%)",
error_count,
Self::HARD_RESET_ERROR_THRESHOLD,
threshold_pct
)).await;
// Check if threshold reached
if error_count >= Self::HARD_RESET_ERROR_THRESHOLD {
logger::log_error(&format!(
"🔴 HARD RESET THRESHOLD REACHED ({}/{})",
error_count,
Self::HARD_RESET_ERROR_THRESHOLD
)).await;
return Err(anyhow!(
"HARD_RESET_REQUIRED: Navigation timeout. Threshold reached ({}/{})",
error_count,
Self::HARD_RESET_ERROR_THRESHOLD
));
}
Err(anyhow!(
"Navigation timeout. Hard reset at {}/{}",
error_count,
Self::HARD_RESET_ERROR_THRESHOLD
))
}
};
{
let mut inst = instance.lock().await;
inst.task_count = inst.task_count.saturating_sub(1);
}
result
}
async fn get_rotated_index(&self) -> Result<usize> {
let total = self.instances.len();
let half_size = total / 2;
/// Pick the next instance in plain round-robin order (used when
/// half-pool rotation is disabled). The cursor lock is held only
/// long enough to read and advance the index.
async fn select_instance_round_robin(&self) -> Arc<Mutex<ChromeInstance>> {
    let chosen = {
        let mut cursor = self.next_instance.lock().await;
        let current = *cursor;
        *cursor = (current + 1) % self.instances.len();
        current
    };
    Arc::clone(&self.instances[chosen])
}
/// Round-robin with half-pool rotation
async fn select_instance_with_rotation(&self) -> Result<Arc<Mutex<ChromeInstance>>> {
let pool_size = self.instances.len();
let half_size = pool_size / 2;
if half_size == 0 {
return Ok(0); // Pool zu klein für Rotation
// Pool too small for rotation, fall back to simple round-robin
return Ok(self.select_instance_round_robin().await);
}
let mut next_idx = self.next_instance.lock().await;
let current_half_start = if *next_idx < half_size { 0 } else { half_size };
let current_half_end = if *next_idx < half_size { half_size } else { total };
let mut next = self.next_instance.lock().await;
let current_half_start = (*next / half_size) * half_size;
let current_half_end = (current_half_start + half_size).min(pool_size);
// Suche verfügbare Instanz in aktueller Hälfte
for offset in 0..(current_half_end - current_half_start) {
let candidate_idx = current_half_start + ((*next_idx + offset) % half_size);
// Try to find available instance in current half
let mut attempts = 0;
let max_attempts = half_size * 2; // Try both halves
while attempts < max_attempts {
let index = current_half_start + (*next % half_size);
let instance = &self.instances[index];
let instance = &self.instances[candidate_idx];
let guard = instance.lock().await;
// Check if instance can accept more tasks
let mut inst = instance.lock().await;
let can_accept = inst.get_task_count() < inst.max_tasks_per_instance;
drop(inst);
if guard.max_tasks_per_instance == 0 ||
guard.task_count < guard.max_tasks_per_instance {
*next_idx = (candidate_idx + 1) % total;
drop(guard);
return Ok(candidate_idx);
if can_accept {
*next = (*next + 1) % pool_size;
drop(next);
if let Some(ref mon) = self.monitoring {
mon.emit(crate::monitoring::MonitoringEvent::InstanceSelected {
instance_id: index,
half: if index < half_size { 1 } else { 2 },
});
}
return Ok(Arc::clone(instance));
}
// Current half saturated, try other half
if attempts == half_size - 1 {
logger::log_info("Current half saturated, rotating to other half").await;
*next = if current_half_start == 0 { half_size } else { 0 };
} else {
*next = (*next + 1) % pool_size;
}
attempts += 1;
}
// Aktuelle Hälfte voll → Zur anderen wechseln
crate::util::logger::log_info("Current half saturated, rotating to other half").await;
drop(next);
let new_half_start = if current_half_start == 0 { half_size } else { 0 };
let new_half_end = if current_half_start == 0 { total } else { half_size };
// Alte Hälfte zurücksetzen (für nächste Rotation)
for i in current_half_start..current_half_end {
let mut instance = self.instances[i].lock().await;
instance.reset_task_count();
}
*next_idx = new_half_start;
drop(next_idx);
Ok(new_half_start)
// All instances saturated
Err(anyhow!("All instances at task capacity"))
}
/// Returns a shared handle (`Arc` clone) to the pool's hard-reset
/// controller, so callers can inspect or clear the consecutive-error
/// counter from outside the pool.
pub fn get_reset_controller(&self) -> Arc<HardResetController> {
    Arc::clone(&self.hard_reset_controller)
}
/// True once the consecutive-error counter has climbed to (or past)
/// the hard-reset threshold.
pub fn should_perform_hard_reset(&self) -> bool {
    let errors = self.hard_reset_controller.get_count();
    errors >= Self::HARD_RESET_ERROR_THRESHOLD
}
/// Snapshot for monitoring: `(current_consecutive_errors, threshold)`.
pub fn get_reset_status(&self) -> (usize, usize) {
    let errors = self.hard_reset_controller.get_count();
    (errors, Self::HARD_RESET_ERROR_THRESHOLD)
}
/// Gracefully shut down all ChromeDriver processes and Docker proxy containers.
@@ -369,7 +569,7 @@ pub struct ChromeInstance {
current_session: Arc<Mutex<Option<Client>>>, // Current active session
session_request_count: Arc<Mutex<usize>>,
max_requests_per_session: usize, // z.B. 25
max_requests_per_session: usize,
proxy_pool: Option<Arc<DockerVpnProxyPool>>, // Reference to the proxy pool
current_proxy_index: Arc<Mutex<usize>>, // Current proxy index in use
@@ -411,8 +611,6 @@ impl ChromeInstance {
pub async fn get_or_renew_session(&self) -> Result<Client> {
let mut session_opt = self.current_session.lock().await;
let mut request_count = self.session_request_count.lock().await;
let old_request_count = *request_count;
// Session erneuern wenn:
// 1. Keine Session vorhanden
@@ -476,7 +674,7 @@ impl ChromeInstance {
mon.emit(crate::monitoring::MonitoringEvent::SessionRenewed {
instance_id: self.instance_id,
old_request_count: *request_count,
reason: crate::monitoring::RenewalReason::RequestLimit,
reason: reason,
new_proxy: new_proxy_info,
});
}
@@ -490,15 +688,21 @@ impl ChromeInstance {
}
async fn create_fresh_session(&self) -> Result<Client> {
// Hole aktuellen Proxy-URL ohne self zu mutieren
let proxy_url = if let Some(ref pool) = self.proxy_pool {
let mut proxy_idx = self.current_proxy_index.lock().await;
*proxy_idx = (*proxy_idx + 1) % pool.num_proxies();
let url = pool.get_proxy_url(*proxy_idx);
let num_proxies = pool.num_proxies();
crate::util::logger::log_info(&format!(
"Using proxy {} for new session",
*proxy_idx
// Round-robin through all proxies
let selected_proxy = *proxy_idx % num_proxies;
*proxy_idx = (*proxy_idx + 1) % num_proxies;
let url = pool.get_proxy_url(selected_proxy);
logger::log_info(&format!(
"Instance {} creating session with proxy {}/{} (rotation)",
self.instance_id,
selected_proxy,
num_proxies
)).await;
Some(url)
@@ -516,38 +720,19 @@ impl ChromeInstance {
.context("Failed to connect to ChromeDriver")
}
fn chrome_args_with_ua(&self, user_agent: &str, proxy_url: &Option<String>) -> Map<String, Value> {
let mut args = vec![
"--headless=new".to_string(),
"--disable-gpu".to_string(),
"--no-sandbox".to_string(),
"--disable-dev-shm-usage".to_string(),
"--disable-infobars".to_string(),
"--disable-extensions".to_string(),
"--disable-popup-blocking".to_string(),
"--disable-notifications".to_string(),
"--disable-autofill".to_string(),
"--disable-sync".to_string(),
"--disable-default-apps".to_string(),
"--disable-translate".to_string(),
"--disable-blink-features=AutomationControlled".to_string(),
format!("--user-agent={}", user_agent),
];
pub async fn invalidate_current_session(&self) {
let mut session_opt = self.current_session.lock().await;
if let Some(proxy) = proxy_url {
args.push(format!("--proxy-server={}", proxy));
if let Some(old_session) = session_opt.take() {
crate::util::logger::log_info(&format!(
"Invalidating broken session for instance {}",
self.instance_id
)).await;
let _ = old_session.close().await;
}
let caps = serde_json::json!({
"goog:chromeOptions": {
"args": args,
"excludeSwitches": ["enable-logging", "enable-automation"],
"prefs": {
"profile.default_content_setting_values.notifications": 2
}
}
});
caps.as_object().cloned().unwrap()
let mut request_count = self.session_request_count.lock().await;
*request_count = 0;
}
pub fn reset_task_count(&mut self) {
@@ -578,6 +763,20 @@ impl ChromeInstance {
Ok(())
}
/// Whether this instance still has capacity for another task.
///
/// A `max_tasks_per_instance` of zero is treated as "no limit".
pub fn is_available(&self) -> bool {
    self.max_tasks_per_instance == 0 || self.task_count < self.max_tasks_per_instance
}
/// How many more tasks this instance may accept before reaching its
/// limit. Returns `usize::MAX` when no limit is configured (limit == 0).
pub fn tasks_remaining(&self) -> usize {
    match self.max_tasks_per_instance {
        0 => usize::MAX,
        limit => limit.saturating_sub(self.task_count),
    }
}
/// Spawns the actual `chromedriver` binary and waits for it to become ready.
async fn spawn_chromedriver() -> Result<(String, Child, JoinHandle<()>)> {
let mut process = Command::new("chromedriver-win64/chromedriver.exe")
@@ -624,6 +823,40 @@ impl ChromeInstance {
Err(anyhow!("ChromeDriver failed to start within 30s"))
}
/// Assemble the `goog:chromeOptions` capabilities map for a fresh
/// session: headless/stability flags, automation-hiding switches, the
/// supplied user agent and — when provided — a `--proxy-server` entry.
fn chrome_args_with_ua(&self, user_agent: &str, proxy_url: &Option<String>) -> Map<String, Value> {
    // Static flags shared by every session; UA and proxy are appended below.
    const BASE_FLAGS: [&str; 13] = [
        "--headless=new",
        "--disable-gpu",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--disable-infobars",
        "--disable-extensions",
        "--disable-popup-blocking",
        "--disable-notifications",
        "--disable-autofill",
        "--disable-sync",
        "--disable-default-apps",
        "--disable-translate",
        "--disable-blink-features=AutomationControlled",
    ];

    let mut args: Vec<String> = BASE_FLAGS.iter().map(|flag| flag.to_string()).collect();
    args.push(format!("--user-agent={}", user_agent));
    if let Some(proxy) = proxy_url {
        args.push(format!("--proxy-server={}", proxy));
    }

    let caps = serde_json::json!({
        "goog:chromeOptions": {
            "args": args,
            "excludeSwitches": ["enable-logging", "enable-automation"],
            "prefs": {
                "profile.default_content_setting_values.notifications": 2
            }
        }
    });
    // json! with an object literal always produces a JSON object.
    caps.as_object().cloned().unwrap()
}
pub fn chrome_user_agent() -> &'static str {
static UAS: &[&str] = &[
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.91 Safari/537.36",