Added atomic writer action for Ctrl+C abort

This commit is contained in:
2025-12-19 14:12:56 +01:00
parent cd91de253b
commit b366f366e6
26 changed files with 3317 additions and 666 deletions

View File

@@ -1,10 +1,8 @@
// src/scraper/webdriver.rs
use super::helpers::*;
use anyhow::{anyhow, Context, Result};
use fantoccini::{Client, ClientBuilder};
use rand::seq::{IndexedRandom};
use rand::rngs::ThreadRng;
use rand::Rng; // for the RNG trait
use serde_json::{Map, Value};
use std::pin::Pin;
use std::process::Stdio;
@@ -16,6 +14,7 @@ use tokio::task::JoinHandle;
use tokio::sync::{Mutex, Semaphore};
use tokio::time::{sleep, timeout, Duration};
use crate::scraper::docker_vpn_proxy::{DockerVpnProxyPool};
use crate::Config;
/// Manages a pool of ChromeDriver instances for parallel scraping with optional VPN binding.
pub struct ChromeDriverPool {
@@ -30,25 +29,28 @@ pub struct ChromeDriverPool {
last_request_time: Arc<Mutex<Instant>>,
min_request_interval_ms: u64,
monitoring: Option<crate::monitoring::MonitoringHandle>,
}
impl ChromeDriverPool {
/// Creates a new pool without any proxy (direct connection).
pub async fn _new(pool_size: usize) -> Result<Self> {
Self::new_with_proxy_and_task_limit(pool_size, None, 0).await
pub async fn _new(config: &Config, monitoring: Option<crate::monitoring::MonitoringHandle>,) -> Result<Self> {
Self::new_with_proxy_and_task_limit(None, config, monitoring).await
}
/// Creates a new pool with task-per-instance limit but no proxy.
pub async fn _new_with_task_limit(pool_size: usize, max_tasks_per_instance: usize) -> Result<Self> {
Self::new_with_proxy_and_task_limit(pool_size, None, max_tasks_per_instance).await
pub async fn _new_with_task_limit(config: &Config, monitoring: Option<crate::monitoring::MonitoringHandle>,) -> Result<Self> {
Self::new_with_proxy_and_task_limit(None, config, monitoring).await
}
/// Creates a new pool where each Chrome instance uses a different SOCKS5 proxy from the Docker pool.
pub async fn new_with_proxy(
pool_size: usize,
proxy_pool: Option<Arc<DockerVpnProxyPool>>,
config: &Config,
monitoring: Option<crate::monitoring::MonitoringHandle>,
) -> Result<Self> {
Self::new_with_proxy_and_task_limit(pool_size, proxy_pool, 0).await
Self::new_with_proxy_and_task_limit(proxy_pool, config, monitoring).await
}
/// Full constructor: supports proxy + task limiting + rotation.
@@ -62,10 +64,13 @@ impl ChromeDriverPool {
///
/// Uses the minimum of these constraints to determine actual pool size.
pub async fn new_with_proxy_and_task_limit(
pool_size_limit: usize,
proxy_pool: Option<Arc<DockerVpnProxyPool>>,
max_tasks_per_instance: usize,
config: &Config,
monitoring: Option<crate::monitoring::MonitoringHandle>,
) -> Result<Self> {
let pool_size_limit = config.max_parallel_instances;
let task_per_instance_limit = config.max_tasks_per_instance;
// Determine actual pool size based on available resources
let actual_pool_size = if let Some(ref pp) = proxy_pool {
let available_proxies = pp.num_proxies();
@@ -79,7 +84,7 @@ impl ChromeDriverPool {
}
// Rotation is enabled when task limiting is active
let rotation_enabled = max_tasks_per_instance > 0;
let rotation_enabled = task_per_instance_limit > 0;
let mut instances = Vec::with_capacity(actual_pool_size);
@@ -102,13 +107,61 @@ impl ChromeDriverPool {
let instance = ChromeInstance::new(
proxy_pool.clone(), // Clone the Arc
i, // This instance's proxy index
max_tasks_per_instance
config,
monitoring.clone(),
).await?;
crate::util::logger::log_info(&format!(" Instance {} ready", i + 1)).await;
instances.push(Arc::new(Mutex::new(instance)));
}
// Emit instance created events
for (i, instance) in instances.iter().enumerate() {
if let Some(ref mon) = monitoring {
let guard = instance.lock().await;
// Extract proxy info if available
let proxy_info = if let Some(ref pp) = proxy_pool {
pp.get_proxy_info(i % pp.num_proxies())
} else {
guard.proxy_url.as_ref().and_then(|url| {
// Parse proxy URL manually if no pool
// Format: socks5://localhost:10801
if let Some(port_str) = url.split(':').last() {
if let Ok(port) = port_str.parse::<u16>() {
return Some(crate::monitoring::ProxyInfo {
container_name: format!("proxy-{}", i),
ip_address: "127.0.0.1".to_string(),
port,
status: crate::monitoring::ProxyStatus::Connected,
});
}
}
None
})
};
mon.emit(crate::monitoring::MonitoringEvent::InstanceCreated {
instance_id: i,
max_tasks: guard.max_tasks_per_instance,
proxy: proxy_info.clone(), // ✅ Now includes actual proxy info
});
// Also emit ProxyConnected event if proxy exists
if let Some(ref proxy) = proxy_info {
mon.emit(crate::monitoring::MonitoringEvent::ProxyConnected {
container_name: proxy.container_name.clone(),
ip_address: proxy.ip_address.clone(),
port: proxy.port,
});
}
drop(guard);
}
}
let min_request_interval_ms = config.min_request_interval_ms;
Ok(Self {
instances,
semaphore: Arc::new(Semaphore::new(actual_pool_size)),
@@ -116,7 +169,8 @@ impl ChromeDriverPool {
rotation_enabled,
next_instance: Arc::new(Mutex::new(0)),
last_request_time: Arc::new(Mutex::new(Instant::now())),
min_request_interval_ms: 300,
min_request_interval_ms,
monitoring,
})
}
@@ -145,13 +199,25 @@ impl ChromeDriverPool {
}
}
let random_index = random_range(0, self.instances.len() as u64) as usize;
// Index selection (simplified; see below for the full rotation logic)
let index = if self.rotation_enabled {
self.get_rotated_index().await?
} else {
rand::rng().random_range(0..self.instances.len())
random_index
};
if let Some(ref mon) = self.monitoring {
mon.emit(crate::monitoring::MonitoringEvent::TaskStarted {
instance_id: index,
url: url.clone(),
});
mon.emit(crate::monitoring::MonitoringEvent::InstanceStatusChanged {
instance_id: index,
status: crate::monitoring::InstanceStatusChange::Active,
});
}
let instance = &self.instances[index];
let mut guard = instance.lock().await;
@@ -168,6 +234,8 @@ impl ChromeDriverPool {
drop(guard); // Lock freigeben vor Navigation
let start_time = Instant::now();
// Navigate with a timeout
let navigation_result = timeout(
Duration::from_secs(60),
@@ -176,8 +244,20 @@ impl ChromeDriverPool {
match navigation_result {
Ok(Ok(_)) => {
if let Some(ref mon) = self.monitoring {
    // This arm is only entered once navigation has succeeded, so the outcome
    // is known statically: success, with no error to report. (Re-inspecting
    // `navigation_result` here could never yield anything else.)
    mon.emit(crate::monitoring::MonitoringEvent::TaskCompleted {
        instance_id: index,
        success: true,
        duration_ms: start_time.elapsed().as_millis() as u64,
        error: None,
    });
    // Emitted before the parse callback runs, exactly as before.
    mon.emit(crate::monitoring::MonitoringEvent::InstanceStatusChanged {
        instance_id: index,
        status: crate::monitoring::InstanceStatusChange::Idle,
    });
}
crate::util::logger::log_info(&format!("✓ Navigated to {}", url)).await;
// Run the parse function
parse(client).await
}
@@ -186,6 +266,13 @@ impl ChromeDriverPool {
Err(anyhow!("Navigation failed: {}", e))
}
Err(_) => {
if let Some(ref mon) = self.monitoring {
mon.emit(crate::monitoring::MonitoringEvent::NavigationTimeout {
instance_id: index,
url: url.clone(),
});
}
crate::util::logger::log_error("Navigation timeout (60s)").await;
Err(anyhow!("Navigation timeout"))
}
@@ -285,18 +372,21 @@ pub struct ChromeInstance {
max_requests_per_session: usize, // e.g. 25
proxy_pool: Option<Arc<DockerVpnProxyPool>>, // Reference to the proxy pool
current_proxy_index: Arc<Mutex<usize>>, // Current proxy index in use
current_proxy_index: Arc<Mutex<usize>>, // Current proxy index in use
instance_id: usize,
monitoring: Option<crate::monitoring::MonitoringHandle>,
}
impl ChromeInstance {
pub async fn new(
proxy_pool: Option<Arc<DockerVpnProxyPool>>,
initial_proxy_index: usize,
max_tasks_per_instance: usize) -> Result<Self> {
pub async fn new(proxy_pool: Option<Arc<DockerVpnProxyPool>>, instance_id: usize, config: &Config, monitoring: Option<crate::monitoring::MonitoringHandle>) -> Result<Self> {
let (base_url, process, stderr_handle) = Self::spawn_chromedriver().await?;
// Get proxy URL if proxy pool is provided
let proxy_url = proxy_pool.as_ref().map(|pp| pp.get_proxy_url(initial_proxy_index));
let proxy_url = proxy_pool.as_ref().map(|pp| pp.get_proxy_url(instance_id));
let max_tasks_per_instance = config.max_tasks_per_instance;
let max_requests_per_session = config.max_requests_per_session;
Ok(Self {
base_url,
@@ -308,16 +398,21 @@ impl ChromeInstance {
current_session: Arc::new(Mutex::new(None)),
session_request_count: Arc::new(Mutex::new(0)),
max_requests_per_session: 25, // Konfigurierbar machen!
max_requests_per_session,
proxy_pool,
current_proxy_index: Arc::new(Mutex::new(initial_proxy_index)),
current_proxy_index: Arc::new(Mutex::new(instance_id)),
instance_id,
monitoring,
})
}
pub async fn get_or_renew_session(&self) -> Result<Client> {
let mut session_opt = self.current_session.lock().await;
let mut request_count = self.session_request_count.lock().await;
let old_request_count = *request_count;
// Renew the session when:
// 1. No session exists
@@ -325,12 +420,20 @@ impl ChromeInstance {
let needs_renewal = session_opt.is_none() || *request_count >= self.max_requests_per_session;
if needs_renewal {
if let Some(ref mon) = self.monitoring {
mon.emit(crate::monitoring::MonitoringEvent::InstanceStatusChanged {
instance_id: self.instance_id,
status: crate::monitoring::InstanceStatusChange::Renewing,
});
}
// Close the old session
if let Some(old_session) = session_opt.take() {
crate::util::logger::log_info("Closing old session").await;
let _ = old_session.close().await;
// Short pause between sessions
sleep(Duration::from_millis(rand::rng().random_range(500..1000))).await;
let random_delay = random_range(500, 1000);
sleep(Duration::from_millis(random_delay)).await;
}
// Create a new session with a fresh user agent
@@ -342,6 +445,41 @@ impl ChromeInstance {
let new_session = self.create_fresh_session().await?;
*session_opt = Some(new_session.clone());
*request_count = 0;
if let Some(ref mon) = self.monitoring {
    // Classify the renewal from the count captured *before* the counter was
    // reset just above. The live counter is already 0 here, so reading it
    // would always report a zero count and could never hit the request limit.
    let reason = if old_request_count >= self.max_requests_per_session {
        crate::monitoring::RenewalReason::RequestLimit
    } else {
        crate::monitoring::RenewalReason::TaskLimit
    };

    // Look up the proxy now in use: ask the pool when there is one, otherwise
    // derive the details from this instance's proxy URL.
    let new_proxy_info = match self.proxy_pool {
        Some(ref pp) => {
            let proxy_idx = *self.current_proxy_index.lock().await;
            pp.get_proxy_info(proxy_idx)
        }
        None => self.proxy_url.as_ref().and_then(|url| {
            // The port is whatever follows the final ':' in the URL.
            let port = url.rsplit(':').next()?.parse::<u16>().ok()?;
            Some(crate::monitoring::ProxyInfo {
                container_name: format!("proxy-{}", self.instance_id),
                ip_address: "127.0.0.1".to_string(),
                port,
                status: crate::monitoring::ProxyStatus::Connected,
            })
        }),
    };

    mon.emit(crate::monitoring::MonitoringEvent::SessionRenewed {
        instance_id: self.instance_id,
        // Pre-reset request count of the session that was just replaced.
        old_request_count,
        reason,
        new_proxy: new_proxy_info,
    });
}
Ok(new_session)
} else {
@@ -412,11 +550,6 @@ impl ChromeInstance {
caps.as_object().cloned().unwrap()
}
pub async fn new_session(&self) -> Result<Client> {
// Für Backward-Compatibility, aber sollte get_or_renew_session() nutzen!
self.create_fresh_session().await
}
pub fn reset_task_count(&mut self) {
self.task_count = 0;
}
@@ -491,52 +624,15 @@ impl ChromeInstance {
Err(anyhow!("ChromeDriver failed to start within 30s"))
}
fn chrome_args(&self) -> Map<String, Value> {
let user_agent = Self::chrome_user_agent();
let mut args = vec![
"--headless=new".to_string(),
"--disable-gpu".to_string(),
"--no-sandbox".to_string(),
"--disable-dev-shm-usage".to_string(),
"--disable-infobars".to_string(),
"--disable-extensions".to_string(),
"--disable-popup-blocking".to_string(),
"--disable-notifications".to_string(),
//"--disable-logging".to_string(),
"--disable-autofill".to_string(),
"--disable-sync".to_string(),
"--disable-default-apps".to_string(),
"--disable-translate".to_string(),
//"--window-size=1920,1080".to_string(),
"--disable-blink-features=AutomationControlled".to_string(),
format!("--user-agent={}", user_agent),
];
if let Some(ref proxy) = self.proxy_url {
let proxy = proxy.clone();
let proxy_formatted = format!("--proxy-server={}", proxy);
args.push(proxy_formatted);
}
let caps = serde_json::json!({
"goog:chromeOptions": {
"args": args,
"excludeSwitches": ["enable-logging", "enable-automation"],
"prefs": {
"profile.default_content_setting_values.notifications": 2
}
}
});
caps.as_object().cloned().unwrap()
}
pub fn chrome_user_agent() -> &'static str {
static UAS: &[&str] = &[
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.91 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.6312.122 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.6261.129 Safari/537.36",
"Mozilla/5.0 (Windows NT 11.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.91 Safari/537.36",
];
let mut rng = ThreadRng::default(); // non-deprecated RNG
*UAS.choose(&mut rng).unwrap()
let random_user_agent = choose_random(UAS);
random_user_agent
}
}