Added atomic writer action for Ctrl+C abort
This commit is contained in:
@@ -1,10 +1,8 @@
|
||||
// src/scraper/webdriver.rs
|
||||
use super::helpers::*;
|
||||
|
||||
use anyhow::{anyhow, Context, Result};
|
||||
use fantoccini::{Client, ClientBuilder};
|
||||
use rand::seq::{IndexedRandom};
|
||||
use rand::rngs::ThreadRng;
|
||||
use rand::Rng; // for the RNG trait
|
||||
use serde_json::{Map, Value};
|
||||
use std::pin::Pin;
|
||||
use std::process::Stdio;
|
||||
@@ -16,6 +14,7 @@ use tokio::task::JoinHandle;
|
||||
use tokio::sync::{Mutex, Semaphore};
|
||||
use tokio::time::{sleep, timeout, Duration};
|
||||
use crate::scraper::docker_vpn_proxy::{DockerVpnProxyPool};
|
||||
use crate::Config;
|
||||
|
||||
/// Manages a pool of ChromeDriver instances for parallel scraping with optional VPN binding.
|
||||
pub struct ChromeDriverPool {
|
||||
@@ -30,25 +29,28 @@ pub struct ChromeDriverPool {
|
||||
|
||||
last_request_time: Arc<Mutex<Instant>>,
|
||||
min_request_interval_ms: u64,
|
||||
|
||||
monitoring: Option<crate::monitoring::MonitoringHandle>,
|
||||
}
|
||||
|
||||
impl ChromeDriverPool {
|
||||
/// Creates a new pool without any proxy (direct connection).
|
||||
pub async fn _new(pool_size: usize) -> Result<Self> {
|
||||
Self::new_with_proxy_and_task_limit(pool_size, None, 0).await
|
||||
pub async fn _new(config: &Config, monitoring: Option<crate::monitoring::MonitoringHandle>,) -> Result<Self> {
|
||||
Self::new_with_proxy_and_task_limit(None, config, monitoring).await
|
||||
}
|
||||
|
||||
/// Creates a new pool with task-per-instance limit but no proxy.
|
||||
pub async fn _new_with_task_limit(pool_size: usize, max_tasks_per_instance: usize) -> Result<Self> {
|
||||
Self::new_with_proxy_and_task_limit(pool_size, None, max_tasks_per_instance).await
|
||||
pub async fn _new_with_task_limit(config: &Config, monitoring: Option<crate::monitoring::MonitoringHandle>,) -> Result<Self> {
|
||||
Self::new_with_proxy_and_task_limit(None, config, monitoring).await
|
||||
}
|
||||
|
||||
/// Creates a new pool where each Chrome instance uses a different SOCKS5 proxy from the Docker pool.
|
||||
pub async fn new_with_proxy(
|
||||
pool_size: usize,
|
||||
proxy_pool: Option<Arc<DockerVpnProxyPool>>,
|
||||
config: &Config,
|
||||
monitoring: Option<crate::monitoring::MonitoringHandle>,
|
||||
) -> Result<Self> {
|
||||
Self::new_with_proxy_and_task_limit(pool_size, proxy_pool, 0).await
|
||||
Self::new_with_proxy_and_task_limit(proxy_pool, config, monitoring).await
|
||||
}
|
||||
|
||||
/// Full constructor: supports proxy + task limiting + rotation.
|
||||
@@ -62,10 +64,13 @@ impl ChromeDriverPool {
|
||||
///
|
||||
/// Uses the minimum of these constraints to determine actual pool size.
|
||||
pub async fn new_with_proxy_and_task_limit(
|
||||
pool_size_limit: usize,
|
||||
proxy_pool: Option<Arc<DockerVpnProxyPool>>,
|
||||
max_tasks_per_instance: usize,
|
||||
config: &Config,
|
||||
monitoring: Option<crate::monitoring::MonitoringHandle>,
|
||||
) -> Result<Self> {
|
||||
let pool_size_limit = config.max_parallel_instances;
|
||||
let task_per_instance_limit = config.max_tasks_per_instance;
|
||||
|
||||
// Determine actual pool size based on available resources
|
||||
let actual_pool_size = if let Some(ref pp) = proxy_pool {
|
||||
let available_proxies = pp.num_proxies();
|
||||
@@ -79,7 +84,7 @@ impl ChromeDriverPool {
|
||||
}
|
||||
|
||||
// Rotation is enabled when task limiting is active
|
||||
let rotation_enabled = max_tasks_per_instance > 0;
|
||||
let rotation_enabled = task_per_instance_limit > 0;
|
||||
|
||||
let mut instances = Vec::with_capacity(actual_pool_size);
|
||||
|
||||
@@ -102,13 +107,61 @@ impl ChromeDriverPool {
|
||||
let instance = ChromeInstance::new(
|
||||
proxy_pool.clone(), // Clone the Arc
|
||||
i, // This instance's proxy index
|
||||
max_tasks_per_instance
|
||||
config,
|
||||
monitoring.clone(),
|
||||
).await?;
|
||||
|
||||
crate::util::logger::log_info(&format!(" Instance {} ready", i + 1)).await;
|
||||
instances.push(Arc::new(Mutex::new(instance)));
|
||||
}
|
||||
|
||||
// Emit instance created events
|
||||
for (i, instance) in instances.iter().enumerate() {
|
||||
if let Some(ref mon) = monitoring {
|
||||
let guard = instance.lock().await;
|
||||
|
||||
// Extract proxy info if available
|
||||
let proxy_info = if let Some(ref pp) = proxy_pool {
|
||||
pp.get_proxy_info(i % pp.num_proxies())
|
||||
} else {
|
||||
guard.proxy_url.as_ref().and_then(|url| {
|
||||
// Parse proxy URL manually if no pool
|
||||
// Format: socks5://localhost:10801
|
||||
if let Some(port_str) = url.split(':').last() {
|
||||
if let Ok(port) = port_str.parse::<u16>() {
|
||||
return Some(crate::monitoring::ProxyInfo {
|
||||
container_name: format!("proxy-{}", i),
|
||||
ip_address: "127.0.0.1".to_string(),
|
||||
port,
|
||||
status: crate::monitoring::ProxyStatus::Connected,
|
||||
});
|
||||
}
|
||||
}
|
||||
None
|
||||
})
|
||||
};
|
||||
|
||||
mon.emit(crate::monitoring::MonitoringEvent::InstanceCreated {
|
||||
instance_id: i,
|
||||
max_tasks: guard.max_tasks_per_instance,
|
||||
proxy: proxy_info.clone(), // ✅ Now includes actual proxy info
|
||||
});
|
||||
|
||||
// Also emit ProxyConnected event if proxy exists
|
||||
if let Some(ref proxy) = proxy_info {
|
||||
mon.emit(crate::monitoring::MonitoringEvent::ProxyConnected {
|
||||
container_name: proxy.container_name.clone(),
|
||||
ip_address: proxy.ip_address.clone(),
|
||||
port: proxy.port,
|
||||
});
|
||||
}
|
||||
|
||||
drop(guard);
|
||||
}
|
||||
}
|
||||
|
||||
let min_request_interval_ms = config.min_request_interval_ms;
|
||||
|
||||
Ok(Self {
|
||||
instances,
|
||||
semaphore: Arc::new(Semaphore::new(actual_pool_size)),
|
||||
@@ -116,7 +169,8 @@ impl ChromeDriverPool {
|
||||
rotation_enabled,
|
||||
next_instance: Arc::new(Mutex::new(0)),
|
||||
last_request_time: Arc::new(Mutex::new(Instant::now())),
|
||||
min_request_interval_ms: 300,
|
||||
min_request_interval_ms,
|
||||
monitoring,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -145,13 +199,25 @@ impl ChromeDriverPool {
|
||||
}
|
||||
}
|
||||
|
||||
let random_index = random_range(0, self.instances.len() as u64) as usize;
|
||||
// Index-Auswahl (vereinfacht, siehe unten für vollständige Rotation)
|
||||
let index = if self.rotation_enabled {
|
||||
self.get_rotated_index().await?
|
||||
} else {
|
||||
rand::rng().random_range(0..self.instances.len())
|
||||
random_index
|
||||
};
|
||||
|
||||
if let Some(ref mon) = self.monitoring {
|
||||
mon.emit(crate::monitoring::MonitoringEvent::TaskStarted {
|
||||
instance_id: index,
|
||||
url: url.clone(),
|
||||
});
|
||||
mon.emit(crate::monitoring::MonitoringEvent::InstanceStatusChanged {
|
||||
instance_id: index,
|
||||
status: crate::monitoring::InstanceStatusChange::Active,
|
||||
});
|
||||
}
|
||||
|
||||
let instance = &self.instances[index];
|
||||
let mut guard = instance.lock().await;
|
||||
|
||||
@@ -168,6 +234,8 @@ impl ChromeDriverPool {
|
||||
|
||||
drop(guard); // Lock freigeben vor Navigation
|
||||
|
||||
let start_time = Instant::now();
|
||||
|
||||
// Navigation mit Timeout
|
||||
let navigation_result = timeout(
|
||||
Duration::from_secs(60),
|
||||
@@ -176,8 +244,20 @@ impl ChromeDriverPool {
|
||||
|
||||
match navigation_result {
|
||||
Ok(Ok(_)) => {
|
||||
if let Some(ref mon) = self.monitoring {
|
||||
mon.emit(crate::monitoring::MonitoringEvent::TaskCompleted {
|
||||
instance_id: index,
|
||||
success: navigation_result.is_ok(),
|
||||
duration_ms: start_time.elapsed().as_millis() as u64,
|
||||
error: navigation_result.as_ref().err().map(|e| e.to_string()),
|
||||
});
|
||||
mon.emit(crate::monitoring::MonitoringEvent::InstanceStatusChanged {
|
||||
instance_id: index,
|
||||
status: crate::monitoring::InstanceStatusChange::Idle,
|
||||
});
|
||||
}
|
||||
crate::util::logger::log_info(&format!("✓ Navigated to {}", url)).await;
|
||||
|
||||
|
||||
// Parse-Funktion ausführen
|
||||
parse(client).await
|
||||
}
|
||||
@@ -186,6 +266,13 @@ impl ChromeDriverPool {
|
||||
Err(anyhow!("Navigation failed: {}", e))
|
||||
}
|
||||
Err(_) => {
|
||||
if let Some(ref mon) = self.monitoring {
|
||||
mon.emit(crate::monitoring::MonitoringEvent::NavigationTimeout {
|
||||
instance_id: index,
|
||||
url: url.clone(),
|
||||
});
|
||||
}
|
||||
|
||||
crate::util::logger::log_error("Navigation timeout (60s)").await;
|
||||
Err(anyhow!("Navigation timeout"))
|
||||
}
|
||||
@@ -285,18 +372,21 @@ pub struct ChromeInstance {
|
||||
max_requests_per_session: usize, // z.B. 25
|
||||
|
||||
proxy_pool: Option<Arc<DockerVpnProxyPool>>, // Reference to the proxy pool
|
||||
current_proxy_index: Arc<Mutex<usize>>, // Current proxy index in use
|
||||
current_proxy_index: Arc<Mutex<usize>>, // Current proxy index in use
|
||||
|
||||
instance_id: usize,
|
||||
monitoring: Option<crate::monitoring::MonitoringHandle>,
|
||||
}
|
||||
|
||||
impl ChromeInstance {
|
||||
pub async fn new(
|
||||
proxy_pool: Option<Arc<DockerVpnProxyPool>>,
|
||||
initial_proxy_index: usize,
|
||||
max_tasks_per_instance: usize) -> Result<Self> {
|
||||
pub async fn new(proxy_pool: Option<Arc<DockerVpnProxyPool>>, instance_id: usize, config: &Config, monitoring: Option<crate::monitoring::MonitoringHandle>) -> Result<Self> {
|
||||
let (base_url, process, stderr_handle) = Self::spawn_chromedriver().await?;
|
||||
|
||||
// Get proxy URL if proxy pool is provided
|
||||
let proxy_url = proxy_pool.as_ref().map(|pp| pp.get_proxy_url(initial_proxy_index));
|
||||
let proxy_url = proxy_pool.as_ref().map(|pp| pp.get_proxy_url(instance_id));
|
||||
|
||||
let max_tasks_per_instance = config.max_tasks_per_instance;
|
||||
let max_requests_per_session = config.max_requests_per_session;
|
||||
|
||||
Ok(Self {
|
||||
base_url,
|
||||
@@ -308,16 +398,21 @@ impl ChromeInstance {
|
||||
|
||||
current_session: Arc::new(Mutex::new(None)),
|
||||
session_request_count: Arc::new(Mutex::new(0)),
|
||||
max_requests_per_session: 25, // Konfigurierbar machen!
|
||||
max_requests_per_session,
|
||||
|
||||
proxy_pool,
|
||||
current_proxy_index: Arc::new(Mutex::new(initial_proxy_index)),
|
||||
current_proxy_index: Arc::new(Mutex::new(instance_id)),
|
||||
|
||||
instance_id,
|
||||
monitoring,
|
||||
})
|
||||
}
|
||||
|
||||
pub async fn get_or_renew_session(&self) -> Result<Client> {
|
||||
let mut session_opt = self.current_session.lock().await;
|
||||
let mut request_count = self.session_request_count.lock().await;
|
||||
|
||||
let old_request_count = *request_count;
|
||||
|
||||
// Session erneuern wenn:
|
||||
// 1. Keine Session vorhanden
|
||||
@@ -325,12 +420,20 @@ impl ChromeInstance {
|
||||
let needs_renewal = session_opt.is_none() || *request_count >= self.max_requests_per_session;
|
||||
|
||||
if needs_renewal {
|
||||
if let Some(ref mon) = self.monitoring {
|
||||
mon.emit(crate::monitoring::MonitoringEvent::InstanceStatusChanged {
|
||||
instance_id: self.instance_id,
|
||||
status: crate::monitoring::InstanceStatusChange::Renewing,
|
||||
});
|
||||
}
|
||||
|
||||
// Alte Session schließen
|
||||
if let Some(old_session) = session_opt.take() {
|
||||
crate::util::logger::log_info("Closing old session").await;
|
||||
let _ = old_session.close().await;
|
||||
// Kurze Pause zwischen Sessions
|
||||
sleep(Duration::from_millis(rand::rng().random_range(500..1000))).await;
|
||||
let random_delay = random_range(500, 1000);
|
||||
sleep(Duration::from_millis(random_delay)).await;
|
||||
}
|
||||
|
||||
// Neue Session mit frischem User-Agent erstellen
|
||||
@@ -342,6 +445,41 @@ impl ChromeInstance {
|
||||
let new_session = self.create_fresh_session().await?;
|
||||
*session_opt = Some(new_session.clone());
|
||||
*request_count = 0;
|
||||
|
||||
if let Some(ref mon) = self.monitoring {
|
||||
let reason = if *request_count >= self.max_requests_per_session {
|
||||
crate::monitoring::RenewalReason::RequestLimit
|
||||
} else {
|
||||
crate::monitoring::RenewalReason::TaskLimit
|
||||
};
|
||||
|
||||
// Get updated proxy info
|
||||
let new_proxy_info = if let Some(ref pp) = self.proxy_pool {
|
||||
let proxy_idx = *self.current_proxy_index.lock().await;
|
||||
pp.get_proxy_info(proxy_idx)
|
||||
} else {
|
||||
self.proxy_url.as_ref().and_then(|url| {
|
||||
if let Some(port_str) = url.split(':').last() {
|
||||
if let Ok(port) = port_str.parse::<u16>() {
|
||||
return Some(crate::monitoring::ProxyInfo {
|
||||
container_name: format!("proxy-{}", self.instance_id),
|
||||
ip_address: "127.0.0.1".to_string(),
|
||||
port,
|
||||
status: crate::monitoring::ProxyStatus::Connected,
|
||||
});
|
||||
}
|
||||
}
|
||||
None
|
||||
})
|
||||
};
|
||||
|
||||
mon.emit(crate::monitoring::MonitoringEvent::SessionRenewed {
|
||||
instance_id: self.instance_id,
|
||||
old_request_count: *request_count,
|
||||
reason: crate::monitoring::RenewalReason::RequestLimit,
|
||||
new_proxy: new_proxy_info,
|
||||
});
|
||||
}
|
||||
|
||||
Ok(new_session)
|
||||
} else {
|
||||
@@ -412,11 +550,6 @@ impl ChromeInstance {
|
||||
caps.as_object().cloned().unwrap()
|
||||
}
|
||||
|
||||
pub async fn new_session(&self) -> Result<Client> {
|
||||
// Für Backward-Compatibility, aber sollte get_or_renew_session() nutzen!
|
||||
self.create_fresh_session().await
|
||||
}
|
||||
|
||||
pub fn reset_task_count(&mut self) {
|
||||
self.task_count = 0;
|
||||
}
|
||||
@@ -491,52 +624,15 @@ impl ChromeInstance {
|
||||
Err(anyhow!("ChromeDriver failed to start within 30s"))
|
||||
}
|
||||
|
||||
fn chrome_args(&self) -> Map<String, Value> {
|
||||
let user_agent = Self::chrome_user_agent();
|
||||
let mut args = vec![
|
||||
"--headless=new".to_string(),
|
||||
"--disable-gpu".to_string(),
|
||||
"--no-sandbox".to_string(),
|
||||
"--disable-dev-shm-usage".to_string(),
|
||||
"--disable-infobars".to_string(),
|
||||
"--disable-extensions".to_string(),
|
||||
"--disable-popup-blocking".to_string(),
|
||||
"--disable-notifications".to_string(),
|
||||
//"--disable-logging".to_string(),
|
||||
"--disable-autofill".to_string(),
|
||||
"--disable-sync".to_string(),
|
||||
"--disable-default-apps".to_string(),
|
||||
"--disable-translate".to_string(),
|
||||
//"--window-size=1920,1080".to_string(),
|
||||
"--disable-blink-features=AutomationControlled".to_string(),
|
||||
format!("--user-agent={}", user_agent),
|
||||
];
|
||||
if let Some(ref proxy) = self.proxy_url {
|
||||
let proxy = proxy.clone();
|
||||
let proxy_formatted = format!("--proxy-server={}", proxy);
|
||||
args.push(proxy_formatted);
|
||||
}
|
||||
let caps = serde_json::json!({
|
||||
"goog:chromeOptions": {
|
||||
"args": args,
|
||||
"excludeSwitches": ["enable-logging", "enable-automation"],
|
||||
"prefs": {
|
||||
"profile.default_content_setting_values.notifications": 2
|
||||
}
|
||||
}
|
||||
});
|
||||
caps.as_object().cloned().unwrap()
|
||||
}
|
||||
|
||||
pub fn chrome_user_agent() -> &'static str {
|
||||
static UAS: &[&str] = &[
|
||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.91 Safari/537.36",
|
||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.6312.122 Safari/537.36",
|
||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.6261.129 Safari/537.36",
|
||||
"Mozilla/5.0 (Windows NT 11.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.91 Safari/537.36",
|
||||
];
|
||||
|
||||
let mut rng = ThreadRng::default(); // non-deprecated RNG
|
||||
*UAS.choose(&mut rng).unwrap()
|
||||
let random_user_agent = choose_random(UAS);
|
||||
random_user_agent
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user