added pool rotation to chromedriver pool
This commit is contained in:
@@ -2,13 +2,14 @@
|
||||
|
||||
use anyhow::{anyhow, Context, Result};
|
||||
use fantoccini::{Client, ClientBuilder};
|
||||
use rand::seq::{IndexedRandom, SliceRandom};
|
||||
use rand::seq::{IndexedRandom};
|
||||
use rand::rngs::ThreadRng;
|
||||
use rand::Rng; // for the RNG trait
|
||||
use serde_json::{Map, Value};
|
||||
use std::pin::Pin;
|
||||
use std::process::Stdio;
|
||||
use std::sync::Arc;
|
||||
use std::time::Instant;
|
||||
use tokio::io::{AsyncBufReadExt, BufReader};
|
||||
use tokio::process::{Child, Command};
|
||||
use tokio::task::JoinHandle;
|
||||
@@ -26,6 +27,9 @@ pub struct ChromeDriverPool {
|
||||
rotation_enabled: bool,
|
||||
/// Index for round-robin instance selection (when rotation is enabled)
|
||||
next_instance: Arc<Mutex<usize>>,
|
||||
|
||||
last_request_time: Arc<Mutex<Instant>>,
|
||||
min_request_interval_ms: u64,
|
||||
}
|
||||
|
||||
impl ChromeDriverPool {
|
||||
@@ -94,11 +98,12 @@ impl ChromeDriverPool {
|
||||
}
|
||||
|
||||
for i in 0..actual_pool_size {
|
||||
let proxy_url = proxy_pool
|
||||
.as_ref()
|
||||
.map(|pp| pp.get_proxy_url(i));
|
||||
|
||||
let instance = ChromeInstance::new(proxy_url, max_tasks_per_instance).await?;
|
||||
// Pass the entire proxy_pool and the index
|
||||
let instance = ChromeInstance::new(
|
||||
proxy_pool.clone(), // Clone the Arc
|
||||
i, // This instance's proxy index
|
||||
max_tasks_per_instance
|
||||
).await?;
|
||||
|
||||
crate::util::logger::log_info(&format!(" Instance {} ready", i + 1)).await;
|
||||
instances.push(Arc::new(Mutex::new(instance)));
|
||||
@@ -110,18 +115,11 @@ impl ChromeDriverPool {
|
||||
proxy_pool,
|
||||
rotation_enabled,
|
||||
next_instance: Arc::new(Mutex::new(0)),
|
||||
last_request_time: Arc::new(Mutex::new(Instant::now())),
|
||||
min_request_interval_ms: 300,
|
||||
})
|
||||
}
|
||||
|
||||
/// Execute a scraping task using an available instance from the pool.
|
||||
///
|
||||
/// When rotation is enabled:
|
||||
/// - Uses only half of the instances at a time
|
||||
/// - Rotates to the other half when an instance reaches its task limit
|
||||
/// - Cycles through instances in round-robin fashion within the active half
|
||||
///
|
||||
/// When rotation is disabled:
|
||||
/// - Uses all instances with random selection
|
||||
pub async fn execute<T, F, Fut>(&self, url: String, parse: F) -> Result<T>
|
||||
where
|
||||
T: Send + 'static,
|
||||
@@ -130,108 +128,113 @@ impl ChromeDriverPool {
|
||||
{
|
||||
let _permit = self.semaphore.acquire().await.map_err(|_| anyhow!("Pool closed"))?;
|
||||
|
||||
let index = if self.rotation_enabled {
|
||||
// Rotation mode: use only half of instances at a time
|
||||
let total_instances = self.instances.len();
|
||||
let half_size = (total_instances + 1) / 2; // Round up for odd numbers
|
||||
{
|
||||
let mut last_time = self.last_request_time.lock().await;
|
||||
let elapsed = last_time.elapsed().as_millis() as u64;
|
||||
|
||||
let mut next_idx = self.next_instance.lock().await;
|
||||
let base_idx = *next_idx;
|
||||
let mut selected_idx = base_idx;
|
||||
let mut found_in_current_half = false;
|
||||
|
||||
// Try to find an available instance in the current half
|
||||
for offset in 0..half_size {
|
||||
let candidate_idx = (base_idx + offset) % half_size;
|
||||
if elapsed < self.min_request_interval_ms {
|
||||
let wait_ms = self.min_request_interval_ms - elapsed;
|
||||
drop(last_time); // Lock vor Sleep freigeben!
|
||||
|
||||
// Check if this instance has reached its task limit
|
||||
let instance = &self.instances[candidate_idx];
|
||||
let guard = instance.lock().await;
|
||||
sleep(Duration::from_millis(wait_ms)).await;
|
||||
|
||||
if guard.max_tasks_per_instance == 0 ||
|
||||
guard.task_count < guard.max_tasks_per_instance {
|
||||
// This instance is available
|
||||
*next_idx = (candidate_idx + 1) % half_size;
|
||||
selected_idx = candidate_idx;
|
||||
found_in_current_half = true;
|
||||
drop(guard);
|
||||
break;
|
||||
} else {
|
||||
drop(guard);
|
||||
}
|
||||
let mut last_time = self.last_request_time.lock().await;
|
||||
*last_time = Instant::now();
|
||||
} else {
|
||||
*last_time = Instant::now();
|
||||
}
|
||||
|
||||
if !found_in_current_half {
|
||||
// All instances in current half are at limit, switch to other half
|
||||
crate::util::logger::log_info(
|
||||
"Current half saturated, rotating to other half of instances"
|
||||
).await;
|
||||
|
||||
let other_half_start = half_size;
|
||||
let other_half_size = total_instances - half_size;
|
||||
|
||||
// Find available instance in other half
|
||||
let mut found_in_other_half = false;
|
||||
for offset in 0..other_half_size {
|
||||
let candidate_idx = other_half_start + offset;
|
||||
|
||||
let instance = &self.instances[candidate_idx];
|
||||
let guard = instance.lock().await;
|
||||
|
||||
if guard.max_tasks_per_instance == 0 ||
|
||||
guard.task_count < guard.max_tasks_per_instance {
|
||||
// Switch to this half for future requests
|
||||
*next_idx = offset;
|
||||
selected_idx = candidate_idx;
|
||||
found_in_other_half = true;
|
||||
drop(guard);
|
||||
break;
|
||||
} else {
|
||||
drop(guard);
|
||||
}
|
||||
}
|
||||
|
||||
if !found_in_other_half {
|
||||
// All instances saturated - use round-robin anyway
|
||||
selected_idx = *next_idx % total_instances;
|
||||
*next_idx = (*next_idx + 1) % total_instances;
|
||||
}
|
||||
}
|
||||
|
||||
drop(next_idx);
|
||||
selected_idx
|
||||
} else {
|
||||
// Non-rotation mode: random selection as before
|
||||
rand::random_range(..self.instances.len())
|
||||
};
|
||||
|
||||
let instance = self.instances[index].clone();
|
||||
let mut guard = instance.lock().await;
|
||||
|
||||
guard.increment_task_count();
|
||||
|
||||
if guard.max_tasks_per_instance > 0 {
|
||||
crate::util::logger::log_info(&format!(
|
||||
"Instance {} task count: {}/{}",
|
||||
index,
|
||||
guard.get_task_count(),
|
||||
guard.max_tasks_per_instance
|
||||
))
|
||||
.await;
|
||||
}
|
||||
|
||||
let client = guard.new_session().await?;
|
||||
// Index-Auswahl (vereinfacht, siehe unten für vollständige Rotation)
|
||||
let index = if self.rotation_enabled {
|
||||
self.get_rotated_index().await?
|
||||
} else {
|
||||
rand::rng().random_range(0..self.instances.len())
|
||||
};
|
||||
|
||||
drop(guard); // release lock early
|
||||
let instance = &self.instances[index];
|
||||
let mut guard = instance.lock().await;
|
||||
|
||||
// NEU: Session mit automatischer Erneuerung holen!
|
||||
let client = guard.get_or_renew_session().await?;
|
||||
|
||||
guard.increment_task_count();
|
||||
let (task_count, session_requests) = guard.get_session_stats().await;
|
||||
|
||||
crate::util::logger::log_info(&format!(
|
||||
"Instance {} executing task (tasks: {}/{}, session requests: {})",
|
||||
index, task_count, guard.max_tasks_per_instance, session_requests
|
||||
)).await;
|
||||
|
||||
drop(guard); // Lock freigeben vor Navigation
|
||||
|
||||
crate::util::logger::log_info(&format!("Scraping {} ...", url)).await;
|
||||
client.goto(&url).await.context("Navigation failed")?;
|
||||
// Navigation mit Timeout
|
||||
let navigation_result = timeout(
|
||||
Duration::from_secs(60),
|
||||
client.goto(&url)
|
||||
).await;
|
||||
|
||||
match navigation_result {
|
||||
Ok(Ok(_)) => {
|
||||
crate::util::logger::log_info(&format!("✓ Navigated to {}", url)).await;
|
||||
|
||||
// Parse-Funktion ausführen
|
||||
parse(client).await
|
||||
}
|
||||
Ok(Err(e)) => {
|
||||
crate::util::logger::log_error(&format!("Navigation failed: {}", e)).await;
|
||||
Err(anyhow!("Navigation failed: {}", e))
|
||||
}
|
||||
Err(_) => {
|
||||
crate::util::logger::log_error("Navigation timeout (60s)").await;
|
||||
Err(anyhow!("Navigation timeout"))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let result = timeout(Duration::from_secs(90), parse(client))
|
||||
.await
|
||||
.context("Parse timeout")??;
|
||||
|
||||
Ok(result)
|
||||
async fn get_rotated_index(&self) -> Result<usize> {
|
||||
let total = self.instances.len();
|
||||
let half_size = total / 2;
|
||||
|
||||
if half_size == 0 {
|
||||
return Ok(0); // Pool zu klein für Rotation
|
||||
}
|
||||
|
||||
let mut next_idx = self.next_instance.lock().await;
|
||||
let current_half_start = if *next_idx < half_size { 0 } else { half_size };
|
||||
let current_half_end = if *next_idx < half_size { half_size } else { total };
|
||||
|
||||
// Suche verfügbare Instanz in aktueller Hälfte
|
||||
for offset in 0..(current_half_end - current_half_start) {
|
||||
let candidate_idx = current_half_start + ((*next_idx + offset) % half_size);
|
||||
|
||||
let instance = &self.instances[candidate_idx];
|
||||
let guard = instance.lock().await;
|
||||
|
||||
if guard.max_tasks_per_instance == 0 ||
|
||||
guard.task_count < guard.max_tasks_per_instance {
|
||||
*next_idx = (candidate_idx + 1) % total;
|
||||
drop(guard);
|
||||
return Ok(candidate_idx);
|
||||
}
|
||||
}
|
||||
|
||||
// Aktuelle Hälfte voll → Zur anderen wechseln
|
||||
crate::util::logger::log_info("Current half saturated, rotating to other half").await;
|
||||
|
||||
let new_half_start = if current_half_start == 0 { half_size } else { 0 };
|
||||
let new_half_end = if current_half_start == 0 { total } else { half_size };
|
||||
|
||||
// Alte Hälfte zurücksetzen (für nächste Rotation)
|
||||
for i in current_half_start..current_half_end {
|
||||
let mut instance = self.instances[i].lock().await;
|
||||
instance.reset_task_count();
|
||||
}
|
||||
|
||||
*next_idx = new_half_start;
|
||||
drop(next_idx);
|
||||
|
||||
Ok(new_half_start)
|
||||
}
|
||||
|
||||
/// Gracefully shut down all ChromeDriver processes and Docker proxy containers.
|
||||
@@ -277,16 +280,24 @@ pub struct ChromeInstance {
|
||||
max_tasks_per_instance: usize,
|
||||
proxy_url: Option<String>,
|
||||
|
||||
// NEU: Session-Management
|
||||
current_session: Arc<Mutex<Option<Client>>>,
|
||||
current_session: Arc<Mutex<Option<Client>>>, // Current active session
|
||||
session_request_count: Arc<Mutex<usize>>,
|
||||
max_requests_per_session: usize, // z.B. 25
|
||||
|
||||
proxy_pool: Option<Arc<DockerVpnProxyPool>>, // Referernce to the proxy pool
|
||||
current_proxy_index: Arc<Mutex<usize>>, // Current proxy index in use
|
||||
}
|
||||
|
||||
impl ChromeInstance {
|
||||
pub async fn new(proxy_url: Option<String>, max_tasks_per_instance: usize) -> Result<Self> {
|
||||
pub async fn new(
|
||||
proxy_pool: Option<Arc<DockerVpnProxyPool>>,
|
||||
initial_proxy_index: usize,
|
||||
max_tasks_per_instance: usize) -> Result<Self> {
|
||||
let (base_url, process, stderr_handle) = Self::spawn_chromedriver().await?;
|
||||
|
||||
// Get proxy URL if proxy pool is provided
|
||||
let proxy_url = proxy_pool.as_ref().map(|pp| pp.get_proxy_url(initial_proxy_index));
|
||||
|
||||
Ok(Self {
|
||||
base_url,
|
||||
process,
|
||||
@@ -294,10 +305,13 @@ impl ChromeInstance {
|
||||
task_count: 0,
|
||||
max_tasks_per_instance,
|
||||
proxy_url,
|
||||
// NEU
|
||||
|
||||
current_session: Arc::new(Mutex::new(None)),
|
||||
session_request_count: Arc::new(Mutex::new(0)),
|
||||
max_requests_per_session: 25, // Konfigurierbar machen!
|
||||
|
||||
proxy_pool,
|
||||
current_proxy_index: Arc::new(Mutex::new(initial_proxy_index)),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -338,9 +352,24 @@ impl ChromeInstance {
|
||||
}
|
||||
|
||||
async fn create_fresh_session(&self) -> Result<Client> {
|
||||
// WICHTIG: User-Agent hier wählen, nicht in chrome_args()!
|
||||
// Hole aktuellen Proxy-URL ohne self zu mutieren
|
||||
let proxy_url = if let Some(ref pool) = self.proxy_pool {
|
||||
let mut proxy_idx = self.current_proxy_index.lock().await;
|
||||
*proxy_idx = (*proxy_idx + 1) % pool.num_proxies();
|
||||
let url = pool.get_proxy_url(*proxy_idx);
|
||||
|
||||
crate::util::logger::log_info(&format!(
|
||||
"Using proxy {} for new session",
|
||||
*proxy_idx
|
||||
)).await;
|
||||
|
||||
Some(url)
|
||||
} else {
|
||||
self.proxy_url.clone()
|
||||
};
|
||||
|
||||
let user_agent = Self::chrome_user_agent();
|
||||
let capabilities = self.chrome_args_with_ua(user_agent);
|
||||
let capabilities = self.chrome_args_with_ua(user_agent, &proxy_url);
|
||||
|
||||
ClientBuilder::native()
|
||||
.capabilities(capabilities)
|
||||
@@ -349,7 +378,7 @@ impl ChromeInstance {
|
||||
.context("Failed to connect to ChromeDriver")
|
||||
}
|
||||
|
||||
fn chrome_args_with_ua(&self, user_agent: &str) -> Map<String, Value> {
|
||||
fn chrome_args_with_ua(&self, user_agent: &str, proxy_url: &Option<String>) -> Map<String, Value> {
|
||||
let mut args = vec![
|
||||
"--headless=new".to_string(),
|
||||
"--disable-gpu".to_string(),
|
||||
@@ -364,11 +393,10 @@ impl ChromeInstance {
|
||||
"--disable-default-apps".to_string(),
|
||||
"--disable-translate".to_string(),
|
||||
"--disable-blink-features=AutomationControlled".to_string(),
|
||||
// User-Agent als Parameter!
|
||||
format!("--user-agent={}", user_agent),
|
||||
];
|
||||
|
||||
if let Some(ref proxy) = self.proxy_url {
|
||||
if let Some(proxy) = proxy_url {
|
||||
args.push(format!("--proxy-server={}", proxy));
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user