added update_rule for incremental change

This commit is contained in:
2025-12-15 23:47:28 +01:00
parent d744769138
commit d26e833d93
10 changed files with 566 additions and 241 deletions

View File

@@ -19,11 +19,15 @@ pub struct ChromeDriverPool {
semaphore: Arc<Semaphore>,
/// Optional Docker-based proxy pool (one proxy per Chrome instance)
proxy_pool: Option<Arc<DockerVpnProxyPool>>,
/// Whether rotation is enabled (uses half of instances at a time)
rotation_enabled: bool,
/// Index for round-robin instance selection (when rotation is enabled)
next_instance: Arc<Mutex<usize>>,
}
impl ChromeDriverPool {
/// Creates a new pool without any proxy (direct connection).
pub async fn new(pool_size: usize) -> Result<Self> {
pub async fn _new(pool_size: usize) -> Result<Self> {
Self::new_with_proxy_and_task_limit(pool_size, None, 0).await
}
@@ -40,22 +44,53 @@ impl ChromeDriverPool {
Self::new_with_proxy_and_task_limit(pool_size, proxy_pool, 0).await
}
/// Full constructor: supports proxy + task limiting.
/// Full constructor: supports proxy + task limiting + rotation.
///
/// When rotation is enabled, only half of the instances are used at once,
/// rotating to the other half when task limits are reached.
///
/// The actual pool_size is constrained by:
/// - max_parallel_instances from config (pool_size_limit parameter)
/// - Available proxies from proxy_pool (if provided)
///
/// Uses the minimum of these constraints to determine actual pool size.
pub async fn new_with_proxy_and_task_limit(
pool_size: usize,
pool_size_limit: usize,
proxy_pool: Option<Arc<DockerVpnProxyPool>>,
max_tasks_per_instance: usize,
) -> Result<Self> {
let mut instances = Vec::with_capacity(pool_size);
// Determine actual pool size based on available resources
let actual_pool_size = if let Some(ref pp) = proxy_pool {
let available_proxies = pp.num_proxies();
pool_size_limit.min(available_proxies)
} else {
pool_size_limit
};
if actual_pool_size == 0 {
return Err(anyhow!("Pool size must be at least 1"));
}
// Rotation is enabled when task limiting is active
let rotation_enabled = max_tasks_per_instance > 0;
let mut instances = Vec::with_capacity(actual_pool_size);
crate::util::logger::log_info(&format!(
"Initializing ChromeDriver pool with {} instances{}...",
pool_size,
if proxy_pool.is_some() { " (each using a unique Docker SOCKS5 proxy)" } else { "" }
"Initializing ChromeDriver pool with {} instances{}{}...",
actual_pool_size,
if proxy_pool.is_some() { " (each using a unique Docker SOCKS5 proxy)" } else { "" },
if rotation_enabled { " with rotation enabled" } else { "" }
))
.await;
for i in 0..pool_size {
if rotation_enabled && actual_pool_size < 2 {
crate::util::logger::log_warn(
"Rotation enabled but pool has < 2 instances - rotation will be limited"
).await;
}
for i in 0..actual_pool_size {
let proxy_url = proxy_pool
.as_ref()
.map(|pp| pp.get_proxy_url(i));
@@ -68,12 +103,22 @@ impl ChromeDriverPool {
Ok(Self {
instances,
semaphore: Arc::new(Semaphore::new(pool_size)),
semaphore: Arc::new(Semaphore::new(actual_pool_size)),
proxy_pool,
rotation_enabled,
next_instance: Arc::new(Mutex::new(0)),
})
}
/// Execute a scraping task using an available instance from the pool.
///
/// When rotation is enabled:
/// - Uses only half of the instances at a time
/// - Rotates to the other half when an instance reaches its task limit
/// - Cycles through instances in round-robin fashion within the active half
///
/// When rotation is disabled:
/// - Uses all instances with random selection
pub async fn execute<T, F, Fut>(&self, url: String, parse: F) -> Result<T>
where
T: Send + 'static,
@@ -82,8 +127,81 @@ impl ChromeDriverPool {
{
let _permit = self.semaphore.acquire().await.map_err(|_| anyhow!("Pool closed"))?;
// Round-robin selection
let index = rand::random_range(..self.instances.len());
// Select the instance that will run this task.
//
// Rotation mode: the shared cursor (`next_instance`) is kept as an ABSOLUTE
// index into `self.instances`. The half that contains the cursor is the
// active half: `[0, half_size)` or `[half_size, total)`. Storing the cursor
// relative to the half that was last used (while always scanning the first
// half) would make later calls fall straight back to the first half, so the
// rotation would never persist.
let index = if self.rotation_enabled {
    let total_instances = self.instances.len();
    // Round up so the first half is the larger one for odd pool sizes.
    let half_size = (total_instances + 1) / 2;

    // NOTE(review): the cursor lock is held while each candidate's lock is
    // awaited below. If an instance lock is held for the duration of a task,
    // selection will serialize behind that task - confirm this is acceptable.
    let mut next_idx = self.next_instance.lock().await;
    let cursor = *next_idx % total_instances;

    // Each half is described as (start index, length).
    let first_half = (0, half_size);
    let second_half = (half_size, total_instances - half_size);
    let (active, other) = if cursor < half_size {
        (first_half, second_half)
    } else {
        (second_half, first_half)
    };

    let mut selection = None;
    'halves: for (pass, &(start, len)) in [active, other].iter().enumerate() {
        if len == 0 {
            // Pool of one instance: there is no other half to rotate to.
            continue;
        }
        if pass == 1 {
            // Every instance in the active half is at its task limit.
            crate::util::logger::log_info(
                "Current half saturated, rotating to other half of instances"
            ).await;
        }

        // Resume round-robin at the cursor inside the active half; start the
        // other half from its first instance.
        let first_offset = if pass == 0 { cursor - start } else { 0 };
        for step in 0..len {
            let candidate_idx = start + (first_offset + step) % len;
            let guard = self.instances[candidate_idx].lock().await;
            // A limit of 0 means the instance has no task limit.
            let has_capacity = guard.max_tasks_per_instance == 0
                || guard.task_count < guard.max_tasks_per_instance;
            drop(guard);

            if has_capacity {
                // Advance the cursor within the SAME half, so the next call
                // keeps using the half this instance belongs to.
                *next_idx = start + (candidate_idx - start + 1) % len;
                selection = Some(candidate_idx);
                break 'halves;
            }
        }
    }

    let selected_idx = match selection {
        Some(idx) => idx,
        None => {
            // All instances saturated - use round-robin over the whole pool anyway.
            *next_idx = (cursor + 1) % total_instances;
            cursor
        }
    };

    drop(next_idx);
    selected_idx
} else {
    // Non-rotation mode: uniformly random selection across all instances.
    rand::random_range(..self.instances.len())
};
let instance = self.instances[index].clone();
let mut guard = instance.lock().await;
@@ -91,7 +209,8 @@ impl ChromeDriverPool {
if guard.max_tasks_per_instance > 0 {
crate::util::logger::log_info(&format!(
"Instance task count: {}/{}",
"Instance {} task count: {}/{}",
index,
guard.get_task_count(),
guard.max_tasks_per_instance
))
@@ -130,6 +249,20 @@ impl ChromeDriverPool {
pub fn get_number_of_instances(&self) -> usize {
    // Counts every instance held by the pool, independent of rotation mode.
    let instances = &self.instances;
    instances.len()
}
/// Reports whether this pool was built with rotation turned on.
///
/// Simply exposes the `rotation_enabled` flag stored on the pool.
pub fn is_rotation_enabled(&self) -> bool {
    let enabled = self.rotation_enabled;
    enabled
}
/// Returns how many instances make up one rotation half.
///
/// With rotation enabled this is the instance count divided by two, rounded
/// up (so 5 instances yield 3). With rotation disabled the whole pool counts
/// as a single group and the full instance count is returned.
pub fn get_rotation_half_size(&self) -> usize {
    let total = self.instances.len();
    if !self.rotation_enabled {
        return total;
    }
    (total + 1) / 2
}
}
/// Represents a single instance of chromedriver process, optionally bound to a VPN.