added update_rule for incremental change
This commit is contained in:
@@ -19,11 +19,15 @@ pub struct ChromeDriverPool {
|
||||
semaphore: Arc<Semaphore>,
|
||||
/// Optional Docker-based proxy pool (one proxy per Chrome instance)
|
||||
proxy_pool: Option<Arc<DockerVpnProxyPool>>,
|
||||
/// Whether rotation is enabled (uses half of instances at a time)
|
||||
rotation_enabled: bool,
|
||||
/// Index for round-robin instance selection (when rotation is enabled)
|
||||
next_instance: Arc<Mutex<usize>>,
|
||||
}
|
||||
|
||||
impl ChromeDriverPool {
|
||||
/// Creates a new pool without any proxy (direct connection).
|
||||
pub async fn new(pool_size: usize) -> Result<Self> {
|
||||
pub async fn _new(pool_size: usize) -> Result<Self> {
|
||||
Self::new_with_proxy_and_task_limit(pool_size, None, 0).await
|
||||
}
|
||||
|
||||
@@ -40,22 +44,53 @@ impl ChromeDriverPool {
|
||||
Self::new_with_proxy_and_task_limit(pool_size, proxy_pool, 0).await
|
||||
}
|
||||
|
||||
/// Full constructor: supports proxy + task limiting.
|
||||
/// Full constructor: supports proxy + task limiting + rotation.
|
||||
///
|
||||
/// When rotation is enabled, only half of the instances are used at once,
|
||||
/// rotating to the other half when task limits are reached.
|
||||
///
|
||||
/// The actual pool_size is constrained by:
|
||||
/// - max_parallel_instances from config (pool_size_limit parameter)
|
||||
/// - Available proxies from proxy_pool (if provided)
|
||||
///
|
||||
/// Uses the minimum of these constraints to determine actual pool size.
|
||||
pub async fn new_with_proxy_and_task_limit(
|
||||
pool_size: usize,
|
||||
pool_size_limit: usize,
|
||||
proxy_pool: Option<Arc<DockerVpnProxyPool>>,
|
||||
max_tasks_per_instance: usize,
|
||||
) -> Result<Self> {
|
||||
let mut instances = Vec::with_capacity(pool_size);
|
||||
// Determine actual pool size based on available resources
|
||||
let actual_pool_size = if let Some(ref pp) = proxy_pool {
|
||||
let available_proxies = pp.num_proxies();
|
||||
pool_size_limit.min(available_proxies)
|
||||
} else {
|
||||
pool_size_limit
|
||||
};
|
||||
|
||||
if actual_pool_size == 0 {
|
||||
return Err(anyhow!("Pool size must be at least 1"));
|
||||
}
|
||||
|
||||
// Rotation is enabled when task limiting is active
|
||||
let rotation_enabled = max_tasks_per_instance > 0;
|
||||
|
||||
let mut instances = Vec::with_capacity(actual_pool_size);
|
||||
|
||||
crate::util::logger::log_info(&format!(
|
||||
"Initializing ChromeDriver pool with {} instances{}...",
|
||||
pool_size,
|
||||
if proxy_pool.is_some() { " (each using a unique Docker SOCKS5 proxy)" } else { "" }
|
||||
"Initializing ChromeDriver pool with {} instances{}{}...",
|
||||
actual_pool_size,
|
||||
if proxy_pool.is_some() { " (each using a unique Docker SOCKS5 proxy)" } else { "" },
|
||||
if rotation_enabled { " with rotation enabled" } else { "" }
|
||||
))
|
||||
.await;
|
||||
|
||||
for i in 0..pool_size {
|
||||
if rotation_enabled && actual_pool_size < 2 {
|
||||
crate::util::logger::log_warn(
|
||||
"Rotation enabled but pool has < 2 instances - rotation will be limited"
|
||||
).await;
|
||||
}
|
||||
|
||||
for i in 0..actual_pool_size {
|
||||
let proxy_url = proxy_pool
|
||||
.as_ref()
|
||||
.map(|pp| pp.get_proxy_url(i));
|
||||
@@ -68,12 +103,22 @@ impl ChromeDriverPool {
|
||||
|
||||
Ok(Self {
|
||||
instances,
|
||||
semaphore: Arc::new(Semaphore::new(pool_size)),
|
||||
semaphore: Arc::new(Semaphore::new(actual_pool_size)),
|
||||
proxy_pool,
|
||||
rotation_enabled,
|
||||
next_instance: Arc::new(Mutex::new(0)),
|
||||
})
|
||||
}
|
||||
|
||||
/// Execute a scraping task using an available instance from the pool.
|
||||
///
|
||||
/// When rotation is enabled:
|
||||
/// - Uses only half of the instances at a time
|
||||
/// - Rotates to the other half when an instance reaches its task limit
|
||||
/// - Cycles through instances in round-robin fashion within the active half
|
||||
///
|
||||
/// When rotation is disabled:
|
||||
/// - Uses all instances with random selection
|
||||
pub async fn execute<T, F, Fut>(&self, url: String, parse: F) -> Result<T>
|
||||
where
|
||||
T: Send + 'static,
|
||||
@@ -82,8 +127,81 @@ impl ChromeDriverPool {
|
||||
{
|
||||
let _permit = self.semaphore.acquire().await.map_err(|_| anyhow!("Pool closed"))?;
|
||||
|
||||
// Instance selection: round-robin when rotation is enabled, random otherwise
|
||||
let index = rand::random_range(..self.instances.len());
|
||||
let index = if self.rotation_enabled {
|
||||
// Rotation mode: use only half of instances at a time
|
||||
let total_instances = self.instances.len();
|
||||
let half_size = (total_instances + 1) / 2; // Round up for odd numbers
|
||||
|
||||
let mut next_idx = self.next_instance.lock().await;
|
||||
let base_idx = *next_idx;
|
||||
let mut selected_idx = base_idx;
|
||||
let mut found_in_current_half = false;
|
||||
|
||||
// Try to find an available instance in the current half
|
||||
for offset in 0..half_size {
|
||||
let candidate_idx = (base_idx + offset) % half_size;
|
||||
|
||||
// Check if this instance has reached its task limit
|
||||
let instance = &self.instances[candidate_idx];
|
||||
let guard = instance.lock().await;
|
||||
|
||||
if guard.max_tasks_per_instance == 0 ||
|
||||
guard.task_count < guard.max_tasks_per_instance {
|
||||
// This instance is available
|
||||
*next_idx = (candidate_idx + 1) % half_size;
|
||||
selected_idx = candidate_idx;
|
||||
found_in_current_half = true;
|
||||
drop(guard);
|
||||
break;
|
||||
} else {
|
||||
drop(guard);
|
||||
}
|
||||
}
|
||||
|
||||
if !found_in_current_half {
|
||||
// All instances in current half are at limit, switch to other half
|
||||
crate::util::logger::log_info(
|
||||
"Current half saturated, rotating to other half of instances"
|
||||
).await;
|
||||
|
||||
let other_half_start = half_size;
|
||||
let other_half_size = total_instances - half_size;
|
||||
|
||||
// Find available instance in other half
|
||||
let mut found_in_other_half = false;
|
||||
for offset in 0..other_half_size {
|
||||
let candidate_idx = other_half_start + offset;
|
||||
|
||||
let instance = &self.instances[candidate_idx];
|
||||
let guard = instance.lock().await;
|
||||
|
||||
if guard.max_tasks_per_instance == 0 ||
|
||||
guard.task_count < guard.max_tasks_per_instance {
|
||||
// Switch to this half for future requests
|
||||
*next_idx = offset;
|
||||
selected_idx = candidate_idx;
|
||||
found_in_other_half = true;
|
||||
drop(guard);
|
||||
break;
|
||||
} else {
|
||||
drop(guard);
|
||||
}
|
||||
}
|
||||
|
||||
if !found_in_other_half {
|
||||
// All instances saturated - use round-robin anyway
|
||||
selected_idx = *next_idx % total_instances;
|
||||
*next_idx = (*next_idx + 1) % total_instances;
|
||||
}
|
||||
}
|
||||
|
||||
drop(next_idx);
|
||||
selected_idx
|
||||
} else {
|
||||
// Non-rotation mode: random selection as before
|
||||
rand::random_range(..self.instances.len())
|
||||
};
|
||||
|
||||
let instance = self.instances[index].clone();
|
||||
let mut guard = instance.lock().await;
|
||||
|
||||
@@ -91,7 +209,8 @@ impl ChromeDriverPool {
|
||||
|
||||
if guard.max_tasks_per_instance > 0 {
|
||||
crate::util::logger::log_info(&format!(
|
||||
"Instance task count: {}/{}",
|
||||
"Instance {} task count: {}/{}",
|
||||
index,
|
||||
guard.get_task_count(),
|
||||
guard.max_tasks_per_instance
|
||||
))
|
||||
@@ -130,6 +249,20 @@ impl ChromeDriverPool {
|
||||
pub fn get_number_of_instances(&self) -> usize {
|
||||
self.instances.len()
|
||||
}
|
||||
|
||||
/// Returns whether rotation is enabled
|
||||
pub fn is_rotation_enabled(&self) -> bool {
|
||||
self.rotation_enabled
|
||||
}
|
||||
|
||||
/// Returns the size of each half when rotation is enabled
|
||||
pub fn get_rotation_half_size(&self) -> usize {
|
||||
if self.rotation_enabled {
|
||||
(self.instances.len() + 1) / 2
|
||||
} else {
|
||||
self.instances.len()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Represents a single instance of chromedriver process, optionally bound to a VPN.
|
||||
|
||||
Reference in New Issue
Block a user