added pool rotation to chromedriver pool

This commit is contained in:
2025-12-18 15:59:56 +01:00
parent c51b36c125
commit cd91de253b
5 changed files with 314 additions and 170 deletions

View File

@@ -12,13 +12,18 @@ use crate::util::directories::DataPaths;
use crate::util::logger;
use crate::scraper::webdriver::ChromeDriverPool;
use rand::Rng;
use tokio::sync::mpsc;
use tokio::io::AsyncWriteExt;
use tokio::fs::OpenOptions;
use tokio::time::sleep;
use std::collections::HashMap;
use std::sync::Arc;
use std::sync::atomic::{AtomicBool, Ordering};
use std::time::Duration;
use futures::stream::{FuturesUnordered, StreamExt};
use anyhow::{anyhow, Context, Result};
/// Represents a write command to be serialized through the log writer
enum LogCommand {
@@ -413,6 +418,37 @@ pub async fn build_companies_jsonl_streaming_parallel(
Ok(final_count)
}
/// Scrapes Yahoo company details for `isin`, retrying failed attempts with
/// exponential backoff plus random jitter.
///
/// The first attempt is made immediately. After each failure the function
/// waits `1000 * 2^retries` milliseconds plus 0..500 ms of jitter before the
/// next attempt, where `retries` counts the retries already performed
/// (so the waits are roughly 1s, 2s, 4s, ... ).
///
/// # Arguments
/// * `pool` - ChromeDriver pool handed to `scrape_company_details_by_isin`.
/// * `isin` - ISIN to look up.
/// * `max_retries` - Number of retries allowed after the initial attempt;
///   at most `max_retries + 1` attempts are made in total.
///
/// # Errors
/// Returns the error of the last attempt once `max_retries` retries have
/// been used up.
async fn scrape_with_retry(
    pool: &Arc<ChromeDriverPool>,
    isin: &str,
    max_retries: u32,
) -> Result<Option<YahooCompanyDetails>> {
    let mut retries = 0;

    loop {
        match scrape_company_details_by_isin(pool, isin).await {
            Ok(result) => return Ok(result),
            Err(e) => {
                if retries >= max_retries {
                    return Err(e);
                }

                // Saturating arithmetic: a plain `1000 * 2u64.pow(retries)` panics in
                // debug builds and wraps (down to a 0 ms delay) in release builds once
                // `retries` gets large. For small retry counts the result is identical.
                let backoff_ms = 2u64.saturating_pow(retries).saturating_mul(1000);
                // Jitter in 0..500 ms (upper bound exclusive).
                let jitter_ms: u64 = rand::rng().random_range(0..500);
                let total_delay = backoff_ms.saturating_add(jitter_ms);

                logger::log_warn(&format!(
                    "Retry {}/{} for ISIN {} after {}ms: {}",
                    retries + 1, max_retries, isin, total_delay, e
                )).await;

                sleep(Duration::from_millis(total_delay)).await;
                retries += 1;
            }
        }
    }
}
/// Process a single company: fetch Yahoo data for its ISINs
async fn process_single_company(
name: String,
@@ -469,8 +505,7 @@ async fn process_single_company(
if !has_yahoo_ticker && !shutdown_flag.load(Ordering::SeqCst) {
logger::log_info(&format!("Fetching Yahoo details for {} (ISIN: {})", name, isin)).await;
match scrape_company_details_by_isin(pool, &isin).await {
match scrape_with_retry(pool, &isin, 3).await {
Ok(Some(details)) => {
logger::log_info(&format!("✓ Found Yahoo ticker {} for ISIN {}", details.ticker, isin)).await;

View File

@@ -3,6 +3,7 @@ use super::{types::*, helpers::*};
use crate::{scraper::webdriver::*, util::{directories::DataPaths}};
use event_backtest_engine::logger;
use fantoccini::{Client, Locator};
use rand::Rng;
use serde::{Deserialize, Serialize};
use tokio::time::{Duration as TokioDuration, sleep, timeout};
use std::{sync::Arc};
@@ -73,9 +74,16 @@ pub async fn scrape_company_details_by_isin(
pool.execute(format!("https://finance.yahoo.com/lookup/?s={}", isin), move |client| {
let isin = isin.clone();
Box::pin(async move {
sleep(TokioDuration::from_millis(1000)).await;
// Random delay in the range 800..1500 ms (upper bound exclusive)
let delay = rand::rng().random_range(800..1500);
sleep(TokioDuration::from_millis(delay)).await;
reject_yahoo_cookies(&client).await?;
sleep(TokioDuration::from_millis(1000)).await;
// Random delay in the range 800..1500 ms (upper bound exclusive)
let delay = rand::rng().random_range(800..1500);
sleep(TokioDuration::from_millis(delay)).await;
extract_company_details(&client, &isin).await
})
}).await