added pool rotation to chromedriver pool

@@ -12,13 +12,18 @@ use crate::util::directories::DataPaths;
use crate::util::logger;
use crate::scraper::webdriver::ChromeDriverPool;

use rand::Rng;
use tokio::sync::mpsc;
use tokio::io::AsyncWriteExt;
use tokio::fs::OpenOptions;
use tokio::time::sleep;
use std::collections::HashMap;
use std::sync::Arc;
use std::sync::atomic::{AtomicBool, Ordering};
use std::time::Duration;
use futures::stream::{FuturesUnordered, StreamExt};
use anyhow::{anyhow, Context, Result};


/// Represents a write command to be serialized through the log writer
enum LogCommand {

@@ -413,6 +418,37 @@ pub async fn build_companies_jsonl_streaming_parallel(
    Ok(final_count)
}

async fn scrape_with_retry(
    pool: &Arc<ChromeDriverPool>,
    isin: &str,
    max_retries: u32,
) -> Result<Option<YahooCompanyDetails>> {
    let mut retries = 0;

    loop {
        match scrape_company_details_by_isin(pool, isin).await {
            Ok(result) => return Ok(result),
            Err(e) => {
                if retries >= max_retries {
                    return Err(e);
                }

                let backoff_ms = 1000 * 2u64.pow(retries); // 1s, 2s, 4s, 8s
                let jitter_ms = rand::rng().random_range(0..500); // +0-500ms jitter
                let total_delay = backoff_ms + jitter_ms;

                logger::log_warn(&format!(
                    "Retry {}/{} for ISIN {} after {}ms: {}",
                    retries + 1, max_retries, isin, total_delay, e
                )).await;

                sleep(Duration::from_millis(total_delay)).await;
                retries += 1;
            }
        }
    }
}

/// Process a single company: fetch Yahoo data for its ISINs
async fn process_single_company(
    name: String,

@@ -469,8 +505,7 @@ async fn process_single_company(

    if !has_yahoo_ticker && !shutdown_flag.load(Ordering::SeqCst) {
        logger::log_info(&format!("Fetching Yahoo details for {} (ISIN: {})", name, isin)).await;

-       match scrape_company_details_by_isin(pool, &isin).await {
+       match scrape_with_retry(pool, &isin, 3).await {
            Ok(Some(details)) => {
                logger::log_info(&format!("✓ Found Yahoo ticker {} for ISIN {}", details.ticker, isin)).await;

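As a quick illustration of the schedule the new scrape_with_retry helper produces: the 1000 * 2^retries base and the 0-500 ms jitter below are taken from the hunk above, while the loop and the printing are purely illustrative (rand 0.9-style API, matching the calls in the diff), not code from this repository.

use rand::Rng;

fn main() {
    // Same formula as scrape_with_retry: exponential base delay plus random jitter.
    // With max_retries = 3 (the value used at the call site) only the first three
    // waits are ever reached: ~1s, ~2s, ~4s.
    for retries in 0u32..4 {
        let backoff_ms = 1000 * 2u64.pow(retries);
        let jitter_ms: u64 = rand::rng().random_range(0..500);
        println!("retry {}: wait {} ms", retries + 1, backoff_ms + jitter_ms);
    }
}
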
@@ -3,6 +3,7 @@ use super::{types::*, helpers::*};
use crate::{scraper::webdriver::*, util::{directories::DataPaths}};
use event_backtest_engine::logger;
use fantoccini::{Client, Locator};
use rand::Rng;
use serde::{Deserialize, Serialize};
use tokio::time::{Duration as TokioDuration, sleep, timeout};
use std::{sync::Arc};

@@ -73,9 +74,16 @@ pub async fn scrape_company_details_by_isin(
    pool.execute(format!("https://finance.yahoo.com/lookup/?s={}", isin), move |client| {
        let isin = isin.clone();
        Box::pin(async move {
            sleep(TokioDuration::from_millis(1000)).await;
            // Random delay between 800-1500ms
            let delay = rand::rng().random_range(800..1500);
            sleep(TokioDuration::from_millis(delay)).await;

            reject_yahoo_cookies(&client).await?;
            sleep(TokioDuration::from_millis(1000)).await;

            // Random delay
            let delay = rand::rng().random_range(800..1500);
            sleep(TokioDuration::from_millis(delay)).await;

            extract_company_details(&client, &isin).await
        })
    }).await

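The fixed-plus-random pauses added in this last hunk are a common pacing pattern for scraping. A self-contained sketch of the same idea follows; the 800-1500 ms range comes from the diff, while the helper name human_delay and the tokio main are my own illustration, not code from this repository.

use rand::Rng;
use tokio::time::{sleep, Duration};

// Sleep for a random duration in [min_ms, max_ms), mirroring the inline
// delays added around reject_yahoo_cookies and extract_company_details.
async fn human_delay(min_ms: u64, max_ms: u64) {
    let ms = rand::rng().random_range(min_ms..max_ms);
    sleep(Duration::from_millis(ms)).await;
}

#[tokio::main]
async fn main() {
    human_delay(800, 1500).await; // pause 800-1499 ms before the next request
}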