added pool rotation to chromedriver pool
This commit is contained in:
38
.env.example
38
.env.example
@@ -3,46 +3,38 @@
|
|||||||
# This file configures the behavior of the WebScraper application
|
# This file configures the behavior of the WebScraper application
|
||||||
# Copy to .env and adjust values as needed
|
# Copy to .env and adjust values as needed
|
||||||
|
|
||||||
# ===== ECONOMIC DATA =====
|
OPENFIGI_API_KEY=
|
||||||
# Start date for economic event scraping
|
|
||||||
|
# Economic calendar start (usually the earliest available on finanzen.net)
|
||||||
ECONOMIC_START_DATE=2007-02-13
|
ECONOMIC_START_DATE=2007-02-13
|
||||||
|
|
||||||
# How far into the future to look ahead for economic events (in months)
|
# Corporate earnings & price history start
|
||||||
ECONOMIC_LOOKAHEAD_MONTHS=3
|
|
||||||
|
|
||||||
# ===== CORPORATE DATA =====
|
|
||||||
# Start date for corporate earnings/data scraping
|
|
||||||
CORPORATE_START_DATE=2010-01-01
|
CORPORATE_START_DATE=2010-01-01
|
||||||
|
|
||||||
# ===== PERFORMANCE & CONCURRENCY =====
|
# How far into the future we scrape economic events (in months)
|
||||||
# Maximum number of parallel ChromeDriver instances
|
ECONOMIC_LOOKAHEAD_MONTHS=3
|
||||||
# Higher = more concurrent tasks, but higher resource usage
|
|
||||||
MAX_PARALLEL_INSTANCES=3
|
|
||||||
|
|
||||||
# Maximum tasks per ChromeDriver instance before recycling
|
# Maximum number of parallel scraping tasks (default: 10)
|
||||||
# 0 = unlimited (instance lives for entire application runtime)
|
MAX_PARALLEL_TASKS=10
|
||||||
MAX_TASKS_PER_INSTANCE=0
|
|
||||||
|
|
||||||
# ===== VPN ROTATION (ProtonVPN Integration) =====
|
# ===== VPN ROTATION (ProtonVPN Integration) =====
|
||||||
# Enable automatic VPN rotation between sessions?
|
# Enable automatic VPN rotation between sessions?
|
||||||
# If false, all traffic goes through system without VPN tunneling
|
# If false, all traffic goes through system without VPN tunneling
|
||||||
ENABLE_VPN_ROTATION=false
|
ENABLE_VPN_ROTATION=true
|
||||||
|
|
||||||
# Comma-separated list of ProtonVPN servers to rotate through
|
|
||||||
# Examples:
|
|
||||||
# "US-Free#1,US-Free#2,UK-Free#1"
|
|
||||||
# "US,UK,JP,DE,NL"
|
|
||||||
# NOTE: Must have ENABLE_VPN_ROTATION=true for this to take effect
|
|
||||||
VPN_SERVERS=
|
|
||||||
|
|
||||||
# Number of tasks per VPN session before rotating to new server/IP
|
# Number of tasks per VPN session before rotating to new server/IP
|
||||||
# 0 = rotate between economic and corporate phases (one phase = one IP)
|
# 0 = rotate between economic and corporate phases (one phase = one IP)
|
||||||
# 5 = rotate every 5 tasks
|
# 5 = rotate every 5 tasks
|
||||||
# NOTE: Must have ENABLE_VPN_ROTATION=true for this to take effect
|
# NOTE: Must have ENABLE_VPN_ROTATION=true for this to take effect
|
||||||
TASKS_PER_VPN_SESSION=0
|
TASKS_PER_VPN_SESSION=50
|
||||||
|
|
||||||
# ===== LOGGING =====
|
# ===== LOGGING =====
|
||||||
# Set via RUST_LOG environment variable:
|
# Set via RUST_LOG environment variable:
|
||||||
# RUST_LOG=info cargo run
|
# RUST_LOG=info cargo run
|
||||||
# RUST_LOG=debug cargo run
|
# RUST_LOG=debug cargo run
|
||||||
# Leave empty or unset for default logging level
|
# Leave empty or unset for default logging level
|
||||||
|
|
||||||
|
|
||||||
|
MAX_REQUESTS_PER_SESSION=25
|
||||||
|
MIN_REQUEST_INTERVAL_MS=300
|
||||||
|
MAX_RETRY_ATTEMPTS=3
|
||||||
133
src/config.rs
133
src/config.rs
@@ -1,32 +1,49 @@
|
|||||||
|
// src/config.rs - FIXED VERSION
|
||||||
|
|
||||||
|
use std::sync::{Arc, atomic::{AtomicUsize, Ordering}};
|
||||||
use anyhow::{Context, Result};
|
use anyhow::{Context, Result};
|
||||||
use chrono::{self};
|
use chrono::{self};
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
|
|
||||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
pub struct Config {
|
pub struct Config {
|
||||||
// Economic calendar start (usually the earliest available on finanzen.net)
|
pub economic_start_date: String,
|
||||||
pub economic_start_date: String, // e.g. "2007-02-13"
|
pub corporate_start_date: String,
|
||||||
// Corporate earnings & price history start
|
pub economic_lookahead_months: u32,
|
||||||
pub corporate_start_date: String, // e.g. "2000-01-01" or "2010-01-01"
|
|
||||||
// How far into the future we scrape economic events
|
|
||||||
pub economic_lookahead_months: u32, // default: 3
|
|
||||||
/// Maximum number of parallel scraping tasks (default: 10).
|
|
||||||
/// This limits concurrency to protect system load and prevent website spamming.
|
|
||||||
#[serde(default = "default_max_parallel_instances")]
|
#[serde(default = "default_max_parallel_instances")]
|
||||||
pub max_parallel_instances: usize,
|
pub max_parallel_instances: usize,
|
||||||
|
|
||||||
pub max_tasks_per_instance: usize,
|
pub max_tasks_per_instance: usize,
|
||||||
|
|
||||||
/// VPN rotation configuration
|
|
||||||
/// If set to "true", enables automatic VPN rotation between sessions
|
|
||||||
#[serde(default)]
|
#[serde(default)]
|
||||||
pub enable_vpn_rotation: bool,
|
pub enable_vpn_rotation: bool,
|
||||||
|
|
||||||
|
// IMPROVEMENT: Reduzierte Defaults für weniger aggressive Scraping
|
||||||
|
#[serde(default = "default_max_requests_per_session")]
|
||||||
|
pub max_requests_per_session: usize,
|
||||||
|
|
||||||
|
#[serde(default = "default_min_request_interval_ms")]
|
||||||
|
pub min_request_interval_ms: u64,
|
||||||
|
|
||||||
|
#[serde(default = "default_max_retry_attempts")]
|
||||||
|
pub max_retry_attempts: u32,
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Default for `Config::max_parallel_instances` (reduced from 10 to 4).
fn default_max_parallel_instances() -> usize {
    4
}
|
||||||
|
|
||||||
|
/// Default for `Config::max_requests_per_session` (reduced from 25 to 10).
fn default_max_requests_per_session() -> usize {
    10
}
|
||||||
|
|
||||||
|
/// Default for `Config::min_request_interval_ms` (raised from 300 to 1200).
fn default_min_request_interval_ms() -> u64 {
    1200
}
|
||||||
|
|
||||||
|
/// Default for `Config::max_retry_attempts`.
fn default_max_retry_attempts() -> u32 {
    const DEFAULT_RETRY_ATTEMPTS: u32 = 3;
    DEFAULT_RETRY_ATTEMPTS
}
|
||||||
|
|
||||||
impl Default for Config {
|
impl Default for Config {
|
||||||
fn default() -> Self {
|
fn default() -> Self {
|
||||||
Self {
|
Self {
|
||||||
@@ -35,26 +52,71 @@ impl Default for Config {
|
|||||||
economic_lookahead_months: 3,
|
economic_lookahead_months: 3,
|
||||||
max_parallel_instances: default_max_parallel_instances(),
|
max_parallel_instances: default_max_parallel_instances(),
|
||||||
max_tasks_per_instance: 0,
|
max_tasks_per_instance: 0,
|
||||||
|
max_requests_per_session: default_max_requests_per_session(),
|
||||||
|
min_request_interval_ms: default_min_request_interval_ms(),
|
||||||
|
max_retry_attempts: default_max_retry_attempts(),
|
||||||
enable_vpn_rotation: false,
|
enable_vpn_rotation: false,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub struct PoolMetrics {
|
||||||
|
pub total_requests: Arc<AtomicUsize>,
|
||||||
|
pub successful_requests: Arc<AtomicUsize>,
|
||||||
|
pub failed_requests: Arc<AtomicUsize>,
|
||||||
|
pub session_renewals: Arc<AtomicUsize>,
|
||||||
|
pub rotation_events: Arc<AtomicUsize>,
|
||||||
|
pub retries: Arc<AtomicUsize>,
|
||||||
|
|
||||||
|
// IMPROVEMENT: Neue Metriken für besseres Monitoring
|
||||||
|
pub navigation_timeouts: Arc<AtomicUsize>,
|
||||||
|
pub bot_detection_hits: Arc<AtomicUsize>,
|
||||||
|
pub proxy_failures: Arc<AtomicUsize>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl PoolMetrics {
|
||||||
|
pub fn new() -> Self {
|
||||||
|
Self {
|
||||||
|
total_requests: Arc::new(AtomicUsize::new(0)),
|
||||||
|
successful_requests: Arc::new(AtomicUsize::new(0)),
|
||||||
|
failed_requests: Arc::new(AtomicUsize::new(0)),
|
||||||
|
session_renewals: Arc::new(AtomicUsize::new(0)),
|
||||||
|
rotation_events: Arc::new(AtomicUsize::new(0)),
|
||||||
|
retries: Arc::new(AtomicUsize::new(0)),
|
||||||
|
navigation_timeouts: Arc::new(AtomicUsize::new(0)),
|
||||||
|
bot_detection_hits: Arc::new(AtomicUsize::new(0)),
|
||||||
|
proxy_failures: Arc::new(AtomicUsize::new(0)),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn log_stats(&self) {
|
||||||
|
let total = self.total_requests.load(Ordering::Relaxed);
|
||||||
|
let success = self.successful_requests.load(Ordering::Relaxed);
|
||||||
|
// FIX: Prefix unused variable with underscore
|
||||||
|
let _failed = self.failed_requests.load(Ordering::Relaxed);
|
||||||
|
let renewals = self.session_renewals.load(Ordering::Relaxed);
|
||||||
|
let rotations = self.rotation_events.load(Ordering::Relaxed);
|
||||||
|
let retries = self.retries.load(Ordering::Relaxed);
|
||||||
|
let timeouts = self.navigation_timeouts.load(Ordering::Relaxed);
|
||||||
|
let bot_hits = self.bot_detection_hits.load(Ordering::Relaxed);
|
||||||
|
let proxy_fails = self.proxy_failures.load(Ordering::Relaxed);
|
||||||
|
|
||||||
|
let success_rate = if total > 0 {
|
||||||
|
(success as f64 / total as f64) * 100.0
|
||||||
|
} else {
|
||||||
|
0.0
|
||||||
|
};
|
||||||
|
|
||||||
|
crate::util::logger::log_info(&format!(
|
||||||
|
"Pool Metrics: {} total requests, {:.1}% success rate, {} renewals, {} rotations, {} retries, {} timeouts, {} bot detections, {} proxy failures",
|
||||||
|
total, success_rate, renewals, rotations, retries, timeouts, bot_hits, proxy_fails
|
||||||
|
)).await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
impl Config {
|
impl Config {
|
||||||
/// Loads the configuration from environment variables using dotenvy.
|
/// Loads configuration from environment variables using dotenvy.
|
||||||
///
|
|
||||||
/// This function loads a `.env` file if present (via `dotenvy::dotenv()`),
|
|
||||||
/// then retrieves each configuration value from environment variables.
|
|
||||||
/// If a variable is missing, it falls back to the default value.
|
|
||||||
/// Variable names are uppercase with underscores (e.g., ECONOMIC_START_DATE).
|
|
||||||
///
|
|
||||||
/// # Returns
|
|
||||||
/// The loaded Config on success.
|
|
||||||
///
|
|
||||||
/// # Errors
|
|
||||||
/// Returns an error if parsing fails (e.g., invalid integer for lookahead months).
|
|
||||||
pub fn load() -> Result<Self> {
|
pub fn load() -> Result<Self> {
|
||||||
// Load .env file if it exists; ignore if not found (dotenvy::dotenv returns Ok if no file)
|
|
||||||
let _ = dotenvy::dotenv().context("Failed to load .env file (optional)")?;
|
let _ = dotenvy::dotenv().context("Failed to load .env file (optional)")?;
|
||||||
|
|
||||||
let economic_start_date = dotenvy::var("ECONOMIC_START_DATE")
|
let economic_start_date = dotenvy::var("ECONOMIC_START_DATE")
|
||||||
@@ -68,13 +130,14 @@ impl Config {
|
|||||||
.parse()
|
.parse()
|
||||||
.context("Failed to parse ECONOMIC_LOOKAHEAD_MONTHS as u32")?;
|
.context("Failed to parse ECONOMIC_LOOKAHEAD_MONTHS as u32")?;
|
||||||
|
|
||||||
|
// IMPROVEMENT: Reduzierte Defaults
|
||||||
let max_parallel_instances: usize = dotenvy::var("MAX_PARALLEL_INSTANCES")
|
let max_parallel_instances: usize = dotenvy::var("MAX_PARALLEL_INSTANCES")
|
||||||
.unwrap_or_else(|_| "10".to_string())
|
.unwrap_or_else(|_| "4".to_string()) // Geändert von 10
|
||||||
.parse()
|
.parse()
|
||||||
.context("Failed to parse MAX_PARALLEL_INSTANCES as usize")?;
|
.context("Failed to parse MAX_PARALLEL_INSTANCES as usize")?;
|
||||||
|
|
||||||
let max_tasks_per_instance: usize = dotenvy::var("MAX_TASKS_PER_INSTANCE")
|
let max_tasks_per_instance: usize = dotenvy::var("MAX_TASKS_PER_INSTANCE")
|
||||||
.unwrap_or_else(|_| "0".to_string())
|
.unwrap_or_else(|_| "5".to_string()) // Geändert von 0
|
||||||
.parse()
|
.parse()
|
||||||
.context("Failed to parse MAX_TASKS_PER_INSTANCE as usize")?;
|
.context("Failed to parse MAX_TASKS_PER_INSTANCE as usize")?;
|
||||||
|
|
||||||
@@ -83,6 +146,21 @@ impl Config {
|
|||||||
.parse::<bool>()
|
.parse::<bool>()
|
||||||
.context("Failed to parse ENABLE_VPN_ROTATION as bool")?;
|
.context("Failed to parse ENABLE_VPN_ROTATION as bool")?;
|
||||||
|
|
||||||
|
let max_requests_per_session: usize = dotenvy::var("MAX_REQUESTS_PER_SESSION")
|
||||||
|
.unwrap_or_else(|_| "10".to_string()) // Geändert von 25
|
||||||
|
.parse()
|
||||||
|
.context("Failed to parse MAX_REQUESTS_PER_SESSION as usize")?;
|
||||||
|
|
||||||
|
let min_request_interval_ms: u64 = dotenvy::var("MIN_REQUEST_INTERVAL_MS")
|
||||||
|
.unwrap_or_else(|_| "1200".to_string()) // Geändert von 300
|
||||||
|
.parse()
|
||||||
|
.context("Failed to parse MIN_REQUEST_INTERVAL_MS as u64")?;
|
||||||
|
|
||||||
|
let max_retry_attempts: u32 = dotenvy::var("MAX_RETRY_ATTEMPTS")
|
||||||
|
.unwrap_or_else(|_| "3".to_string())
|
||||||
|
.parse()
|
||||||
|
.context("Failed to parse MAX_RETRY_ATTEMPTS as u32")?;
|
||||||
|
|
||||||
Ok(Self {
|
Ok(Self {
|
||||||
economic_start_date,
|
economic_start_date,
|
||||||
corporate_start_date,
|
corporate_start_date,
|
||||||
@@ -90,6 +168,9 @@ impl Config {
|
|||||||
max_parallel_instances,
|
max_parallel_instances,
|
||||||
max_tasks_per_instance,
|
max_tasks_per_instance,
|
||||||
enable_vpn_rotation,
|
enable_vpn_rotation,
|
||||||
|
max_requests_per_session,
|
||||||
|
min_request_interval_ms,
|
||||||
|
max_retry_attempts,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -12,13 +12,18 @@ use crate::util::directories::DataPaths;
|
|||||||
use crate::util::logger;
|
use crate::util::logger;
|
||||||
use crate::scraper::webdriver::ChromeDriverPool;
|
use crate::scraper::webdriver::ChromeDriverPool;
|
||||||
|
|
||||||
|
use rand::Rng;
|
||||||
use tokio::sync::mpsc;
|
use tokio::sync::mpsc;
|
||||||
use tokio::io::AsyncWriteExt;
|
use tokio::io::AsyncWriteExt;
|
||||||
use tokio::fs::OpenOptions;
|
use tokio::fs::OpenOptions;
|
||||||
|
use tokio::time::sleep;
|
||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
use std::sync::atomic::{AtomicBool, Ordering};
|
use std::sync::atomic::{AtomicBool, Ordering};
|
||||||
|
use std::time::Duration;
|
||||||
use futures::stream::{FuturesUnordered, StreamExt};
|
use futures::stream::{FuturesUnordered, StreamExt};
|
||||||
|
use anyhow::{anyhow, Context, Result};
|
||||||
|
|
||||||
|
|
||||||
/// Represents a write command to be serialized through the log writer
|
/// Represents a write command to be serialized through the log writer
|
||||||
enum LogCommand {
|
enum LogCommand {
|
||||||
@@ -413,6 +418,37 @@ pub async fn build_companies_jsonl_streaming_parallel(
|
|||||||
Ok(final_count)
|
Ok(final_count)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
async fn scrape_with_retry(
|
||||||
|
pool: &Arc<ChromeDriverPool>,
|
||||||
|
isin: &str,
|
||||||
|
max_retries: u32,
|
||||||
|
) -> Result<Option<YahooCompanyDetails>> {
|
||||||
|
let mut retries = 0;
|
||||||
|
|
||||||
|
loop {
|
||||||
|
match scrape_company_details_by_isin(pool, isin).await {
|
||||||
|
Ok(result) => return Ok(result),
|
||||||
|
Err(e) => {
|
||||||
|
if retries >= max_retries {
|
||||||
|
return Err(e);
|
||||||
|
}
|
||||||
|
|
||||||
|
let backoff_ms = 1000 * 2u64.pow(retries); // 1s, 2s, 4s, 8s
|
||||||
|
let jitter_ms = rand::rng().random_range(0..500); // +0-500ms Jitter
|
||||||
|
let total_delay = backoff_ms + jitter_ms;
|
||||||
|
|
||||||
|
logger::log_warn(&format!(
|
||||||
|
"Retry {}/{} for ISIN {} after {}ms: {}",
|
||||||
|
retries + 1, max_retries, isin, total_delay, e
|
||||||
|
)).await;
|
||||||
|
|
||||||
|
sleep(Duration::from_millis(total_delay)).await;
|
||||||
|
retries += 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// Process a single company: fetch Yahoo data for its ISINs
|
/// Process a single company: fetch Yahoo data for its ISINs
|
||||||
async fn process_single_company(
|
async fn process_single_company(
|
||||||
name: String,
|
name: String,
|
||||||
@@ -469,8 +505,7 @@ async fn process_single_company(
|
|||||||
|
|
||||||
if !has_yahoo_ticker && !shutdown_flag.load(Ordering::SeqCst) {
|
if !has_yahoo_ticker && !shutdown_flag.load(Ordering::SeqCst) {
|
||||||
logger::log_info(&format!("Fetching Yahoo details for {} (ISIN: {})", name, isin)).await;
|
logger::log_info(&format!("Fetching Yahoo details for {} (ISIN: {})", name, isin)).await;
|
||||||
|
match scrape_with_retry(pool, &isin, 3).await {
|
||||||
match scrape_company_details_by_isin(pool, &isin).await {
|
|
||||||
Ok(Some(details)) => {
|
Ok(Some(details)) => {
|
||||||
logger::log_info(&format!("✓ Found Yahoo ticker {} for ISIN {}", details.ticker, isin)).await;
|
logger::log_info(&format!("✓ Found Yahoo ticker {} for ISIN {}", details.ticker, isin)).await;
|
||||||
|
|
||||||
|
|||||||
@@ -3,6 +3,7 @@ use super::{types::*, helpers::*};
|
|||||||
use crate::{scraper::webdriver::*, util::{directories::DataPaths}};
|
use crate::{scraper::webdriver::*, util::{directories::DataPaths}};
|
||||||
use event_backtest_engine::logger;
|
use event_backtest_engine::logger;
|
||||||
use fantoccini::{Client, Locator};
|
use fantoccini::{Client, Locator};
|
||||||
|
use rand::Rng;
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
use tokio::time::{Duration as TokioDuration, sleep, timeout};
|
use tokio::time::{Duration as TokioDuration, sleep, timeout};
|
||||||
use std::{sync::Arc};
|
use std::{sync::Arc};
|
||||||
@@ -73,9 +74,16 @@ pub async fn scrape_company_details_by_isin(
|
|||||||
pool.execute(format!("https://finance.yahoo.com/lookup/?s={}", isin), move |client| {
|
pool.execute(format!("https://finance.yahoo.com/lookup/?s={}", isin), move |client| {
|
||||||
let isin = isin.clone();
|
let isin = isin.clone();
|
||||||
Box::pin(async move {
|
Box::pin(async move {
|
||||||
sleep(TokioDuration::from_millis(1000)).await;
|
// Random Delay between 800-1500ms
|
||||||
|
let delay = rand::rng().random_range(800..1500);
|
||||||
|
sleep(TokioDuration::from_millis(delay)).await;
|
||||||
|
|
||||||
reject_yahoo_cookies(&client).await?;
|
reject_yahoo_cookies(&client).await?;
|
||||||
sleep(TokioDuration::from_millis(1000)).await;
|
|
||||||
|
// Random Delay
|
||||||
|
let delay = rand::rng().random_range(800..1500);
|
||||||
|
sleep(TokioDuration::from_millis(delay)).await;
|
||||||
|
|
||||||
extract_company_details(&client, &isin).await
|
extract_company_details(&client, &isin).await
|
||||||
})
|
})
|
||||||
}).await
|
}).await
|
||||||
|
|||||||
@@ -2,13 +2,14 @@
|
|||||||
|
|
||||||
use anyhow::{anyhow, Context, Result};
|
use anyhow::{anyhow, Context, Result};
|
||||||
use fantoccini::{Client, ClientBuilder};
|
use fantoccini::{Client, ClientBuilder};
|
||||||
use rand::seq::{IndexedRandom, SliceRandom};
|
use rand::seq::{IndexedRandom};
|
||||||
use rand::rngs::ThreadRng;
|
use rand::rngs::ThreadRng;
|
||||||
use rand::Rng; // for the RNG trait
|
use rand::Rng; // for the RNG trait
|
||||||
use serde_json::{Map, Value};
|
use serde_json::{Map, Value};
|
||||||
use std::pin::Pin;
|
use std::pin::Pin;
|
||||||
use std::process::Stdio;
|
use std::process::Stdio;
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
|
use std::time::Instant;
|
||||||
use tokio::io::{AsyncBufReadExt, BufReader};
|
use tokio::io::{AsyncBufReadExt, BufReader};
|
||||||
use tokio::process::{Child, Command};
|
use tokio::process::{Child, Command};
|
||||||
use tokio::task::JoinHandle;
|
use tokio::task::JoinHandle;
|
||||||
@@ -26,6 +27,9 @@ pub struct ChromeDriverPool {
|
|||||||
rotation_enabled: bool,
|
rotation_enabled: bool,
|
||||||
/// Index for round-robin instance selection (when rotation is enabled)
|
/// Index for round-robin instance selection (when rotation is enabled)
|
||||||
next_instance: Arc<Mutex<usize>>,
|
next_instance: Arc<Mutex<usize>>,
|
||||||
|
|
||||||
|
last_request_time: Arc<Mutex<Instant>>,
|
||||||
|
min_request_interval_ms: u64,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl ChromeDriverPool {
|
impl ChromeDriverPool {
|
||||||
@@ -94,11 +98,12 @@ impl ChromeDriverPool {
|
|||||||
}
|
}
|
||||||
|
|
||||||
for i in 0..actual_pool_size {
|
for i in 0..actual_pool_size {
|
||||||
let proxy_url = proxy_pool
|
// Pass the entire proxy_pool and the index
|
||||||
.as_ref()
|
let instance = ChromeInstance::new(
|
||||||
.map(|pp| pp.get_proxy_url(i));
|
proxy_pool.clone(), // Clone the Arc
|
||||||
|
i, // This instance's proxy index
|
||||||
let instance = ChromeInstance::new(proxy_url, max_tasks_per_instance).await?;
|
max_tasks_per_instance
|
||||||
|
).await?;
|
||||||
|
|
||||||
crate::util::logger::log_info(&format!(" Instance {} ready", i + 1)).await;
|
crate::util::logger::log_info(&format!(" Instance {} ready", i + 1)).await;
|
||||||
instances.push(Arc::new(Mutex::new(instance)));
|
instances.push(Arc::new(Mutex::new(instance)));
|
||||||
@@ -110,18 +115,11 @@ impl ChromeDriverPool {
|
|||||||
proxy_pool,
|
proxy_pool,
|
||||||
rotation_enabled,
|
rotation_enabled,
|
||||||
next_instance: Arc::new(Mutex::new(0)),
|
next_instance: Arc::new(Mutex::new(0)),
|
||||||
|
last_request_time: Arc::new(Mutex::new(Instant::now())),
|
||||||
|
min_request_interval_ms: 300,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Execute a scraping task using an available instance from the pool.
|
|
||||||
///
|
|
||||||
/// When rotation is enabled:
|
|
||||||
/// - Uses only half of the instances at a time
|
|
||||||
/// - Rotates to the other half when an instance reaches its task limit
|
|
||||||
/// - Cycles through instances in round-robin fashion within the active half
|
|
||||||
///
|
|
||||||
/// When rotation is disabled:
|
|
||||||
/// - Uses all instances with random selection
|
|
||||||
pub async fn execute<T, F, Fut>(&self, url: String, parse: F) -> Result<T>
|
pub async fn execute<T, F, Fut>(&self, url: String, parse: F) -> Result<T>
|
||||||
where
|
where
|
||||||
T: Send + 'static,
|
T: Send + 'static,
|
||||||
@@ -130,108 +128,113 @@ impl ChromeDriverPool {
|
|||||||
{
|
{
|
||||||
let _permit = self.semaphore.acquire().await.map_err(|_| anyhow!("Pool closed"))?;
|
let _permit = self.semaphore.acquire().await.map_err(|_| anyhow!("Pool closed"))?;
|
||||||
|
|
||||||
let index = if self.rotation_enabled {
|
{
|
||||||
// Rotation mode: use only half of instances at a time
|
let mut last_time = self.last_request_time.lock().await;
|
||||||
let total_instances = self.instances.len();
|
let elapsed = last_time.elapsed().as_millis() as u64;
|
||||||
let half_size = (total_instances + 1) / 2; // Round up for odd numbers
|
|
||||||
|
|
||||||
let mut next_idx = self.next_instance.lock().await;
|
if elapsed < self.min_request_interval_ms {
|
||||||
let base_idx = *next_idx;
|
let wait_ms = self.min_request_interval_ms - elapsed;
|
||||||
let mut selected_idx = base_idx;
|
drop(last_time); // Lock vor Sleep freigeben!
|
||||||
let mut found_in_current_half = false;
|
|
||||||
|
|
||||||
// Try to find an available instance in the current half
|
|
||||||
for offset in 0..half_size {
|
|
||||||
let candidate_idx = (base_idx + offset) % half_size;
|
|
||||||
|
|
||||||
// Check if this instance has reached its task limit
|
sleep(Duration::from_millis(wait_ms)).await;
|
||||||
let instance = &self.instances[candidate_idx];
|
|
||||||
let guard = instance.lock().await;
|
|
||||||
|
|
||||||
if guard.max_tasks_per_instance == 0 ||
|
let mut last_time = self.last_request_time.lock().await;
|
||||||
guard.task_count < guard.max_tasks_per_instance {
|
*last_time = Instant::now();
|
||||||
// This instance is available
|
} else {
|
||||||
*next_idx = (candidate_idx + 1) % half_size;
|
*last_time = Instant::now();
|
||||||
selected_idx = candidate_idx;
|
|
||||||
found_in_current_half = true;
|
|
||||||
drop(guard);
|
|
||||||
break;
|
|
||||||
} else {
|
|
||||||
drop(guard);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if !found_in_current_half {
|
|
||||||
// All instances in current half are at limit, switch to other half
|
|
||||||
crate::util::logger::log_info(
|
|
||||||
"Current half saturated, rotating to other half of instances"
|
|
||||||
).await;
|
|
||||||
|
|
||||||
let other_half_start = half_size;
|
|
||||||
let other_half_size = total_instances - half_size;
|
|
||||||
|
|
||||||
// Find available instance in other half
|
|
||||||
let mut found_in_other_half = false;
|
|
||||||
for offset in 0..other_half_size {
|
|
||||||
let candidate_idx = other_half_start + offset;
|
|
||||||
|
|
||||||
let instance = &self.instances[candidate_idx];
|
|
||||||
let guard = instance.lock().await;
|
|
||||||
|
|
||||||
if guard.max_tasks_per_instance == 0 ||
|
|
||||||
guard.task_count < guard.max_tasks_per_instance {
|
|
||||||
// Switch to this half for future requests
|
|
||||||
*next_idx = offset;
|
|
||||||
selected_idx = candidate_idx;
|
|
||||||
found_in_other_half = true;
|
|
||||||
drop(guard);
|
|
||||||
break;
|
|
||||||
} else {
|
|
||||||
drop(guard);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if !found_in_other_half {
|
|
||||||
// All instances saturated - use round-robin anyway
|
|
||||||
selected_idx = *next_idx % total_instances;
|
|
||||||
*next_idx = (*next_idx + 1) % total_instances;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
drop(next_idx);
|
|
||||||
selected_idx
|
|
||||||
} else {
|
|
||||||
// Non-rotation mode: random selection as before
|
|
||||||
rand::random_range(..self.instances.len())
|
|
||||||
};
|
|
||||||
|
|
||||||
let instance = self.instances[index].clone();
|
|
||||||
let mut guard = instance.lock().await;
|
|
||||||
|
|
||||||
guard.increment_task_count();
|
|
||||||
|
|
||||||
if guard.max_tasks_per_instance > 0 {
|
|
||||||
crate::util::logger::log_info(&format!(
|
|
||||||
"Instance {} task count: {}/{}",
|
|
||||||
index,
|
|
||||||
guard.get_task_count(),
|
|
||||||
guard.max_tasks_per_instance
|
|
||||||
))
|
|
||||||
.await;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
let client = guard.new_session().await?;
|
// Index-Auswahl (vereinfacht, siehe unten für vollständige Rotation)
|
||||||
|
let index = if self.rotation_enabled {
|
||||||
|
self.get_rotated_index().await?
|
||||||
|
} else {
|
||||||
|
rand::rng().random_range(0..self.instances.len())
|
||||||
|
};
|
||||||
|
|
||||||
drop(guard); // release lock early
|
let instance = &self.instances[index];
|
||||||
|
let mut guard = instance.lock().await;
|
||||||
|
|
||||||
|
// NEU: Session mit automatischer Erneuerung holen!
|
||||||
|
let client = guard.get_or_renew_session().await?;
|
||||||
|
|
||||||
|
guard.increment_task_count();
|
||||||
|
let (task_count, session_requests) = guard.get_session_stats().await;
|
||||||
|
|
||||||
|
crate::util::logger::log_info(&format!(
|
||||||
|
"Instance {} executing task (tasks: {}/{}, session requests: {})",
|
||||||
|
index, task_count, guard.max_tasks_per_instance, session_requests
|
||||||
|
)).await;
|
||||||
|
|
||||||
|
drop(guard); // Lock freigeben vor Navigation
|
||||||
|
|
||||||
crate::util::logger::log_info(&format!("Scraping {} ...", url)).await;
|
// Navigation mit Timeout
|
||||||
client.goto(&url).await.context("Navigation failed")?;
|
let navigation_result = timeout(
|
||||||
|
Duration::from_secs(60),
|
||||||
|
client.goto(&url)
|
||||||
|
).await;
|
||||||
|
|
||||||
|
match navigation_result {
|
||||||
|
Ok(Ok(_)) => {
|
||||||
|
crate::util::logger::log_info(&format!("✓ Navigated to {}", url)).await;
|
||||||
|
|
||||||
|
// Parse-Funktion ausführen
|
||||||
|
parse(client).await
|
||||||
|
}
|
||||||
|
Ok(Err(e)) => {
|
||||||
|
crate::util::logger::log_error(&format!("Navigation failed: {}", e)).await;
|
||||||
|
Err(anyhow!("Navigation failed: {}", e))
|
||||||
|
}
|
||||||
|
Err(_) => {
|
||||||
|
crate::util::logger::log_error("Navigation timeout (60s)").await;
|
||||||
|
Err(anyhow!("Navigation timeout"))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
let result = timeout(Duration::from_secs(90), parse(client))
|
async fn get_rotated_index(&self) -> Result<usize> {
|
||||||
.await
|
let total = self.instances.len();
|
||||||
.context("Parse timeout")??;
|
let half_size = total / 2;
|
||||||
|
|
||||||
Ok(result)
|
if half_size == 0 {
|
||||||
|
return Ok(0); // Pool zu klein für Rotation
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut next_idx = self.next_instance.lock().await;
|
||||||
|
let current_half_start = if *next_idx < half_size { 0 } else { half_size };
|
||||||
|
let current_half_end = if *next_idx < half_size { half_size } else { total };
|
||||||
|
|
||||||
|
// Suche verfügbare Instanz in aktueller Hälfte
|
||||||
|
for offset in 0..(current_half_end - current_half_start) {
|
||||||
|
let candidate_idx = current_half_start + ((*next_idx + offset) % half_size);
|
||||||
|
|
||||||
|
let instance = &self.instances[candidate_idx];
|
||||||
|
let guard = instance.lock().await;
|
||||||
|
|
||||||
|
if guard.max_tasks_per_instance == 0 ||
|
||||||
|
guard.task_count < guard.max_tasks_per_instance {
|
||||||
|
*next_idx = (candidate_idx + 1) % total;
|
||||||
|
drop(guard);
|
||||||
|
return Ok(candidate_idx);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Aktuelle Hälfte voll → Zur anderen wechseln
|
||||||
|
crate::util::logger::log_info("Current half saturated, rotating to other half").await;
|
||||||
|
|
||||||
|
let new_half_start = if current_half_start == 0 { half_size } else { 0 };
|
||||||
|
let new_half_end = if current_half_start == 0 { total } else { half_size };
|
||||||
|
|
||||||
|
// Alte Hälfte zurücksetzen (für nächste Rotation)
|
||||||
|
for i in current_half_start..current_half_end {
|
||||||
|
let mut instance = self.instances[i].lock().await;
|
||||||
|
instance.reset_task_count();
|
||||||
|
}
|
||||||
|
|
||||||
|
*next_idx = new_half_start;
|
||||||
|
drop(next_idx);
|
||||||
|
|
||||||
|
Ok(new_half_start)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Gracefully shut down all ChromeDriver processes and Docker proxy containers.
|
/// Gracefully shut down all ChromeDriver processes and Docker proxy containers.
|
||||||
@@ -277,16 +280,24 @@ pub struct ChromeInstance {
|
|||||||
max_tasks_per_instance: usize,
|
max_tasks_per_instance: usize,
|
||||||
proxy_url: Option<String>,
|
proxy_url: Option<String>,
|
||||||
|
|
||||||
// NEU: Session-Management
|
current_session: Arc<Mutex<Option<Client>>>, // Current active session
|
||||||
current_session: Arc<Mutex<Option<Client>>>,
|
|
||||||
session_request_count: Arc<Mutex<usize>>,
|
session_request_count: Arc<Mutex<usize>>,
|
||||||
max_requests_per_session: usize, // z.B. 25
|
max_requests_per_session: usize, // z.B. 25
|
||||||
|
|
||||||
|
proxy_pool: Option<Arc<DockerVpnProxyPool>>, // Reference to the proxy pool
|
||||||
|
current_proxy_index: Arc<Mutex<usize>>, // Current proxy index in use
|
||||||
}
|
}
|
||||||
|
|
||||||
impl ChromeInstance {
|
impl ChromeInstance {
|
||||||
pub async fn new(proxy_url: Option<String>, max_tasks_per_instance: usize) -> Result<Self> {
|
pub async fn new(
|
||||||
|
proxy_pool: Option<Arc<DockerVpnProxyPool>>,
|
||||||
|
initial_proxy_index: usize,
|
||||||
|
max_tasks_per_instance: usize) -> Result<Self> {
|
||||||
let (base_url, process, stderr_handle) = Self::spawn_chromedriver().await?;
|
let (base_url, process, stderr_handle) = Self::spawn_chromedriver().await?;
|
||||||
|
|
||||||
|
// Get proxy URL if proxy pool is provided
|
||||||
|
let proxy_url = proxy_pool.as_ref().map(|pp| pp.get_proxy_url(initial_proxy_index));
|
||||||
|
|
||||||
Ok(Self {
|
Ok(Self {
|
||||||
base_url,
|
base_url,
|
||||||
process,
|
process,
|
||||||
@@ -294,10 +305,13 @@ impl ChromeInstance {
|
|||||||
task_count: 0,
|
task_count: 0,
|
||||||
max_tasks_per_instance,
|
max_tasks_per_instance,
|
||||||
proxy_url,
|
proxy_url,
|
||||||
// NEU
|
|
||||||
current_session: Arc::new(Mutex::new(None)),
|
current_session: Arc::new(Mutex::new(None)),
|
||||||
session_request_count: Arc::new(Mutex::new(0)),
|
session_request_count: Arc::new(Mutex::new(0)),
|
||||||
max_requests_per_session: 25, // Konfigurierbar machen!
|
max_requests_per_session: 25, // Konfigurierbar machen!
|
||||||
|
|
||||||
|
proxy_pool,
|
||||||
|
current_proxy_index: Arc::new(Mutex::new(initial_proxy_index)),
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -338,9 +352,24 @@ impl ChromeInstance {
|
|||||||
}
|
}
|
||||||
|
|
||||||
async fn create_fresh_session(&self) -> Result<Client> {
|
async fn create_fresh_session(&self) -> Result<Client> {
|
||||||
// WICHTIG: User-Agent hier wählen, nicht in chrome_args()!
|
// Hole aktuellen Proxy-URL ohne self zu mutieren
|
||||||
|
let proxy_url = if let Some(ref pool) = self.proxy_pool {
|
||||||
|
let mut proxy_idx = self.current_proxy_index.lock().await;
|
||||||
|
*proxy_idx = (*proxy_idx + 1) % pool.num_proxies();
|
||||||
|
let url = pool.get_proxy_url(*proxy_idx);
|
||||||
|
|
||||||
|
crate::util::logger::log_info(&format!(
|
||||||
|
"Using proxy {} for new session",
|
||||||
|
*proxy_idx
|
||||||
|
)).await;
|
||||||
|
|
||||||
|
Some(url)
|
||||||
|
} else {
|
||||||
|
self.proxy_url.clone()
|
||||||
|
};
|
||||||
|
|
||||||
let user_agent = Self::chrome_user_agent();
|
let user_agent = Self::chrome_user_agent();
|
||||||
let capabilities = self.chrome_args_with_ua(user_agent);
|
let capabilities = self.chrome_args_with_ua(user_agent, &proxy_url);
|
||||||
|
|
||||||
ClientBuilder::native()
|
ClientBuilder::native()
|
||||||
.capabilities(capabilities)
|
.capabilities(capabilities)
|
||||||
@@ -349,7 +378,7 @@ impl ChromeInstance {
|
|||||||
.context("Failed to connect to ChromeDriver")
|
.context("Failed to connect to ChromeDriver")
|
||||||
}
|
}
|
||||||
|
|
||||||
fn chrome_args_with_ua(&self, user_agent: &str) -> Map<String, Value> {
|
fn chrome_args_with_ua(&self, user_agent: &str, proxy_url: &Option<String>) -> Map<String, Value> {
|
||||||
let mut args = vec![
|
let mut args = vec![
|
||||||
"--headless=new".to_string(),
|
"--headless=new".to_string(),
|
||||||
"--disable-gpu".to_string(),
|
"--disable-gpu".to_string(),
|
||||||
@@ -364,11 +393,10 @@ impl ChromeInstance {
|
|||||||
"--disable-default-apps".to_string(),
|
"--disable-default-apps".to_string(),
|
||||||
"--disable-translate".to_string(),
|
"--disable-translate".to_string(),
|
||||||
"--disable-blink-features=AutomationControlled".to_string(),
|
"--disable-blink-features=AutomationControlled".to_string(),
|
||||||
// User-Agent als Parameter!
|
|
||||||
format!("--user-agent={}", user_agent),
|
format!("--user-agent={}", user_agent),
|
||||||
];
|
];
|
||||||
|
|
||||||
if let Some(ref proxy) = self.proxy_url {
|
if let Some(proxy) = proxy_url {
|
||||||
args.push(format!("--proxy-server={}", proxy));
|
args.push(format!("--proxy-server={}", proxy));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user