From c51b36c125d2f9444a11f30d90763abce1a35bf2 Mon Sep 17 00:00:00 2001
From: donpat1to
Date: Thu, 18 Dec 2025 14:01:51 +0100
Subject: [PATCH] added session detection with requests per task

---
NOTE(review): this is a revised version of the patch. Relative to the commit
named above, get_or_renew_session() now counts the request that receives a
freshly created session (the counter restarts at 1, not 0), so a session is
handed out at most max_requests_per_session times; the reuse check no longer
needs unwrap(); comments are in English and the new methods have doc comments.
Hunk sizes and the diffstat were updated to match; the blob ids on the
"index" line still describe the original commit.
Generic type arguments that were missing from this copy (Option<String>,
Result<Client>, Arc<Mutex<...>>, Map<String, Value>) and all indentation were
restored by inference - confirm against src/scraper/webdriver.rs before
applying.

 src/scraper/webdriver.rs | 121 +++++++++++++++++++++++++++++++++++++--
 1 file changed, 118 insertions(+), 3 deletions(-)

diff --git a/src/scraper/webdriver.rs b/src/scraper/webdriver.rs
index 0c1093e..5a744b4 100644
--- a/src/scraper/webdriver.rs
+++ b/src/scraper/webdriver.rs
@@ -276,6 +276,11 @@ pub struct ChromeInstance {
     task_count: usize,
     max_tasks_per_instance: usize,
     proxy_url: Option<String>,
+
+    // NEW: session management
+    current_session: Arc<Mutex<Option<Client>>>,
+    session_request_count: Arc<Mutex<usize>>,
+    max_requests_per_session: usize, // e.g. 25
 }
 
 impl ChromeInstance {
@@ -289,17 +294,128 @@ impl ChromeInstance {
             task_count: 0,
             max_tasks_per_instance,
             proxy_url,
+            // NEW: no session yet; the first get_or_renew_session() call creates one.
+            current_session: Arc::new(Mutex::new(None)),
+            session_request_count: Arc::new(Mutex::new(0)),
+            max_requests_per_session: 25, // TODO: make configurable
         })
     }
 
-    pub async fn new_session(&self) -> Result<Client> {
+    /// Returns the shared session, renewing it once its request budget is used up.
+    ///
+    /// A new session is created when none exists yet or when the current one has
+    /// already been handed out `max_requests_per_session` times. Otherwise a clone
+    /// of the current session is returned and its request counter is incremented.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if the replacement session cannot be created.
+    pub async fn get_or_renew_session(&self) -> Result<Client> {
+        let mut session_opt = self.current_session.lock().await;
+        let mut request_count = self.session_request_count.lock().await;
+
+        // Fast path: reuse the existing session while it is under the request limit.
+        if let Some(session) = session_opt.as_ref() {
+            if *request_count < self.max_requests_per_session {
+                *request_count += 1;
+                return Ok(session.clone());
+            }
+        }
+
+        // Close the old session, if any, before opening its replacement.
+        if let Some(old_session) = session_opt.take() {
+            crate::util::logger::log_info("Closing old session").await;
+            // The close result is deliberately ignored so a failed close does not block renewal.
+            let _ = old_session.close().await;
+            // Short randomized pause between sessions.
+            sleep(Duration::from_millis(rand::rng().random_range(500..1000))).await;
+        }
+
+        // Create the new session with a fresh user agent.
+        crate::util::logger::log_info(&format!(
+            "Creating new session (requests in last session: {})",
+            *request_count
+        )).await;
+
+        let new_session = self.create_fresh_session().await?;
+        *session_opt = Some(new_session.clone());
+        // Handing out the new session is its first request, so the count starts at 1;
+        // starting at 0 let each session serve one request more than the limit.
+        *request_count = 1;
+
+        Ok(new_session)
+    }
+
+    /// Opens a new session on the ChromeDriver at `base_url`, using capabilities
+    /// built around a user agent from `chrome_user_agent()`.
+    async fn create_fresh_session(&self) -> Result<Client> {
+        // IMPORTANT: choose the user agent here, not in chrome_args()!
+        let user_agent = Self::chrome_user_agent();
+        let capabilities = self.chrome_args_with_ua(user_agent);
+
         ClientBuilder::native()
-            .capabilities(self.chrome_args())
+            .capabilities(capabilities)
             .connect(&self.base_url)
             .await
             .context("Failed to connect to ChromeDriver")
     }
 
+    /// Builds the Chrome capabilities map with `user_agent` passed as `--user-agent`
+    /// and, when `proxy_url` is set, the proxy passed as `--proxy-server`.
+    fn chrome_args_with_ua(&self, user_agent: &str) -> Map<String, Value> {
+        let mut args = vec![
+            "--headless=new".to_string(),
+            "--disable-gpu".to_string(),
+            "--no-sandbox".to_string(),
+            "--disable-dev-shm-usage".to_string(),
+            "--disable-infobars".to_string(),
+            "--disable-extensions".to_string(),
+            "--disable-popup-blocking".to_string(),
+            "--disable-notifications".to_string(),
+            "--disable-autofill".to_string(),
+            "--disable-sync".to_string(),
+            "--disable-default-apps".to_string(),
+            "--disable-translate".to_string(),
+            "--disable-blink-features=AutomationControlled".to_string(),
+            // The user agent is supplied by the caller.
+            format!("--user-agent={}", user_agent),
+        ];
+
+        if let Some(ref proxy) = self.proxy_url {
+            args.push(format!("--proxy-server={}", proxy));
+        }
+
+        let caps = serde_json::json!({
+            "goog:chromeOptions": {
+                "args": args,
+                "excludeSwitches": ["enable-logging", "enable-automation"],
+                "prefs": {
+                    "profile.default_content_setting_values.notifications": 2
+                }
+            }
+        });
+        // `json!` was given an object literal, so `as_object()` cannot be `None` here.
+        caps.as_object().cloned().unwrap()
+    }
+
+    /// Opens a fresh session without touching the shared session or its request counter.
+    ///
+    /// Kept for backward compatibility; callers should prefer `get_or_renew_session()`.
+    pub async fn new_session(&self) -> Result<Client> {
+        self.create_fresh_session().await
+    }
+
+    /// Resets the task counter to zero.
+    pub fn reset_task_count(&mut self) {
+        self.task_count = 0;
+    }
+
+    /// Returns `(task_count, requests counted for the current session)`.
+    pub async fn get_session_stats(&self) -> (usize, usize) {
+        let request_count = *self.session_request_count.lock().await;
+        (self.task_count, request_count)
+    }
+
     pub fn increment_task_count(&mut self) {
         self.task_count += 1;
     }
@@ -402,7 +518,6 @@ impl ChromeInstance {
         caps.as_object().cloned().unwrap()
     }
 
-
     pub fn chrome_user_agent() -> &'static str {
         static UAS: &[&str] = &[
             "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.91 Safari/537.36",