// src/corporate/page_validation.rs // // Utilities to ensure page state is correct before extraction use anyhow::{anyhow, Result}; use fantoccini::Client; use tokio::time::{sleep, Duration}; /// Validates that the browser navigated to the expected URL /// /// This prevents extracting data from a stale page when navigation fails silently pub async fn verify_navigation( client: &Client, expected_url_fragment: &str, max_attempts: u32, ) -> Result<()> { for attempt in 1..=max_attempts { let current_url = client.current_url().await?; let current = current_url.as_str(); if current.contains(expected_url_fragment) { crate::util::logger::log_info(&format!( "✓ Navigation verified: {} (attempt {})", current, attempt )).await; return Ok(()); } if attempt < max_attempts { crate::util::logger::log_warn(&format!( "Navigation mismatch (attempt {}): expected '{}', got '{}'. Retrying...", attempt, expected_url_fragment, current )).await; sleep(Duration::from_millis(500)).await; } } let current_url = client.current_url().await?; Err(anyhow!( "Navigation verification failed: expected URL containing '{}', but got '{}'", expected_url_fragment, current_url.as_str() )) } /// Clears browser state by navigating to a blank page /// /// Use this when a navigation fails or times out to ensure clean slate pub async fn clear_browser_state(client: &Client) -> Result<()> { crate::util::logger::log_info("Clearing browser state with about:blank").await; // Navigate to blank page to clear any stale content client.goto("about:blank").await?; // Brief wait to ensure page clears sleep(Duration::from_millis(200)).await; Ok(()) } /// Validates that expected content exists on the page before extraction /// /// This adds an extra safety check that the page actually loaded pub async fn verify_page_content( client: &Client, content_checks: Vec, ) -> Result<()> { for check in content_checks { match check { ContentCheck::ElementExists(selector) => { let exists: bool = client .execute( &format!( "return !!document.querySelector('{}');", selector.replace("'", "\\'") ), vec![], ) .await? .as_bool() .unwrap_or(false); if !exists { return Err(anyhow!( "Expected element '{}' not found on page", selector )); } } ContentCheck::TextContains(text) => { let page_text: String = client .execute("return document.body.innerText;", vec![]) .await? .as_str() .unwrap_or("") .to_string(); if !page_text.contains(&text) { return Err(anyhow!( "Expected text '{}' not found on page", text )); } } } } Ok(()) } #[derive(Debug, Clone)] pub enum ContentCheck { /// Verify that a CSS selector exists ElementExists(String), /// Verify that page body contains text TextContains(String), } /// Safe navigation wrapper that validates and clears state on failure pub async fn navigate_with_validation( client: &Client, url: &str, expected_url_fragment: &str, timeout_secs: u64, ) -> Result<()> { use tokio::time::timeout; // Attempt navigation with timeout let nav_result = timeout( Duration::from_secs(timeout_secs), client.goto(url) ).await; match nav_result { Ok(Ok(_)) => { // Navigation succeeded, verify we're on correct page verify_navigation(client, expected_url_fragment, 3).await?; Ok(()) } Ok(Err(e)) => { // Navigation failed - clear state before returning error crate::util::logger::log_error(&format!( "Navigation failed: {}. Clearing browser state...", e )).await; clear_browser_state(client).await.ok(); // Best effort Err(anyhow!("Navigation failed: {}", e)) } Err(_) => { // Navigation timed out - clear state before returning error crate::util::logger::log_error(&format!( "Navigation timeout after {}s. Clearing browser state...", timeout_secs )).await; clear_browser_state(client).await.ok(); // Best effort Err(anyhow!("Navigation timeout")) } } } #[cfg(test)] mod tests { use super::*; #[test] fn test_content_check_variants() { let check1 = ContentCheck::ElementExists("table".to_string()); let check2 = ContentCheck::TextContains("Yahoo Finance".to_string()); match check1 { ContentCheck::ElementExists(sel) => assert_eq!(sel, "table"), _ => panic!("Wrong variant"), } match check2 { ContentCheck::TextContains(text) => assert_eq!(text, "Yahoo Finance"), _ => panic!("Wrong variant"), } } }