Files
WebScraper/src/corporate/page_validation.rs

180 lines
5.6 KiB
Rust

// src/corporate/page_validation.rs
//
// Utilities to ensure page state is correct before extraction
use anyhow::{anyhow, Result};
use fantoccini::Client;
use tokio::time::{sleep, Duration};
/// Confirms that the browser's current URL contains `expected_url_fragment`.
///
/// This guards against extracting data from a stale page when a navigation
/// fails silently. The current URL is polled up to `max_attempts` times, with
/// a 500 ms pause between polls.
///
/// # Errors
///
/// Returns an error if querying the current URL fails, or if none of the
/// polled URLs contained the fragment. The mismatch error reports the URL
/// observed by the final poll.
pub async fn verify_navigation(
    client: &Client,
    expected_url_fragment: &str,
    max_attempts: u32,
) -> Result<()> {
    // URL observed by the most recent poll. It is reused for the failure
    // message so the error reports exactly the value that failed the check,
    // instead of a later re-query that could differ (or fail on its own).
    let mut last_seen = None;

    for attempt in 1..=max_attempts {
        let current_url = client.current_url().await?;

        if current_url.as_str().contains(expected_url_fragment) {
            crate::util::logger::log_info(&format!(
                "✓ Navigation verified: {} (attempt {})",
                current_url.as_str(),
                attempt
            ))
            .await;
            return Ok(());
        }

        // Only wait when another poll is going to follow.
        if attempt < max_attempts {
            crate::util::logger::log_warn(&format!(
                "Navigation mismatch (attempt {}): expected '{}', got '{}'. Retrying...",
                attempt,
                expected_url_fragment,
                current_url.as_str()
            ))
            .await;
            sleep(Duration::from_millis(500)).await;
        }

        last_seen = Some(current_url);
    }

    // With `max_attempts == 0` nothing was polled, so query once purely to
    // have a URL to put in the error message.
    let last_seen = match last_seen {
        Some(url) => url,
        None => client.current_url().await?,
    };

    Err(anyhow!(
        "Navigation verification failed: expected URL containing '{}', but got '{}'",
        expected_url_fragment,
        last_seen.as_str()
    ))
}
/// Clears browser state by navigating to a blank page
///
/// Use this when a navigation fails or times out to ensure clean slate
pub async fn clear_browser_state(client: &Client) -> Result<()> {
crate::util::logger::log_info("Clearing browser state with about:blank").await;
// Navigate to blank page to clear any stale content
client.goto("about:blank").await?;
// Brief wait to ensure page clears
sleep(Duration::from_millis(200)).await;
Ok(())
}
/// Checks that the loaded page satisfies every entry in `content_checks`.
///
/// This is an extra safety check, run before extraction, that the page
/// actually loaded. Checks are evaluated in order and the first one that
/// fails aborts the validation.
///
/// # Errors
///
/// Returns an error if executing a script in the browser fails, if an
/// [`ContentCheck::ElementExists`] selector matches nothing, or if a
/// [`ContentCheck::TextContains`] string is absent from
/// `document.body.innerText`.
pub async fn verify_page_content(
    client: &Client,
    content_checks: Vec<ContentCheck>,
) -> Result<()> {
    for check in content_checks {
        match check {
            ContentCheck::ElementExists(selector) => {
                // The selector is passed as a script argument rather than
                // spliced into the JavaScript source. Quotes, backslashes
                // (CSS escapes such as `.a\:b`) and newlines therefore reach
                // `querySelector` unchanged and cannot alter the script.
                let exists = client
                    .execute(
                        "return !!document.querySelector(arguments[0]);",
                        vec![selector.as_str().into()],
                    )
                    .await?
                    .as_bool()
                    // A non-boolean script result is treated as "not found".
                    .unwrap_or(false);

                if !exists {
                    return Err(anyhow!(
                        "Expected element '{}' not found on page",
                        selector
                    ));
                }
            }
            ContentCheck::TextContains(text) => {
                let page_text = client
                    .execute("return document.body.innerText;", vec![])
                    .await?
                    .as_str()
                    // A non-string script result is treated as empty text.
                    .unwrap_or("")
                    .to_string();

                if !page_text.contains(&text) {
                    return Err(anyhow!(
                        "Expected text '{}' not found on page",
                        text
                    ));
                }
            }
        }
    }

    Ok(())
}
/// A single condition a loaded page must satisfy before extraction.
///
/// Derives `PartialEq`/`Eq` so checks can be compared directly, for example
/// with `assert_eq!` in tests.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ContentCheck {
    /// Require that at least one element matches this CSS selector.
    ElementExists(String),
    /// Require that the page body's text contains this string.
    TextContains(String),
}
/// Navigates to `url` under a time limit and verifies the resulting location.
///
/// The navigation is given `timeout_secs` seconds. If it completes, the
/// current URL is verified against `expected_url_fragment` (up to 3 polls).
/// If the navigation itself fails or times out, the browser state is cleared
/// on a best-effort basis before the error is returned.
///
/// # Errors
///
/// Returns an error if the navigation fails, if it exceeds the time limit, or
/// if the URL verification fails.
pub async fn navigate_with_validation(
    client: &Client,
    url: &str,
    expected_url_fragment: &str,
    timeout_secs: u64,
) -> Result<()> {
    let time_limit = Duration::from_secs(timeout_secs);

    match tokio::time::timeout(time_limit, client.goto(url)).await {
        // The time limit elapsed before the navigation finished.
        Err(_elapsed) => {
            crate::util::logger::log_error(&format!(
                "Navigation timeout after {}s. Clearing browser state...",
                timeout_secs
            ))
            .await;
            // Best effort: a failure to clear is deliberately ignored.
            clear_browser_state(client).await.ok();
            Err(anyhow!("Navigation timeout"))
        }
        // The navigation finished in time but reported an error.
        Ok(Err(nav_error)) => {
            crate::util::logger::log_error(&format!(
                "Navigation failed: {}. Clearing browser state...",
                nav_error
            ))
            .await;
            // Best effort: a failure to clear is deliberately ignored.
            clear_browser_state(client).await.ok();
            Err(anyhow!("Navigation failed: {}", nav_error))
        }
        // The navigation succeeded; make sure we landed on the expected page.
        Ok(Ok(())) => verify_navigation(client, expected_url_fragment, 3).await,
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Each `ContentCheck` variant keeps the payload it was constructed with.
    #[test]
    fn test_content_check_variants() {
        let element_check = ContentCheck::ElementExists("table".to_string());
        let text_check = ContentCheck::TextContains("Yahoo Finance".to_string());

        if let ContentCheck::ElementExists(selector) = element_check {
            assert_eq!(selector, "table");
        } else {
            panic!("Wrong variant");
        }

        if let ContentCheck::TextContains(needle) = text_check {
            assert_eq!(needle, "Yahoo Finance");
        } else {
            panic!("Wrong variant");
        }
    }
}