// src/corporate/page_validation.rs
//
// Utilities to ensure page state is correct before extraction
|
|
use anyhow::{anyhow, Result};
|
|
use fantoccini::Client;
|
|
use tokio::time::{sleep, Duration};
|
|
|
|
/// Validates that the browser navigated to the expected URL
|
|
///
|
|
/// This prevents extracting data from a stale page when navigation fails silently
|
|
pub async fn verify_navigation(
|
|
client: &Client,
|
|
expected_url_fragment: &str,
|
|
max_attempts: u32,
|
|
) -> Result<()> {
|
|
for attempt in 1..=max_attempts {
|
|
let current_url = client.current_url().await?;
|
|
let current = current_url.as_str();
|
|
|
|
if current.contains(expected_url_fragment) {
|
|
crate::util::logger::log_info(&format!(
|
|
"✓ Navigation verified: {} (attempt {})",
|
|
current, attempt
|
|
)).await;
|
|
return Ok(());
|
|
}
|
|
|
|
if attempt < max_attempts {
|
|
crate::util::logger::log_warn(&format!(
|
|
"Navigation mismatch (attempt {}): expected '{}', got '{}'. Retrying...",
|
|
attempt, expected_url_fragment, current
|
|
)).await;
|
|
sleep(Duration::from_millis(500)).await;
|
|
}
|
|
}
|
|
|
|
let current_url = client.current_url().await?;
|
|
Err(anyhow!(
|
|
"Navigation verification failed: expected URL containing '{}', but got '{}'",
|
|
expected_url_fragment,
|
|
current_url.as_str()
|
|
))
|
|
}
|
|
|
|
/// Clears browser state by navigating to a blank page
|
|
///
|
|
/// Use this when a navigation fails or times out to ensure clean slate
|
|
pub async fn clear_browser_state(client: &Client) -> Result<()> {
|
|
crate::util::logger::log_info("Clearing browser state with about:blank").await;
|
|
|
|
// Navigate to blank page to clear any stale content
|
|
client.goto("about:blank").await?;
|
|
|
|
// Brief wait to ensure page clears
|
|
sleep(Duration::from_millis(200)).await;
|
|
|
|
Ok(())
|
|
}
|
|
|
|
/// Validates that expected content exists on the page before extraction
|
|
///
|
|
/// This adds an extra safety check that the page actually loaded
|
|
pub async fn verify_page_content(
|
|
client: &Client,
|
|
content_checks: Vec<ContentCheck>,
|
|
) -> Result<()> {
|
|
for check in content_checks {
|
|
match check {
|
|
ContentCheck::ElementExists(selector) => {
|
|
let exists: bool = client
|
|
.execute(
|
|
&format!(
|
|
"return !!document.querySelector('{}');",
|
|
selector.replace("'", "\\'")
|
|
),
|
|
vec![],
|
|
)
|
|
.await?
|
|
.as_bool()
|
|
.unwrap_or(false);
|
|
|
|
if !exists {
|
|
return Err(anyhow!(
|
|
"Expected element '{}' not found on page",
|
|
selector
|
|
));
|
|
}
|
|
}
|
|
ContentCheck::TextContains(text) => {
|
|
let page_text: String = client
|
|
.execute("return document.body.innerText;", vec![])
|
|
.await?
|
|
.as_str()
|
|
.unwrap_or("")
|
|
.to_string();
|
|
|
|
if !page_text.contains(&text) {
|
|
return Err(anyhow!(
|
|
"Expected text '{}' not found on page",
|
|
text
|
|
));
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
Ok(())
|
|
}
|
|
|
|
/// A single condition that [`verify_page_content`] checks against the loaded page.
///
/// Derives `PartialEq`/`Eq` so checks can be compared and asserted on directly.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ContentCheck {
    /// Verify that a CSS selector exists (matches an element on the page).
    ElementExists(String),
    /// Verify that page body contains text.
    TextContains(String),
}
|
|
|
|
/// Safe navigation wrapper that validates and clears state on failure
|
|
pub async fn navigate_with_validation(
|
|
client: &Client,
|
|
url: &str,
|
|
expected_url_fragment: &str,
|
|
timeout_secs: u64,
|
|
) -> Result<()> {
|
|
use tokio::time::timeout;
|
|
|
|
// Attempt navigation with timeout
|
|
let nav_result = timeout(
|
|
Duration::from_secs(timeout_secs),
|
|
client.goto(url)
|
|
).await;
|
|
|
|
match nav_result {
|
|
Ok(Ok(_)) => {
|
|
// Navigation succeeded, verify we're on correct page
|
|
verify_navigation(client, expected_url_fragment, 3).await?;
|
|
Ok(())
|
|
}
|
|
Ok(Err(e)) => {
|
|
// Navigation failed - clear state before returning error
|
|
crate::util::logger::log_error(&format!(
|
|
"Navigation failed: {}. Clearing browser state...",
|
|
e
|
|
)).await;
|
|
clear_browser_state(client).await.ok(); // Best effort
|
|
Err(anyhow!("Navigation failed: {}", e))
|
|
}
|
|
Err(_) => {
|
|
// Navigation timed out - clear state before returning error
|
|
crate::util::logger::log_error(&format!(
|
|
"Navigation timeout after {}s. Clearing browser state...",
|
|
timeout_secs
|
|
)).await;
|
|
clear_browser_state(client).await.ok(); // Best effort
|
|
Err(anyhow!("Navigation timeout"))
|
|
}
|
|
}
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Each `ContentCheck` variant keeps the string it was constructed with.
    #[test]
    fn test_content_check_variants() {
        let element_check = ContentCheck::ElementExists(String::from("table"));
        let text_check = ContentCheck::TextContains(String::from("Yahoo Finance"));

        if let ContentCheck::ElementExists(sel) = element_check {
            assert_eq!(sel, "table");
        } else {
            panic!("Wrong variant");
        }

        if let ContentCheck::TextContains(text) = text_check {
            assert_eq!(text, "Yahoo Finance");
        } else {
            panic!("Wrong variant");
        }
    }
}