Added atomic writer action for Ctrl+C abort
This commit is contained in:
180
src/corporate/page_validation.rs
Normal file
180
src/corporate/page_validation.rs
Normal file
@@ -0,0 +1,180 @@
|
||||
// src/corporate/page_validation.rs
|
||||
//
|
||||
// Utilities to ensure page state is correct before extraction
|
||||
|
||||
use anyhow::{anyhow, Result};
|
||||
use fantoccini::Client;
|
||||
use tokio::time::{sleep, Duration};
|
||||
|
||||
/// Validates that the browser navigated to the expected URL
|
||||
///
|
||||
/// This prevents extracting data from a stale page when navigation fails silently
|
||||
pub async fn verify_navigation(
|
||||
client: &Client,
|
||||
expected_url_fragment: &str,
|
||||
max_attempts: u32,
|
||||
) -> Result<()> {
|
||||
for attempt in 1..=max_attempts {
|
||||
let current_url = client.current_url().await?;
|
||||
let current = current_url.as_str();
|
||||
|
||||
if current.contains(expected_url_fragment) {
|
||||
crate::util::logger::log_info(&format!(
|
||||
"✓ Navigation verified: {} (attempt {})",
|
||||
current, attempt
|
||||
)).await;
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
if attempt < max_attempts {
|
||||
crate::util::logger::log_warn(&format!(
|
||||
"Navigation mismatch (attempt {}): expected '{}', got '{}'. Retrying...",
|
||||
attempt, expected_url_fragment, current
|
||||
)).await;
|
||||
sleep(Duration::from_millis(500)).await;
|
||||
}
|
||||
}
|
||||
|
||||
let current_url = client.current_url().await?;
|
||||
Err(anyhow!(
|
||||
"Navigation verification failed: expected URL containing '{}', but got '{}'",
|
||||
expected_url_fragment,
|
||||
current_url.as_str()
|
||||
))
|
||||
}
|
||||
|
||||
/// Clears browser state by navigating to a blank page
|
||||
///
|
||||
/// Use this when a navigation fails or times out to ensure clean slate
|
||||
pub async fn clear_browser_state(client: &Client) -> Result<()> {
|
||||
crate::util::logger::log_info("Clearing browser state with about:blank").await;
|
||||
|
||||
// Navigate to blank page to clear any stale content
|
||||
client.goto("about:blank").await?;
|
||||
|
||||
// Brief wait to ensure page clears
|
||||
sleep(Duration::from_millis(200)).await;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Validates that expected content exists on the page before extraction
|
||||
///
|
||||
/// This adds an extra safety check that the page actually loaded
|
||||
pub async fn verify_page_content(
|
||||
client: &Client,
|
||||
content_checks: Vec<ContentCheck>,
|
||||
) -> Result<()> {
|
||||
for check in content_checks {
|
||||
match check {
|
||||
ContentCheck::ElementExists(selector) => {
|
||||
let exists: bool = client
|
||||
.execute(
|
||||
&format!(
|
||||
"return !!document.querySelector('{}');",
|
||||
selector.replace("'", "\\'")
|
||||
),
|
||||
vec![],
|
||||
)
|
||||
.await?
|
||||
.as_bool()
|
||||
.unwrap_or(false);
|
||||
|
||||
if !exists {
|
||||
return Err(anyhow!(
|
||||
"Expected element '{}' not found on page",
|
||||
selector
|
||||
));
|
||||
}
|
||||
}
|
||||
ContentCheck::TextContains(text) => {
|
||||
let page_text: String = client
|
||||
.execute("return document.body.innerText;", vec![])
|
||||
.await?
|
||||
.as_str()
|
||||
.unwrap_or("")
|
||||
.to_string();
|
||||
|
||||
if !page_text.contains(&text) {
|
||||
return Err(anyhow!(
|
||||
"Expected text '{}' not found on page",
|
||||
text
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// A single assertion about the loaded page, evaluated by
/// `verify_page_content`.
///
/// Two checks are equal when they are the same variant holding the same
/// string.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ContentCheck {
    /// Verify that at least one element matches this CSS selector.
    ElementExists(String),
    /// Verify that the page body text contains this string.
    TextContains(String),
}
|
||||
|
||||
/// Safe navigation wrapper that validates and clears state on failure
|
||||
pub async fn navigate_with_validation(
|
||||
client: &Client,
|
||||
url: &str,
|
||||
expected_url_fragment: &str,
|
||||
timeout_secs: u64,
|
||||
) -> Result<()> {
|
||||
use tokio::time::timeout;
|
||||
|
||||
// Attempt navigation with timeout
|
||||
let nav_result = timeout(
|
||||
Duration::from_secs(timeout_secs),
|
||||
client.goto(url)
|
||||
).await;
|
||||
|
||||
match nav_result {
|
||||
Ok(Ok(_)) => {
|
||||
// Navigation succeeded, verify we're on correct page
|
||||
verify_navigation(client, expected_url_fragment, 3).await?;
|
||||
Ok(())
|
||||
}
|
||||
Ok(Err(e)) => {
|
||||
// Navigation failed - clear state before returning error
|
||||
crate::util::logger::log_error(&format!(
|
||||
"Navigation failed: {}. Clearing browser state...",
|
||||
e
|
||||
)).await;
|
||||
clear_browser_state(client).await.ok(); // Best effort
|
||||
Err(anyhow!("Navigation failed: {}", e))
|
||||
}
|
||||
Err(_) => {
|
||||
// Navigation timed out - clear state before returning error
|
||||
crate::util::logger::log_error(&format!(
|
||||
"Navigation timeout after {}s. Clearing browser state...",
|
||||
timeout_secs
|
||||
)).await;
|
||||
clear_browser_state(client).await.ok(); // Best effort
|
||||
Err(anyhow!("Navigation timeout"))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Each `ContentCheck` variant keeps the string it was constructed with.
    #[test]
    fn test_content_check_variants() {
        let element_check = ContentCheck::ElementExists("table".to_string());
        let text_check = ContentCheck::TextContains("Yahoo Finance".to_string());

        if let ContentCheck::ElementExists(selector) = element_check {
            assert_eq!(selector, "table");
        } else {
            panic!("Wrong variant");
        }

        if let ContentCheck::TextContains(needle) = text_check {
            assert_eq!(needle, "Yahoo Finance");
        } else {
            panic!("Wrong variant");
        }
    }
}
|
||||
Reference in New Issue
Block a user