Added atomic writer action for Ctrl-C abort

2025-12-19 14:12:56 +01:00
parent cd91de253b
commit b366f366e6
26 changed files with 3317 additions and 666 deletions


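This commit threads an `Arc<AtomicBool>` shutdown flag through the Yahoo scraping tasks so in-flight work can bail out on Ctrl-C. As a rough sketch only (not part of the diff below), the flag could be flipped from a Ctrl-C handler along these lines; the function name is illustrative and tokio's signal feature is assumed to be enabled:

```rust
use std::sync::Arc;
use std::sync::atomic::{AtomicBool, Ordering};

// Hypothetical wiring (not in this commit): flip the shared shutdown flag
// once Ctrl-C arrives so running scrape tasks can check it and abort.
fn spawn_ctrl_c_watcher(shutdown_flag: Arc<AtomicBool>) {
    tokio::spawn(async move {
        // tokio::signal::ctrl_c() resolves the first time SIGINT / Ctrl-C is received.
        if tokio::signal::ctrl_c().await.is_ok() {
            shutdown_flag.store(true, Ordering::SeqCst);
        }
    });
}
```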
@@ -1,18 +1,15 @@
// src/corporate/yahoo.rs
use super::{types::*, helpers::*};
// src/corporate/yahoo.rs - UPDATED WITH DATA INTEGRITY FIXES
use super::{types::*, helpers::*, page_validation::*};
use crate::{scraper::webdriver::*, util::{directories::DataPaths}};
use event_backtest_engine::logger;
use crate::logger;
use fantoccini::{Client, Locator};
use rand::Rng;
use serde::{Deserialize, Serialize};
use tokio::time::{Duration as TokioDuration, sleep, timeout};
use std::{sync::Arc};
use std::{sync::Arc, sync::atomic::{AtomicBool, Ordering}};
use anyhow::{anyhow, Result};
const YAHOO_COMPANY_EXTRACTION_JS: &str = include_str!("yahoo_company_extraction.js");
/// Mapping existing
/// Gets historical stock price data: daily (xxxx - 2025) and hourly (last 30 days)
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum YahooTickerResult {
@@ -66,29 +63,137 @@ impl YahooTickerResult {
}
}
/// UPDATED: Scrape company details with full validation and shutdown support
pub async fn scrape_company_details_by_isin(
pool: &Arc<ChromeDriverPool>,
isin: &str,
shutdown_flag: &Arc<AtomicBool>,
) -> anyhow::Result<Option<YahooCompanyDetails>> {
let isin = isin.to_string();
pool.execute(format!("https://finance.yahoo.com/lookup/?s={}", isin), move |client| {
let isin = isin.clone();
// Check shutdown before starting
if shutdown_flag.load(Ordering::SeqCst) {
logger::log_warn(&format!("Shutdown detected, skipping ISIN: {}", isin)).await;
return Ok(None);
}
let isin_owned = isin.to_string();
let shutdown_clone = Arc::clone(shutdown_flag);
let url = format!("https://finance.yahoo.com/lookup/?s={}", isin);
pool.execute(url.clone(), move |client| {
let isin = isin_owned.clone();
let shutdown = shutdown_clone.clone();
Box::pin(async move {
// Random Delay between 800-1500ms
// Check shutdown during task execution
if shutdown.load(Ordering::SeqCst) {
return Err(anyhow!("Task aborted due to shutdown"));
}
// Random delay
let delay = rand::rng().random_range(800..1500);
sleep(TokioDuration::from_millis(delay)).await;
// Reject cookies
reject_yahoo_cookies(&client).await?;
// Random Delay
// Check shutdown again
if shutdown.load(Ordering::SeqCst) {
return Err(anyhow!("Task aborted due to shutdown"));
}
// CRITICAL: Validate navigation succeeded
let expected_fragment = format!("lookup/?s={}", isin);
match verify_navigation(&client, &expected_fragment, 5).await {
Ok(_) => {
logger::log_info(&format!("✓ Navigation validated for ISIN: {}", isin)).await;
}
Err(e) => {
logger::log_error(&format!(
"Navigation verification failed for ISIN {}: {}",
isin, e
)).await;
// Clear browser state before returning error
clear_browser_state(&client).await.ok();
return Err(e);
}
}
// Additional content validation
let page_ready: bool = client
.execute(
r#"
const table = document.querySelector('#main-content-wrapper > section > section.container.yf-1omxedn > div.tableContainer.yf-1omxedn > div > table');
const noData = document.querySelector('#main-content-wrapper > section > div.noData.yf-1omxedn');
return !!(table || noData);
"#,
vec![],
)
.await?
.as_bool()
.unwrap_or(false);
if !page_ready {
logger::log_error(&format!(
"Page content not ready for ISIN {} - neither table nor no-data element found",
isin
)).await;
clear_browser_state(&client).await.ok();
return Err(anyhow!("Page content not ready"));
}
logger::log_info(&format!("✓ Page content validated for ISIN: {}", isin)).await;
// Check shutdown before extraction
if shutdown.load(Ordering::SeqCst) {
return Err(anyhow!("Task aborted due to shutdown"));
}
// Random delay before extraction
let delay = rand::rng().random_range(800..1500);
sleep(TokioDuration::from_millis(delay)).await;
extract_company_details(&client, &isin).await
// Now safe to extract
extract_company_details_validated(&client, &isin).await
})
}).await
}
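A minimal caller sketch for the updated entry point, assuming a `ChromeDriverPool` has already been constructed elsewhere (the ISIN value and helper name are illustrative):

```rust
// Hypothetical usage; pool construction and Ctrl-C wiring are assumed to exist.
async fn lookup_one_isin(pool: &Arc<ChromeDriverPool>) -> anyhow::Result<()> {
    let shutdown_flag = Arc::new(AtomicBool::new(false));
    match scrape_company_details_by_isin(pool, "US0378331005", &shutdown_flag).await? {
        Some(details) => {
            logger::log_info(&format!("Found ticker {}", details.ticker)).await;
        }
        None => {
            logger::log_info("No ticker found for this ISIN").await;
        }
    }
    Ok(())
}
```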
/// UPDATED: Extract with additional URL validation
async fn extract_company_details_validated(
client: &Client,
isin: &str,
) -> Result<Option<YahooCompanyDetails>> {
// Double-check URL is still correct before extraction
let current_url = client.current_url().await?;
if !current_url.as_str().contains(isin) {
logger::log_error(&format!(
"URL mismatch before extraction: expected ISIN '{}' in URL, got '{}'",
isin,
current_url.as_str()
)).await;
clear_browser_state(client).await.ok();
return Err(anyhow!("URL mismatch - possible stale page"));
}
// Run extraction
let result = extract_company_details(client, isin).await?;
// Validate extraction result
if let Some(ref details) = result {
logger::log_info(&format!(
"✓ Extracted ticker '{}' for ISIN {} (sector: {:?}, exchange: {:?})",
details.ticker, isin, details.sector, details.exchange
)).await;
} else {
logger::log_info(&format!(
"No ticker found for ISIN {} (legitimately not found)",
isin
)).await;
}
Ok(result)
}
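`verify_navigation` and `clear_browser_state` come from the new `page_validation` module, which is not shown in this diff. Inferred purely from the call sites above, the URL check presumably behaves roughly like the sketch below; the signature, retry spacing, and name suffix are assumptions:

```rust
// Sketch only, inferred from verify_navigation(&client, &expected_fragment, 5);
// the real helper lives in super::page_validation and may differ.
async fn verify_navigation_sketch(
    client: &Client,
    expected_fragment: &str,
    max_attempts: u32,
) -> anyhow::Result<()> {
    for attempt in 1..=max_attempts {
        let url = client.current_url().await?;
        if url.as_str().contains(expected_fragment) {
            return Ok(());
        }
        // The page may still be redirecting; back off briefly before re-checking.
        sleep(TokioDuration::from_millis(500 * attempt as u64)).await;
    }
    Err(anyhow!("URL never contained expected fragment '{}'", expected_fragment))
}
```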
pub async fn extract_company_details(
client: &Client,
_isin: &str,
@@ -153,17 +258,13 @@ pub async fn extract_company_details(
// Parse the JSON result
let extraction: ExtractionResult = serde_json::from_value(result.clone())
.map_err(|e| {
// Log the problematic result value for debugging
let result_str = serde_json::to_string_pretty(&result).unwrap_or_else(|_| format!("{:?}", result));
anyhow!("Failed to parse extraction result: {}. Raw result: {}", e, result_str)
})?;
match extraction.status.as_str() {
"found" => {
// Ticker is guaranteed to be present when status is "found"
// Sector and exchange are optional
if let Some(ticker) = extraction.ticker {
// Log metadata if available
if let Some(ref metadata) = extraction.metadata {
logger::log_info(&format!(
"Selected row {} with {} valid fields out of {} total rows",
@@ -179,13 +280,11 @@ pub async fn extract_company_details(
exchange: extraction.exchange,
}))
} else {
// This shouldn't happen if JS script is working correctly
Err(anyhow!("Status 'found' but no ticker present"))
}
},
"no_results" => Ok(None),
"error" => {
// Error status means ticker was not found or extraction failed
let error_msg = extraction.error_message.unwrap_or_else(|| "Unknown error".to_string());
Err(anyhow!("JavaScript extraction error: {}", error_msg))
},
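The `ExtractionResult` struct deserialized above is defined elsewhere in the module; judging only from the fields this function reads, it plausibly has a shape like the following (field types, optionality, and the metadata payload are assumptions):

```rust
// Rough shape inferred from usage in extract_company_details; not the actual definition.
#[derive(Debug, Deserialize)]
struct ExtractionResultSketch {
    status: String,                      // "found" | "no_results" | "error"
    ticker: Option<String>,              // expected to be Some when status == "found"
    sector: Option<String>,
    exchange: Option<String>,
    error_message: Option<String>,       // populated when status == "error"
    metadata: Option<serde_json::Value>, // row-selection details used only for logging
}
```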
@@ -207,19 +306,6 @@ pub async fn get_all_tickers_from_companies_jsonl(paths: &DataPaths) -> anyhow::
Ok(tickers)
}
/// Fetches earnings events for a ticker using a dedicated ScrapeTask.
///
/// This function creates and executes a ScrapeTask to navigate to the Yahoo Finance earnings calendar,
/// reject cookies, and extract the events.
///
/// # Arguments
/// * `ticker` - The stock ticker symbol.
///
/// # Returns
/// A vector of CompanyEvent structs on success.
///
/// # Errors
/// Returns an error if the task execution fails, e.g., chromedriver spawn or navigation issues.
pub async fn fetch_earnings_with_pool(
pool: &Arc<ChromeDriverPool>,
ticker: &str,
@@ -238,40 +324,6 @@ pub async fn fetch_earnings_with_pool(
}).await
}
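A minimal caller sketch for the pool-based earnings fetch, assuming the same pool as above; the return type is taken from the doc comment this commit removes:

```rust
// Hypothetical usage; the ticker would normally come from get_all_tickers_from_companies_jsonl.
async fn fetch_for_ticker(pool: &Arc<ChromeDriverPool>) -> anyhow::Result<()> {
    let events: Vec<CompanyEvent> = fetch_earnings_with_pool(pool, "AAPL").await?;
    logger::log_info(&format!("Fetched {} earnings events for AAPL", events.len())).await;
    Ok(())
}
```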
/// Extracts earnings events from the currently loaded Yahoo Finance earnings calendar page.
///
/// This function assumes the client is already navigated to the correct URL (e.g.,
/// https://finance.yahoo.com/calendar/earnings?symbol={ticker}) and cookies are handled.
///
/// It waits for the earnings table, extracts rows, parses cells into CompanyEvent structs,
/// and handles date parsing, float parsing, and optional fields.
///
/// # Arguments
/// * `client` - The fantoccini Client with the page loaded.
/// * `ticker` - The stock ticker symbol for the events.
///
/// # Returns
/// A vector of CompanyEvent on success.
///
/// # Errors
/// Returns an error if:
/// - Table or elements not found.
/// - Date or float parsing fails.
/// - WebDriver operations fail.
///
/// # Examples
///
/// ```no_run
/// use fantoccini::ClientBuilder;
/// use crate::corporate::scraper::extract_earnings_events;
///
/// #[tokio::main]
/// async fn main() -> anyhow::Result<()> {
///     // Assume chromedriver is running and the earnings page is already loaded
///     let client = ClientBuilder::native().connect("http://localhost:9515").await?;
///     let events = extract_earnings_events(&client, "AAPL").await?;
///     Ok(())
/// }
/// ```
pub async fn extract_earnings_events(client: &Client, ticker: &str) -> Result<Vec<CompanyEvent>> {
// Wait for the table to load
let table = client