Added atomic writer action for Ctrl-C abort

2025-12-19 14:12:56 +01:00
parent cd91de253b
commit b366f366e6
26 changed files with 3317 additions and 666 deletions


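This commit threads an `Arc<AtomicBool>` shutdown flag through the Yahoo scraping tasks so in-flight work can bail out on Ctrl-C. As a rough sketch only (not part of the diff below), the flag could be flipped from a Ctrl-C handler along these lines; the function name is illustrative and tokio's signal feature is assumed to be enabled:

```rust
use std::sync::Arc;
use std::sync::atomic::{AtomicBool, Ordering};

// Hypothetical wiring (not in this commit): flip the shared shutdown flag
// once Ctrl-C arrives so running scrape tasks can check it and abort.
fn spawn_ctrl_c_watcher(shutdown_flag: Arc<AtomicBool>) {
    tokio::spawn(async move {
        // tokio::signal::ctrl_c() resolves the first time SIGINT / Ctrl-C is received.
        if tokio::signal::ctrl_c().await.is_ok() {
            shutdown_flag.store(true, Ordering::SeqCst);
        }
    });
}
```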
@@ -1,18 +1,15 @@
// src/corporate/yahoo.rs
use super::{types::*, helpers::*};
// src/corporate/yahoo.rs - UPDATED WITH DATA INTEGRITY FIXES
use super::{types::*, helpers::*, page_validation::*};
use crate::{scraper::webdriver::*, util::{directories::DataPaths}};
use event_backtest_engine::logger;
use crate::logger;
use fantoccini::{Client, Locator};
use rand::Rng;
use serde::{Deserialize, Serialize};
use tokio::time::{Duration as TokioDuration, sleep, timeout};
use std::{sync::Arc};
use std::{sync::Arc, sync::atomic::{AtomicBool, Ordering}};
use anyhow::{anyhow, Result};
const YAHOO_COMPANY_EXTRACTION_JS: &str = include_str!("yahoo_company_extraction.js");
/// Mapping existing
/// Gets historical stock price data: daily (xxxx - 2025) and hourly (last 30 days)
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum YahooTickerResult {
@@ -66,29 +63,137 @@ impl YahooTickerResult {
}
}
/// UPDATED: Scrape company details with full validation and shutdown support
pub async fn scrape_company_details_by_isin(
pool: &Arc<ChromeDriverPool>,
isin: &str,
shutdown_flag: &Arc<AtomicBool>,
) -> anyhow::Result<Option<YahooCompanyDetails>> {
let isin = isin.to_string();
pool.execute(format!("https://finance.yahoo.com/lookup/?s={}", isin), move |client| {
let isin = isin.clone();
// Check shutdown before starting
if shutdown_flag.load(Ordering::SeqCst) {
logger::log_warn(&format!("Shutdown detected, skipping ISIN: {}", isin)).await;
return Ok(None);
}
let isin_owned = isin.to_string();
let shutdown_clone = Arc::clone(shutdown_flag);
let url = format!("https://finance.yahoo.com/lookup/?s={}", isin);
pool.execute(url.clone(), move |client| {
let isin = isin_owned.clone();
let shutdown = shutdown_clone.clone();
Box::pin(async move {
// Random Delay between 800-1500ms
// Check shutdown during task execution
if shutdown.load(Ordering::SeqCst) {
return Err(anyhow!("Task aborted due to shutdown"));
}
// Random delay
let delay = rand::rng().random_range(800..1500);
sleep(TokioDuration::from_millis(delay)).await;
// Reject cookies
reject_yahoo_cookies(&client).await?;
// Random Delay
// Check shutdown again
if shutdown.load(Ordering::SeqCst) {
return Err(anyhow!("Task aborted due to shutdown"));
}
// CRITICAL: Validate navigation succeeded
let expected_fragment = format!("lookup/?s={}", isin);
match verify_navigation(&client, &expected_fragment, 5).await {
Ok(_) => {
logger::log_info(&format!("✓ Navigation validated for ISIN: {}", isin)).await;
}
Err(e) => {
logger::log_error(&format!(
"Navigation verification failed for ISIN {}: {}",
isin, e
)).await;
// Clear browser state before returning error
clear_browser_state(&client).await.ok();
return Err(e);
}
}
// Additional content validation
let page_ready: bool = client
.execute(
r#"
const table = document.querySelector('#main-content-wrapper > section > section.container.yf-1omxedn > div.tableContainer.yf-1omxedn > div > table');
const noData = document.querySelector('#main-content-wrapper > section > div.noData.yf-1omxedn');
return !!(table || noData);
"#,
vec![],
)
.await?
.as_bool()
.unwrap_or(false);
if !page_ready {
logger::log_error(&format!(
"Page content not ready for ISIN {} - neither table nor no-data element found",
isin
)).await;
clear_browser_state(&client).await.ok();
return Err(anyhow!("Page content not ready"));
}
logger::log_info(&format!("✓ Page content validated for ISIN: {}", isin)).await;
// Check shutdown before extraction
if shutdown.load(Ordering::SeqCst) {
return Err(anyhow!("Task aborted due to shutdown"));
}
// Random delay before extraction
let delay = rand::rng().random_range(800..1500);
sleep(TokioDuration::from_millis(delay)).await;
extract_company_details(&client, &isin).await
// Now safe to extract
extract_company_details_validated(&client, &isin).await
})
}).await
}
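A minimal caller sketch for the updated entry point, assuming a `ChromeDriverPool` has already been constructed elsewhere (the ISIN value and helper name are illustrative):

```rust
// Hypothetical usage; pool construction and Ctrl-C wiring are assumed to exist.
async fn lookup_one_isin(pool: &Arc<ChromeDriverPool>) -> anyhow::Result<()> {
    let shutdown_flag = Arc::new(AtomicBool::new(false));
    match scrape_company_details_by_isin(pool, "US0378331005", &shutdown_flag).await? {
        Some(details) => {
            logger::log_info(&format!("Found ticker {}", details.ticker)).await;
        }
        None => {
            logger::log_info("No ticker found for this ISIN").await;
        }
    }
    Ok(())
}
```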
/// UPDATED: Extract with additional URL validation
async fn extract_company_details_validated(
client: &Client,
isin: &str,
) -> Result<Option<YahooCompanyDetails>> {
// Double-check URL is still correct before extraction
let current_url = client.current_url().await?;
if !current_url.as_str().contains(isin) {
logger::log_error(&format!(
"URL mismatch before extraction: expected ISIN '{}' in URL, got '{}'",
isin,
current_url.as_str()
)).await;
clear_browser_state(client).await.ok();
return Err(anyhow!("URL mismatch - possible stale page"));
}
// Run extraction
let result = extract_company_details(client, isin).await?;
// Validate extraction result
if let Some(ref details) = result {
logger::log_info(&format!(
"✓ Extracted ticker '{}' for ISIN {} (sector: {:?}, exchange: {:?})",
details.ticker, isin, details.sector, details.exchange
)).await;
} else {
logger::log_info(&format!(
"No ticker found for ISIN {} (legitimately not found)",
isin
)).await;
}
Ok(result)
}
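`verify_navigation` and `clear_browser_state` come from the new `page_validation` module, which is not shown in this diff. Inferred purely from the call sites above, the URL check presumably behaves roughly like the sketch below; the signature, retry spacing, and name suffix are assumptions:

```rust
// Sketch only, inferred from verify_navigation(&client, &expected_fragment, 5);
// the real helper lives in super::page_validation and may differ.
async fn verify_navigation_sketch(
    client: &Client,
    expected_fragment: &str,
    max_attempts: u32,
) -> anyhow::Result<()> {
    for attempt in 1..=max_attempts {
        let url = client.current_url().await?;
        if url.as_str().contains(expected_fragment) {
            return Ok(());
        }
        // The page may still be redirecting; back off briefly before re-checking.
        sleep(TokioDuration::from_millis(500 * attempt as u64)).await;
    }
    Err(anyhow!("URL never contained expected fragment '{}'", expected_fragment))
}
```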
pub async fn extract_company_details(
client: &Client,
_isin: &str,
@@ -153,17 +258,13 @@ pub async fn extract_company_details(
// Parse the JSON result
let extraction: ExtractionResult = serde_json::from_value(result.clone())
.map_err(|e| {
// Log the problematic result value for debugging
let result_str = serde_json::to_string_pretty(&result).unwrap_or_else(|_| format!("{:?}", result));
anyhow!("Failed to parse extraction result: {}. Raw result: {}", e, result_str)
})?;
match extraction.status.as_str() {
"found" => {
// Ticker is guaranteed to be present when status is "found"
// Sector and exchange are optional
if let Some(ticker) = extraction.ticker {
// Log metadata if available
if let Some(ref metadata) = extraction.metadata {
logger::log_info(&format!(
"Selected row {} with {} valid fields out of {} total rows",
@@ -179,13 +280,11 @@ pub async fn extract_company_details(
exchange: extraction.exchange,
}))
} else {
// This shouldn't happen if JS script is working correctly
Err(anyhow!("Status 'found' but no ticker present"))
}
},
"no_results" => Ok(None),
"error" => {
// Error status means ticker was not found or extraction failed
let error_msg = extraction.error_message.unwrap_or_else(|| "Unknown error".to_string());
Err(anyhow!("JavaScript extraction error: {}", error_msg))
},
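The `ExtractionResult` struct deserialized above is defined elsewhere in the module; judging only from the fields this function reads, it plausibly has a shape like the following (field types, optionality, and the metadata payload are assumptions):

```rust
// Rough shape inferred from usage in extract_company_details; not the actual definition.
#[derive(Debug, Deserialize)]
struct ExtractionResultSketch {
    status: String,                      // "found" | "no_results" | "error"
    ticker: Option<String>,              // expected to be Some when status == "found"
    sector: Option<String>,
    exchange: Option<String>,
    error_message: Option<String>,       // populated when status == "error"
    metadata: Option<serde_json::Value>, // row-selection details used only for logging
}
```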
@@ -207,19 +306,6 @@ pub async fn get_all_tickers_from_companies_jsonl(paths: &DataPaths) -> anyhow::
Ok(tickers)
}
/// Fetches earnings events for a ticker using a dedicated ScrapeTask.
///
/// This function creates and executes a ScrapeTask to navigate to the Yahoo Finance earnings calendar,
/// reject cookies, and extract the events.
///
/// # Arguments
/// * `ticker` - The stock ticker symbol.
///
/// # Returns
/// A vector of CompanyEvent structs on success.
///
/// # Errors
/// Returns an error if the task execution fails, e.g., chromedriver spawn or navigation issues.
pub async fn fetch_earnings_with_pool(
pool: &Arc<ChromeDriverPool>,
ticker: &str,
@@ -238,40 +324,6 @@ pub async fn fetch_earnings_with_pool(
}).await
}
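A minimal caller sketch for the pool-based earnings fetch, assuming the same pool as above; the return type is taken from the doc comment this commit removes:

```rust
// Hypothetical usage; the ticker would normally come from get_all_tickers_from_companies_jsonl.
async fn fetch_for_ticker(pool: &Arc<ChromeDriverPool>) -> anyhow::Result<()> {
    let events: Vec<CompanyEvent> = fetch_earnings_with_pool(pool, "AAPL").await?;
    logger::log_info(&format!("Fetched {} earnings events for AAPL", events.len())).await;
    Ok(())
}
```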
/// Extracts earnings events from the currently loaded Yahoo Finance earnings calendar page.
///
/// This function assumes the client is already navigated to the correct URL (e.g.,
/// https://finance.yahoo.com/calendar/earnings?symbol={ticker}) and cookies are handled.
///
/// It waits for the earnings table, extracts rows, parses cells into CompanyEvent structs,
/// and handles date parsing, float parsing, and optional fields.
///
/// # Arguments
/// * `client` - The fantoccini Client with the page loaded.
/// * `ticker` - The stock ticker symbol for the events.
///
/// # Returns
/// A vector of CompanyEvent on success.
///
/// # Errors
/// Returns an error if:
/// - Table or elements not found.
/// - Date or float parsing fails.
/// - WebDriver operations fail.
///
/// # Examples
///
/// ```no_run
/// use fantoccini::ClientBuilder;
/// use crate::corporate::scraper::extract_earnings_events;
///
/// #[tokio::main]
/// async fn main() -> anyhow::Result<()> {
///     // Assume chromedriver is running and the earnings page is already loaded
///     let client = ClientBuilder::native().connect("http://localhost:9515").await?;
///     let events = extract_earnings_events(&client, "AAPL").await?;
///     Ok(())
/// }
/// ```
pub async fn extract_earnings_events(client: &Client, ticker: &str) -> Result<Vec<CompanyEvent>> {
// Wait for the table to load
let table = client