Added atomic writer action for Ctrl+C abort
This commit is contained in:
@@ -1,18 +1,15 @@
|
||||
// src/corporate/yahoo.rs
|
||||
use super::{types::*, helpers::*};
|
||||
// src/corporate/yahoo.rs - UPDATED WITH DATA INTEGRITY FIXES
|
||||
use super::{types::*, helpers::*, page_validation::*};
|
||||
use crate::{scraper::webdriver::*, util::{directories::DataPaths}};
|
||||
use event_backtest_engine::logger;
|
||||
use crate::logger;
|
||||
use fantoccini::{Client, Locator};
|
||||
use rand::Rng;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tokio::time::{Duration as TokioDuration, sleep, timeout};
|
||||
use std::{sync::Arc};
|
||||
use std::{sync::Arc, sync::atomic::{AtomicBool, Ordering}};
|
||||
use anyhow::{anyhow, Result};
|
||||
|
||||
const YAHOO_COMPANY_EXTRACTION_JS: &str = include_str!("yahoo_company_extraction.js");
|
||||
/// Mapping existing
|
||||
|
||||
/// getting historical stock price data daily (xxxx - 2025) and hourly (last 30 days)
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub enum YahooTickerResult {
|
||||
@@ -66,29 +63,137 @@ impl YahooTickerResult {
|
||||
}
|
||||
}
|
||||
|
||||
/// UPDATED: Scrape company details with full validation and shutdown support
|
||||
pub async fn scrape_company_details_by_isin(
|
||||
pool: &Arc<ChromeDriverPool>,
|
||||
isin: &str,
|
||||
shutdown_flag: &Arc<AtomicBool>,
|
||||
) -> anyhow::Result<Option<YahooCompanyDetails>> {
|
||||
let isin = isin.to_string();
|
||||
pool.execute(format!("https://finance.yahoo.com/lookup/?s={}", isin), move |client| {
|
||||
let isin = isin.clone();
|
||||
// Check shutdown before starting
|
||||
if shutdown_flag.load(Ordering::SeqCst) {
|
||||
logger::log_warn(&format!("Shutdown detected, skipping ISIN: {}", isin)).await;
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
let isin_owned = isin.to_string();
|
||||
let shutdown_clone = Arc::clone(shutdown_flag);
|
||||
let url = format!("https://finance.yahoo.com/lookup/?s={}", isin);
|
||||
|
||||
pool.execute(url.clone(), move |client| {
|
||||
let isin = isin_owned.clone();
|
||||
let shutdown = shutdown_clone.clone();
|
||||
|
||||
Box::pin(async move {
|
||||
// Random Delay between 800-1500ms
|
||||
// Check shutdown during task execution
|
||||
if shutdown.load(Ordering::SeqCst) {
|
||||
return Err(anyhow!("Task aborted due to shutdown"));
|
||||
}
|
||||
|
||||
// Random delay
|
||||
let delay = rand::rng().random_range(800..1500);
|
||||
sleep(TokioDuration::from_millis(delay)).await;
|
||||
|
||||
// Reject cookies
|
||||
reject_yahoo_cookies(&client).await?;
|
||||
|
||||
// Random Delay
|
||||
// Check shutdown again
|
||||
if shutdown.load(Ordering::SeqCst) {
|
||||
return Err(anyhow!("Task aborted due to shutdown"));
|
||||
}
|
||||
|
||||
// CRITICAL: Validate navigation succeeded
|
||||
let expected_fragment = format!("lookup/?s={}", isin);
|
||||
match verify_navigation(&client, &expected_fragment, 5).await {
|
||||
Ok(_) => {
|
||||
logger::log_info(&format!("✓ Navigation validated for ISIN: {}", isin)).await;
|
||||
}
|
||||
Err(e) => {
|
||||
logger::log_error(&format!(
|
||||
"Navigation verification failed for ISIN {}: {}",
|
||||
isin, e
|
||||
)).await;
|
||||
// Clear browser state before returning error
|
||||
clear_browser_state(&client).await.ok();
|
||||
return Err(e);
|
||||
}
|
||||
}
|
||||
|
||||
// Additional content validation
|
||||
let page_ready: bool = client
|
||||
.execute(
|
||||
r#"
|
||||
const table = document.querySelector('#main-content-wrapper > section > section.container.yf-1omxedn > div.tableContainer.yf-1omxedn > div > table');
|
||||
const noData = document.querySelector('#main-content-wrapper > section > div.noData.yf-1omxedn');
|
||||
return !!(table || noData);
|
||||
"#,
|
||||
vec![],
|
||||
)
|
||||
.await?
|
||||
.as_bool()
|
||||
.unwrap_or(false);
|
||||
|
||||
if !page_ready {
|
||||
logger::log_error(&format!(
|
||||
"Page content not ready for ISIN {} - neither table nor no-data element found",
|
||||
isin
|
||||
)).await;
|
||||
clear_browser_state(&client).await.ok();
|
||||
return Err(anyhow!("Page content not ready"));
|
||||
}
|
||||
|
||||
logger::log_info(&format!("✓ Page content validated for ISIN: {}", isin)).await;
|
||||
|
||||
// Check shutdown before extraction
|
||||
if shutdown.load(Ordering::SeqCst) {
|
||||
return Err(anyhow!("Task aborted due to shutdown"));
|
||||
}
|
||||
|
||||
// Random delay before extraction
|
||||
let delay = rand::rng().random_range(800..1500);
|
||||
sleep(TokioDuration::from_millis(delay)).await;
|
||||
|
||||
extract_company_details(&client, &isin).await
|
||||
// Now safe to extract
|
||||
extract_company_details_validated(&client, &isin).await
|
||||
})
|
||||
}).await
|
||||
}
|
||||
|
||||
/// UPDATED: Extract with additional URL validation
|
||||
async fn extract_company_details_validated(
|
||||
client: &Client,
|
||||
isin: &str,
|
||||
) -> Result<Option<YahooCompanyDetails>> {
|
||||
// Double-check URL is still correct before extraction
|
||||
let current_url = client.current_url().await?;
|
||||
if !current_url.as_str().contains(isin) {
|
||||
logger::log_error(&format!(
|
||||
"URL mismatch before extraction: expected ISIN '{}' in URL, got '{}'",
|
||||
isin,
|
||||
current_url.as_str()
|
||||
)).await;
|
||||
clear_browser_state(client).await.ok();
|
||||
return Err(anyhow!("URL mismatch - possible stale page"));
|
||||
}
|
||||
|
||||
// Run extraction
|
||||
let result = extract_company_details(client, isin).await?;
|
||||
|
||||
// Validate extraction result
|
||||
if let Some(ref details) = result {
|
||||
logger::log_info(&format!(
|
||||
"✓ Extracted ticker '{}' for ISIN {} (sector: {:?}, exchange: {:?})",
|
||||
details.ticker, isin, details.sector, details.exchange
|
||||
)).await;
|
||||
} else {
|
||||
logger::log_info(&format!(
|
||||
"No ticker found for ISIN {} (legitimately not found)",
|
||||
isin
|
||||
)).await;
|
||||
}
|
||||
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
pub async fn extract_company_details(
|
||||
client: &Client,
|
||||
_isin: &str,
|
||||
@@ -153,17 +258,13 @@ pub async fn extract_company_details(
|
||||
// Parse the JSON result
|
||||
let extraction: ExtractionResult = serde_json::from_value(result.clone())
|
||||
.map_err(|e| {
|
||||
// Log the problematic result value for debugging
|
||||
let result_str = serde_json::to_string_pretty(&result).unwrap_or_else(|_| format!("{:?}", result));
|
||||
anyhow!("Failed to parse extraction result: {}. Raw result: {}", e, result_str)
|
||||
})?;
|
||||
|
||||
match extraction.status.as_str() {
|
||||
"found" => {
|
||||
// Ticker is guaranteed to be present when status is "found"
|
||||
// Sector and exchange are optional
|
||||
if let Some(ticker) = extraction.ticker {
|
||||
// Log metadata if available
|
||||
if let Some(ref metadata) = extraction.metadata {
|
||||
logger::log_info(&format!(
|
||||
"Selected row {} with {} valid fields out of {} total rows",
|
||||
@@ -179,13 +280,11 @@ pub async fn extract_company_details(
|
||||
exchange: extraction.exchange,
|
||||
}))
|
||||
} else {
|
||||
// This shouldn't happen if JS script is working correctly
|
||||
Err(anyhow!("Status 'found' but no ticker present"))
|
||||
}
|
||||
},
|
||||
"no_results" => Ok(None),
|
||||
"error" => {
|
||||
// Error status means ticker was not found or extraction failed
|
||||
let error_msg = extraction.error_message.unwrap_or_else(|| "Unknown error".to_string());
|
||||
Err(anyhow!("JavaScript extraction error: {}", error_msg))
|
||||
},
|
||||
@@ -207,19 +306,6 @@ pub async fn get_all_tickers_from_companies_jsonl(paths: &DataPaths) -> anyhow::
|
||||
Ok(tickers)
|
||||
}
|
||||
|
||||
/// Fetches earnings events for a ticker using a dedicated ScrapeTask.
|
||||
///
|
||||
/// This function creates and executes a ScrapeTask to navigate to the Yahoo Finance earnings calendar,
|
||||
/// reject cookies, and extract the events.
|
||||
///
|
||||
/// # Arguments
|
||||
/// * `ticker` - The stock ticker symbol.
|
||||
///
|
||||
/// # Returns
|
||||
/// A vector of CompanyEvent structs on success.
|
||||
///
|
||||
/// # Errors
|
||||
/// Returns an error if the task execution fails, e.g., chromedriver spawn or navigation issues.
|
||||
pub async fn fetch_earnings_with_pool(
|
||||
pool: &Arc<ChromeDriverPool>,
|
||||
ticker: &str,
|
||||
@@ -238,40 +324,6 @@ pub async fn fetch_earnings_with_pool(
|
||||
}).await
|
||||
}
|
||||
|
||||
/// Extracts earnings events from the currently loaded Yahoo Finance earnings calendar page.
|
||||
///
|
||||
/// This function assumes the client is already navigated to the correct URL (e.g.,
|
||||
/// https://finance.yahoo.com/calendar/earnings?symbol={ticker}) and cookies are handled.
|
||||
///
|
||||
/// It waits for the earnings table, extracts rows, parses cells into CompanyEvent structs,
|
||||
/// and handles date parsing, float parsing, and optional fields.
|
||||
///
|
||||
/// # Arguments
|
||||
/// * `client` - The fantoccini Client with the page loaded.
|
||||
/// * `ticker` - The stock ticker symbol for the events.
|
||||
///
|
||||
/// # Returns
|
||||
/// A vector of CompanyEvent on success.
|
||||
///
|
||||
/// # Errors
|
||||
/// Returns an error if:
|
||||
/// - Table or elements not found.
|
||||
/// - Date or float parsing fails.
|
||||
/// - WebDriver operations fail.
|
||||
///
|
||||
/// # Examples
|
||||
///
|
||||
/// ```no_run
|
||||
/// use fantoccini::Client;
|
||||
/// use crate::corporate::scraper::extract_earnings;
|
||||
///
|
||||
/// #[tokio::main]
|
||||
/// async fn main() -> Result<()> {
|
||||
/// // Assume client is set up and navigated
|
||||
/// let events = extract_earnings(&client, "AAPL").await?;
|
||||
/// Ok(())
|
||||
/// }
|
||||
/// ```
|
||||
pub async fn extract_earnings_events(client: &Client, ticker: &str) -> Result<Vec<CompanyEvent>> {
|
||||
// Wait for the table to load
|
||||
let table = client
|
||||
|
||||
Reference in New Issue
Block a user