// src/corporate/yahoo.rs use super::{types::*, helpers::*, page_validation::*}; use crate::{scraper::webdriver::*, util::{directories::DataPaths}}; use crate::logger; use fantoccini::{Client, Locator}; use rand::Rng; use serde::{Deserialize, Serialize}; use tokio::time::{Duration as TokioDuration, sleep, timeout}; use std::{sync::Arc, sync::atomic::{AtomicBool, Ordering}}; use anyhow::{anyhow, Result}; const YAHOO_COMPANY_EXTRACTION_JS: &str = include_str!("yahoo_company_extraction.js"); #[derive(Debug, Clone, Serialize, Deserialize)] pub enum YahooTickerResult { Found(String), NotFound, NoResults, AmbiguousResults, } #[derive(Debug, Deserialize)] pub struct ExtractionMetadata { #[serde(rename = "selectedRowIndex")] pub selected_row_index: usize, #[serde(rename = "validFieldCount")] pub valid_field_count: usize, #[serde(rename = "totalRows")] pub total_rows: usize, } #[derive(Debug, Deserialize)] pub struct ExtractionResult { status: String, ticker: Option, sector: Option, exchange: Option, #[serde(default)] error_message: Option, #[serde(default)] metadata: Option, } impl YahooTickerResult { pub fn to_tagged_string(&self) -> String { match self { YahooTickerResult::Found(ticker) => format!("YAHOO:{}", ticker), YahooTickerResult::NotFound => "YAHOO:NOT_FOUND".to_string(), YahooTickerResult::NoResults => "YAHOO:NO_RESULTS".to_string(), YahooTickerResult::AmbiguousResults => "YAHOO:AMBIGUOUS".to_string(), } } pub fn is_found(&self) -> bool { matches!(self, YahooTickerResult::Found(_)) } pub fn get_ticker(&self) -> Option<&str> { match self { YahooTickerResult::Found(ticker) => Some(ticker), _ => None, } } } /// UPDATED: Scrape company details with full validation and shutdown support pub async fn scrape_company_details_by_isin( pool: &Arc, isin: &str, shutdown_flag: &Arc, ) -> anyhow::Result> { // Check shutdown before starting if shutdown_flag.load(Ordering::SeqCst) { logger::log_warn(&format!("Shutdown detected, skipping ISIN: {}", isin)).await; return Ok(None); } if pool.should_perform_hard_reset() { logger::log_warn("HARD_RESET_REQUIRED detected before starting ISIN scrape").await; return Err(anyhow!("HARD_RESET_REQUIRED")); } let isin_owned = isin.to_string(); let shutdown_clone = Arc::clone(shutdown_flag); let url = format!("https://finance.yahoo.com/lookup/?s={}", isin); pool.execute(url.clone(), move |client| { let isin = isin_owned.clone(); let shutdown = shutdown_clone.clone(); Box::pin(async move { // Check shutdown during task execution if shutdown.load(Ordering::SeqCst) { return Err(anyhow!("Task aborted due to shutdown")); } // Random delay let delay = rand::rng().random_range(800..1500); sleep(TokioDuration::from_millis(delay)).await; // Reject cookies reject_yahoo_cookies(&client).await?; // Check shutdown again if shutdown.load(Ordering::SeqCst) { return Err(anyhow!("Task aborted due to shutdown")); } // CRITICAL: Validate navigation succeeded let expected_fragment = format!("lookup/?s={}", isin); match verify_navigation(&client, &expected_fragment, 5).await { Ok(_) => { logger::log_info(&format!("✓ Navigation validated for ISIN: {}", isin)).await; } Err(e) => { logger::log_error(&format!( "Navigation verification failed for ISIN {}: {}", isin, e )).await; // Clear browser state before returning error clear_browser_state(&client).await.ok(); return Err(e); } } // Additional content validation let page_ready: bool = client .execute( r#" const table = document.querySelector('#main-content-wrapper > section > section.container.yf-1omxedn > div.tableContainer.yf-1omxedn > div > table'); const noData = document.querySelector('#main-content-wrapper > section > div.noData.yf-1omxedn'); return !!(table || noData); "#, vec![], ) .await? .as_bool() .unwrap_or(false); if !page_ready { logger::log_error(&format!( "Page content not ready for ISIN {} - neither table nor no-data element found", isin )).await; clear_browser_state(&client).await.ok(); return Err(anyhow!("Page content not ready")); } logger::log_info(&format!("✓ Page content validated for ISIN: {}", isin)).await; // Check shutdown before extraction if shutdown.load(Ordering::SeqCst) { return Err(anyhow!("Task aborted due to shutdown")); } // Random delay before extraction let delay = rand::rng().random_range(800..1500); sleep(TokioDuration::from_millis(delay)).await; // Now safe to extract extract_company_details_validated(&client, &isin).await }) }).await } /// UPDATED: Extract with additional URL validation async fn extract_company_details_validated( client: &Client, isin: &str, ) -> Result> { // Double-check URL is still correct before extraction let current_url = client.current_url().await?; if !current_url.as_str().contains(isin) { logger::log_error(&format!( "URL mismatch before extraction: expected ISIN '{}' in URL, got '{}'", isin, current_url.as_str() )).await; clear_browser_state(client).await.ok(); return Err(anyhow!("URL mismatch - possible stale page")); } // Run extraction let result = extract_company_details(client, isin).await?; // Validate extraction result if let Some(ref details) = result { logger::log_info(&format!( "✓ Extracted ticker '{}' for ISIN {} (sector: {:?}, exchange: {:?})", details.ticker, isin, details.sector, details.exchange )).await; } else { logger::log_info(&format!( "No ticker found for ISIN {} (legitimately not found)", isin )).await; } Ok(result) } pub async fn extract_company_details( client: &Client, _isin: &str, ) -> Result> { // Wait for page to load - look for either the table or the no-data element let wait_result: Result> = timeout( TokioDuration::from_secs(30), async { for _ in 0..60 { let has_content: bool = client .execute( r#" const table = document.querySelector('#main-content-wrapper > section > section.container.yf-1omxedn > div.tableContainer.yf-1omxedn > div > table'); const noData = document.querySelector('#main-content-wrapper > section > div.noData.yf-1omxedn'); return !!(table || noData); "#, vec![], ) .await .map_err(|e| anyhow!("Execute error: {}", e))? .as_bool() .unwrap_or(false); if has_content { return Ok(true); } sleep(TokioDuration::from_millis(500)).await; } Ok(false) }, ) .await .map_err(|_| anyhow!("Timeout waiting for Yahoo Finance page to load")); match wait_result { Err(_) => { return Err(anyhow!("Timeout waiting for Yahoo Finance page to load")); }, Ok(Err(e)) => { return Err(anyhow!("Error checking page content: {}", e)); }, Ok(Ok(false)) => { logger::log_warn("Page content not found after waiting, attempting extraction anyway").await; }, Ok(Ok(true)) => { logger::log_info("Page content detected, proceeding with extraction").await; } } // Execute the JavaScript extraction script let result = client.execute(YAHOO_COMPANY_EXTRACTION_JS, vec![]).await?; // Log the raw result for debugging logger::log_info(&format!("JavaScript extraction raw result: {:?}", result)).await; // Check if result is null if result.is_null() { return Err(anyhow!("JavaScript returned null - page may not be fully loaded or script failed")); } // Parse the JSON result let extraction: ExtractionResult = serde_json::from_value(result.clone()) .map_err(|e| { let result_str = serde_json::to_string_pretty(&result).unwrap_or_else(|_| format!("{:?}", result)); anyhow!("Failed to parse extraction result: {}. Raw result: {}", e, result_str) })?; match extraction.status.as_str() { "found" => { if let Some(ticker) = extraction.ticker { if let Some(ref metadata) = extraction.metadata { logger::log_info(&format!( "Selected row {} with {} valid fields out of {} total rows", metadata.selected_row_index, metadata.valid_field_count, metadata.total_rows )).await; } Ok(Some(YahooCompanyDetails { ticker, sector: extraction.sector, exchange: extraction.exchange, })) } else { Err(anyhow!("Status 'found' but no ticker present")) } }, "no_results" => Ok(None), "error" => { let error_msg = extraction.error_message.unwrap_or_else(|| "Unknown error".to_string()); Err(anyhow!("JavaScript extraction error: {}", error_msg)) }, _ => Ok(None), } } pub async fn get_all_tickers_from_companies_jsonl(paths: &DataPaths) -> anyhow::Result> { let corporate_path = paths.data_dir().join("corporate").join("by_name"); let companies_file = corporate_path.join("companies.jsonl"); let content = tokio::fs::read_to_string(companies_file).await?; let mut tickers = Vec::new(); for line in content.lines() { let company: CompanyCrossPlatformInfo = serde_json::from_str(line)?; for (_isin, ticker_vec) in company.isin_tickers_map { tickers.extend(ticker_vec); } } Ok(tickers) } pub async fn fetch_earnings_with_pool( pool: &Arc, ticker: &str, ) -> anyhow::Result> { let ticker = ticker.to_string(); let url = format!("https://finance.yahoo.com/calendar/earnings?symbol={}&offset=0&size=100", ticker); let ticker_cloned = ticker.clone(); pool.execute(url, move |client| { let ticker = ticker_cloned.clone(); Box::pin(async move { reject_yahoo_cookies(&client).await?; extract_earnings_events(&client, &ticker).await }) }).await } pub async fn extract_earnings_events(client: &Client, ticker: &str) -> Result> { // Wait for the table to load let table = client .wait() .for_element(Locator::Css(r#"table[data-test="cal-table"]"#)) .await .map_err(|e| anyhow!("Failed to find earnings table: {}", e))?; // Find all rows in tbody let rows = table .find_all(Locator::Css("tbody tr")) .await .map_err(|e| anyhow!("Failed to find table rows: {}", e))?; let mut events = Vec::with_capacity(rows.len()); for row in rows { let cells = row .find_all(Locator::Css("td")) .await .map_err(|e| anyhow!("Failed to find cells in row: {}", e))?; if cells.len() < 5 { continue; // Skip incomplete rows } // Extract and parse date let date_str = cells[0] .text() .await .map_err(|e| anyhow!("Failed to get date text: {}", e))?; let date = parse_yahoo_date(&date_str) .map_err(|e| anyhow!("Failed to parse date '{}': {}", date_str, e))? .format("%Y-%m-%d") .to_string(); // Extract time, replace "Time Not Supplied" with empty let time = cells[1] .text() .await .map_err(|e| anyhow!("Failed to get time text: {}", e))? .replace("Time Not Supplied", ""); // Extract period let period = cells[2] .text() .await .map_err(|e| anyhow!("Failed to get period text: {}", e))?; // Parse EPS forecast let eps_forecast_str = cells[3] .text() .await .map_err(|e| anyhow!("Failed to get EPS forecast text: {}", e))?; let eps_forecast = parse_float(&eps_forecast_str); // Parse EPS actual let eps_actual_str = cells[4] .text() .await .map_err(|e| anyhow!("Failed to get EPS actual text: {}", e))?; let eps_actual = parse_float(&eps_actual_str); // Parse surprise % if available let surprise_pct = if cells.len() > 5 { let surprise_str = cells[5] .text() .await .map_err(|e| anyhow!("Failed to get surprise text: {}", e))?; parse_float(&surprise_str) } else { None }; events.push(CompanyEvent { ticker: ticker.to_string(), date, time, period, eps_forecast, eps_actual, revenue_forecast: None, revenue_actual: None, surprise_pct, source: "Yahoo".to_string(), }); } if events.is_empty() { logger::log_warn(&format!("Warning: No earnings events extracted for ticker {}", ticker)).await; } else { logger::log_info(&format!("Extracted {} earnings events for {}", events.len(), ticker)).await; } Ok(events) } /// Rejecting Yahoo Cookies async fn reject_yahoo_cookies(client: &Client) -> anyhow::Result<()> { for _ in 0..10 { let clicked: bool = client .execute( r#"(() => { const btn = document.querySelector('#consent-page .reject-all'); if (btn) { btn.click(); return true; } return false; })()"#, vec![], ) .await? .as_bool() .unwrap_or(false); if clicked { break; } sleep(TokioDuration::from_millis(500)).await; } logger::log_info("Rejected Yahoo cookies if button existed").await; Ok(()) }