// NOTE(review): the lines below were pasted file-browser chrome, not Rust, and
// would not compile; preserved as a comment for traceability:
//   Files: WebScraper/src/corporate/yahoo.rs — 454 lines, 15 KiB, Rust
// src/corporate/yahoo.rs
use super::{types::*, helpers::*, page_validation::*};
use crate::{scraper::webdriver::*, util::{directories::DataPaths}};
use crate::logger;
use fantoccini::{Client, Locator};
use rand::Rng;
use serde::{Deserialize, Serialize};
use tokio::time::{Duration as TokioDuration, sleep, timeout};
use std::{sync::Arc, sync::atomic::{AtomicBool, Ordering}};
use anyhow::{anyhow, Result};
const YAHOO_COMPANY_EXTRACTION_JS: &str = include_str!("yahoo_company_extraction.js");
/// Outcome of a Yahoo Finance ticker lookup.
///
/// NOTE(review): only `Found` is observable in this file; the other variants
/// are never constructed here — their semantics are inferred from the names
/// and from `to_tagged_string` (confirm against the constructing call sites).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum YahooTickerResult {
// A ticker symbol was resolved for the queried identifier.
Found(String),
// Lookup completed but the identifier was not recognized (presumed).
NotFound,
// Lookup returned an empty result set (presumed).
NoResults,
// Multiple candidates matched and none could be selected (presumed).
AmbiguousResults,
}
/// Row-selection diagnostics reported by the JS extraction script.
///
/// The script emits camelCase keys, mapped here in one place via
/// `rename_all` instead of per-field `rename` attributes.
#[derive(Debug, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct ExtractionMetadata {
    /// Index of the table row the script selected (as reported by the JS).
    pub selected_row_index: usize,
    /// Count of usable fields in the selected row (as reported by the JS).
    pub valid_field_count: usize,
    /// Total number of candidate rows the script saw.
    pub total_rows: usize,
}
/// JSON payload produced by `yahoo_company_extraction.js`.
#[derive(Debug, Deserialize)]
pub struct ExtractionResult {
// One of "found", "no_results", "error"; any other value is treated as
// "nothing found" by `extract_company_details`.
status: String,
ticker: Option<String>,
sector: Option<String>,
exchange: Option<String>,
// Only meaningful when `status == "error"`; absent keys default to None.
#[serde(default)]
error_message: Option<String>,
// Optional row-selection diagnostics (see `ExtractionMetadata`).
#[serde(default)]
metadata: Option<ExtractionMetadata>,
}
impl YahooTickerResult {
    /// Serialize this result as a `YAHOO:`-prefixed tag for logs/persistence.
    pub fn to_tagged_string(&self) -> String {
        let tag = match self {
            YahooTickerResult::Found(ticker) => return format!("YAHOO:{}", ticker),
            YahooTickerResult::NotFound => "NOT_FOUND",
            YahooTickerResult::NoResults => "NO_RESULTS",
            YahooTickerResult::AmbiguousResults => "AMBIGUOUS",
        };
        format!("YAHOO:{}", tag)
    }
    /// True exactly when a ticker was resolved.
    pub fn is_found(&self) -> bool {
        self.get_ticker().is_some()
    }
    /// Borrow the resolved ticker symbol, if any.
    pub fn get_ticker(&self) -> Option<&str> {
        if let YahooTickerResult::Found(ticker) = self {
            Some(ticker.as_str())
        } else {
            None
        }
    }
}
/// Scrape Yahoo Finance's ISIN lookup page and return the company's details
/// if a listing exists, using a browser leased from `pool`.
///
/// Flow: pre-flight shutdown/hard-reset checks -> queue task on the pool ->
/// reject cookie consent -> verify navigation landed on the lookup URL ->
/// verify the page rendered either a results table or a "no data" marker ->
/// run the JS extraction.
///
/// Returns `Ok(None)` when shutdown was already requested or the ISIN is
/// legitimately not listed; `Err` when a pool hard reset is pending, when
/// validation fails, or when the task is aborted mid-flight by shutdown.
pub async fn scrape_company_details_by_isin(
pool: &Arc<ChromeDriverPool>,
isin: &str,
shutdown_flag: &Arc<AtomicBool>,
) -> anyhow::Result<Option<YahooCompanyDetails>> {
// Cheap pre-flight check: don't queue new work once shutdown is requested.
if shutdown_flag.load(Ordering::SeqCst) {
logger::log_warn(&format!("Shutdown detected, skipping ISIN: {}", isin)).await;
return Ok(None);
}
// A pending hard reset means the pool's browsers must not be used; surface
// it as an error (the message doubles as a sentinel for the caller).
if pool.should_perform_hard_reset() {
logger::log_warn("HARD_RESET_REQUIRED detected before starting ISIN scrape").await;
return Err(anyhow!("HARD_RESET_REQUIRED"));
}
// Owned copies so the task closure can capture them by value.
let isin_owned = isin.to_string();
let shutdown_clone = Arc::clone(shutdown_flag);
let url = format!("https://finance.yahoo.com/lookup/?s={}", isin);
pool.execute(url.clone(), move |client| {
// Clone per invocation — presumably the pool may call this factory more
// than once (e.g. retries); TODO confirm against ChromeDriverPool::execute.
let isin = isin_owned.clone();
let shutdown = shutdown_clone.clone();
Box::pin(async move {
// Re-check shutdown now that the queued task actually runs.
if shutdown.load(Ordering::SeqCst) {
return Err(anyhow!("Task aborted due to shutdown"));
}
// Random 0.8-1.5s delay to avoid a fixed, bot-like request cadence.
let delay = rand::rng().random_range(800..1500);
sleep(TokioDuration::from_millis(delay)).await;
// Dismiss the cookie-consent overlay if present (best effort).
reject_yahoo_cookies(&client).await?;
// Shutdown may have been requested while we slept / clicked.
if shutdown.load(Ordering::SeqCst) {
return Err(anyhow!("Task aborted due to shutdown"));
}
// CRITICAL: verify the browser is really on the expected lookup URL
// (guards against redirects / stale tabs); bound 5 — see `verify_navigation`.
let expected_fragment = format!("lookup/?s={}", isin);
match verify_navigation(&client, &expected_fragment, 5).await {
Ok(_) => {
logger::log_info(&format!("✓ Navigation validated for ISIN: {}", isin)).await;
}
Err(e) => {
logger::log_error(&format!(
"Navigation verification failed for ISIN {}: {}",
isin, e
)).await;
// Clear browser state so the next task starts clean; ignore cleanup
// errors — the navigation failure is the error worth reporting.
clear_browser_state(&client).await.ok();
return Err(e);
}
}
// Content validation: the page must show either the results table or
// the explicit "no data" element before extraction is meaningful.
let page_ready: bool = client
.execute(
r#"
const table = document.querySelector('#main-content-wrapper > section > section.container.yf-1omxedn > div.tableContainer.yf-1omxedn > div > table');
const noData = document.querySelector('#main-content-wrapper > section > div.noData.yf-1omxedn');
return !!(table || noData);
"#,
vec![],
)
.await?
.as_bool()
.unwrap_or(false);
if !page_ready {
logger::log_error(&format!(
"Page content not ready for ISIN {} - neither table nor no-data element found",
isin
)).await;
clear_browser_state(&client).await.ok();
return Err(anyhow!("Page content not ready"));
}
logger::log_info(&format!("✓ Page content validated for ISIN: {}", isin)).await;
// Last shutdown check before the comparatively expensive extraction.
if shutdown.load(Ordering::SeqCst) {
return Err(anyhow!("Task aborted due to shutdown"));
}
// Second randomized pause before extraction, same anti-bot rationale.
let delay = rand::rng().random_range(800..1500);
sleep(TokioDuration::from_millis(delay)).await;
// Extraction re-validates the URL itself before touching the DOM.
extract_company_details_validated(&client, &isin).await
})
}).await
}
/// Run the company-details extraction after one final URL sanity check.
///
/// If the browser has drifted off the lookup page for `isin` (redirect, stale
/// tab), the browser state is cleared and an error is returned instead of
/// extracting from the wrong page. Logs the outcome either way.
async fn extract_company_details_validated(
    client: &Client,
    isin: &str,
) -> Result<Option<YahooCompanyDetails>> {
    // Guard: the ISIN must still appear in the current URL right before we read the DOM.
    let url = client.current_url().await?;
    let url_str = url.as_str();
    if !url_str.contains(isin) {
        logger::log_error(&format!(
            "URL mismatch before extraction: expected ISIN '{}' in URL, got '{}'",
            isin,
            url_str
        )).await;
        clear_browser_state(client).await.ok();
        return Err(anyhow!("URL mismatch - possible stale page"));
    }
    // Safe to extract; log whether anything was found.
    let extracted = extract_company_details(client, isin).await?;
    match &extracted {
        Some(details) => {
            logger::log_info(&format!(
                "✓ Extracted ticker '{}' for ISIN {} (sector: {:?}, exchange: {:?})",
                details.ticker, isin, details.sector, details.exchange
            )).await;
        }
        None => {
            logger::log_info(&format!(
                "No ticker found for ISIN {} (legitimately not found)",
                isin
            )).await;
        }
    }
    Ok(extracted)
}
pub async fn extract_company_details(
client: &Client,
_isin: &str,
) -> Result<Option<YahooCompanyDetails>> {
// Wait for page to load - look for either the table or the no-data element
let wait_result: Result<Result<bool, anyhow::Error>> = timeout(
TokioDuration::from_secs(30),
async {
for _ in 0..60 {
let has_content: bool = client
.execute(
r#"
const table = document.querySelector('#main-content-wrapper > section > section.container.yf-1omxedn > div.tableContainer.yf-1omxedn > div > table');
const noData = document.querySelector('#main-content-wrapper > section > div.noData.yf-1omxedn');
return !!(table || noData);
"#,
vec![],
)
.await
.map_err(|e| anyhow!("Execute error: {}", e))?
.as_bool()
.unwrap_or(false);
if has_content {
return Ok(true);
}
sleep(TokioDuration::from_millis(500)).await;
}
Ok(false)
},
)
.await
.map_err(|_| anyhow!("Timeout waiting for Yahoo Finance page to load"));
match wait_result {
Err(_) => {
return Err(anyhow!("Timeout waiting for Yahoo Finance page to load"));
},
Ok(Err(e)) => {
return Err(anyhow!("Error checking page content: {}", e));
},
Ok(Ok(false)) => {
logger::log_warn("Page content not found after waiting, attempting extraction anyway").await;
},
Ok(Ok(true)) => {
logger::log_info("Page content detected, proceeding with extraction").await;
}
}
// Execute the JavaScript extraction script
let result = client.execute(YAHOO_COMPANY_EXTRACTION_JS, vec![]).await?;
// Log the raw result for debugging
logger::log_info(&format!("JavaScript extraction raw result: {:?}", result)).await;
// Check if result is null
if result.is_null() {
return Err(anyhow!("JavaScript returned null - page may not be fully loaded or script failed"));
}
// Parse the JSON result
let extraction: ExtractionResult = serde_json::from_value(result.clone())
.map_err(|e| {
let result_str = serde_json::to_string_pretty(&result).unwrap_or_else(|_| format!("{:?}", result));
anyhow!("Failed to parse extraction result: {}. Raw result: {}", e, result_str)
})?;
match extraction.status.as_str() {
"found" => {
if let Some(ticker) = extraction.ticker {
if let Some(ref metadata) = extraction.metadata {
logger::log_info(&format!(
"Selected row {} with {} valid fields out of {} total rows",
metadata.selected_row_index,
metadata.valid_field_count,
metadata.total_rows
)).await;
}
Ok(Some(YahooCompanyDetails {
ticker,
sector: extraction.sector,
exchange: extraction.exchange,
}))
} else {
Err(anyhow!("Status 'found' but no ticker present"))
}
},
"no_results" => Ok(None),
"error" => {
let error_msg = extraction.error_message.unwrap_or_else(|| "Unknown error".to_string());
Err(anyhow!("JavaScript extraction error: {}", error_msg))
},
_ => Ok(None),
}
}
/// Collect every ticker symbol from `<data_dir>/corporate/by_name/companies.jsonl`.
///
/// Each non-empty line is parsed as a `CompanyCrossPlatformInfo` record and the
/// tickers of every ISIN entry are flattened into one `Vec` (duplicates kept).
///
/// Errors if the file cannot be read or a non-blank line fails to parse.
pub async fn get_all_tickers_from_companies_jsonl(paths: &DataPaths) -> anyhow::Result<Vec<String>> {
    let corporate_path = paths.data_dir().join("corporate").join("by_name");
    let companies_file = corporate_path.join("companies.jsonl");
    let content = tokio::fs::read_to_string(companies_file).await?;
    let mut tickers = Vec::new();
    for line in content.lines() {
        // Robustness fix: JSONL files frequently contain blank separator or
        // trailing lines; previously these made serde_json error out and abort
        // the whole scan. Skip them instead.
        if line.trim().is_empty() {
            continue;
        }
        let company: CompanyCrossPlatformInfo = serde_json::from_str(line)?;
        for (_isin, ticker_vec) in company.isin_tickers_map {
            tickers.extend(ticker_vec);
        }
    }
    Ok(tickers)
}
/// Fetch up to 100 earnings-calendar events for `ticker` using a browser
/// leased from `pool` (cookie consent is rejected before extraction).
pub async fn fetch_earnings_with_pool(
    pool: &Arc<ChromeDriverPool>,
    ticker: &str,
) -> anyhow::Result<Vec<CompanyEvent>> {
    let ticker = ticker.to_string();
    let url = format!("https://finance.yahoo.com/calendar/earnings?symbol={}&offset=0&size=100", ticker);
    // FIX: the original kept a redundant `ticker_cloned = ticker.clone()`
    // binding; `ticker` is no longer needed after formatting the URL, so it is
    // moved into the factory directly. The clone-per-invocation inside the
    // factory is kept, mirroring the pattern in `scrape_company_details_by_isin`.
    pool.execute(url, move |client| {
        let ticker = ticker.clone();
        Box::pin(async move {
            reject_yahoo_cookies(&client).await?;
            extract_earnings_events(&client, &ticker).await
        })
    }).await
}
/// Scrape earnings-calendar rows for `ticker` from the currently loaded Yahoo
/// earnings page, converting each table row into a `CompanyEvent`.
///
/// Expects a `table[data-test="cal-table"]` to appear; every `tbody tr` with
/// at least 5 cells yields one event (shorter rows are skipped). As the code
/// reads them, the cells are: [0]=date, [1]=time, [2]=period, [3]=EPS
/// forecast, [4]=EPS actual, [5]=surprise % (optional). Revenue fields are not
/// on this page and are always `None`.
///
/// Errors if the table never appears or any cell's text/date fails to read or
/// parse (a single bad row aborts the whole extraction).
pub async fn extract_earnings_events(client: &Client, ticker: &str) -> Result<Vec<CompanyEvent>> {
// Wait for the earnings table to appear before reading rows.
let table = client
.wait()
.for_element(Locator::Css(r#"table[data-test="cal-table"]"#))
.await
.map_err(|e| anyhow!("Failed to find earnings table: {}", e))?;
// Collect all data rows in the table body.
let rows = table
.find_all(Locator::Css("tbody tr"))
.await
.map_err(|e| anyhow!("Failed to find table rows: {}", e))?;
let mut events = Vec::with_capacity(rows.len());
for row in rows {
let cells = row
.find_all(Locator::Css("td"))
.await
.map_err(|e| anyhow!("Failed to find cells in row: {}", e))?;
if cells.len() < 5 {
continue; // Skip incomplete rows
}
// Extract and normalize the date to YYYY-MM-DD.
let date_str = cells[0]
.text()
.await
.map_err(|e| anyhow!("Failed to get date text: {}", e))?;
let date = parse_yahoo_date(&date_str)
.map_err(|e| anyhow!("Failed to parse date '{}': {}", date_str, e))?
.format("%Y-%m-%d")
.to_string();
// Extract time; Yahoo's "Time Not Supplied" placeholder becomes "".
let time = cells[1]
.text()
.await
.map_err(|e| anyhow!("Failed to get time text: {}", e))?
.replace("Time Not Supplied", "");
// Reporting period (e.g. a fiscal quarter label) — stored verbatim.
let period = cells[2]
.text()
.await
.map_err(|e| anyhow!("Failed to get period text: {}", e))?;
// EPS forecast; `parse_float` yields None for non-numeric cell text.
let eps_forecast_str = cells[3]
.text()
.await
.map_err(|e| anyhow!("Failed to get EPS forecast text: {}", e))?;
let eps_forecast = parse_float(&eps_forecast_str);
// EPS actual, same treatment.
let eps_actual_str = cells[4]
.text()
.await
.map_err(|e| anyhow!("Failed to get EPS actual text: {}", e))?;
let eps_actual = parse_float(&eps_actual_str);
// Surprise % column is only present on some rows.
let surprise_pct = if cells.len() > 5 {
let surprise_str = cells[5]
.text()
.await
.map_err(|e| anyhow!("Failed to get surprise text: {}", e))?;
parse_float(&surprise_str)
} else {
None
};
events.push(CompanyEvent {
ticker: ticker.to_string(),
date,
time,
period,
eps_forecast,
eps_actual,
revenue_forecast: None,
revenue_actual: None,
surprise_pct,
source: "Yahoo".to_string(),
});
}
// An empty result is not an error — the ticker may simply have no events.
if events.is_empty() {
logger::log_warn(&format!("Warning: No earnings events extracted for ticker {}", ticker)).await;
} else {
logger::log_info(&format!("Extracted {} earnings events for {}", events.len(), ticker)).await;
}
Ok(events)
}
/// Best-effort dismissal of Yahoo's cookie-consent overlay.
///
/// Polls up to 10 times, 500ms apart, for the consent page's "reject all"
/// button and clicks it via injected JS. A missing button is not an error —
/// the consent prompt may simply not be shown.
async fn reject_yahoo_cookies(client: &Client) -> anyhow::Result<()> {
    // Returns true iff the reject button existed and was clicked.
    const CLICK_REJECT_JS: &str = r#"(() => {
const btn = document.querySelector('#consent-page .reject-all');
if (btn) {
btn.click();
return true;
}
return false;
})()"#;
    let mut attempts = 0;
    while attempts < 10 {
        let clicked = client
            .execute(CLICK_REJECT_JS, vec![])
            .await?
            .as_bool()
            .unwrap_or(false);
        if clicked {
            break;
        }
        attempts += 1;
        sleep(TokioDuration::from_millis(500)).await;
    }
    logger::log_info("Rejected Yahoo cookies if button existed").await;
    Ok(())
}