454 lines
15 KiB
Rust
454 lines
15 KiB
Rust
// src/corporate/yahoo.rs
|
|
use super::{types::*, helpers::*, page_validation::*};
|
|
use crate::{scraper::webdriver::*, util::{directories::DataPaths}};
|
|
use crate::logger;
|
|
use fantoccini::{Client, Locator};
|
|
use rand::Rng;
|
|
use serde::{Deserialize, Serialize};
|
|
use tokio::time::{Duration as TokioDuration, sleep, timeout};
|
|
use std::{sync::Arc, sync::atomic::{AtomicBool, Ordering}};
|
|
use anyhow::{anyhow, Result};
|
|
|
|
/// JavaScript bundled at compile time that scrapes the Yahoo lookup results
/// table and returns a JSON payload deserialized as `ExtractionResult`.
const YAHOO_COMPANY_EXTRACTION_JS: &str = include_str!("yahoo_company_extraction.js");
|
|
|
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
pub enum YahooTickerResult {
|
|
Found(String),
|
|
NotFound,
|
|
NoResults,
|
|
AmbiguousResults,
|
|
}
|
|
|
|
#[derive(Debug, Deserialize)]
|
|
pub struct ExtractionMetadata {
|
|
#[serde(rename = "selectedRowIndex")]
|
|
pub selected_row_index: usize,
|
|
#[serde(rename = "validFieldCount")]
|
|
pub valid_field_count: usize,
|
|
#[serde(rename = "totalRows")]
|
|
pub total_rows: usize,
|
|
}
|
|
|
|
/// Deserialized JSON payload produced by `YAHOO_COMPANY_EXTRACTION_JS`.
#[derive(Debug, Deserialize)]
pub struct ExtractionResult {
    // Discriminator: "found", "no_results", or "error"; any other value is
    // treated as no result by `extract_company_details`.
    status: String,
    // Ticker symbol; expected to be present whenever `status == "found"`.
    ticker: Option<String>,
    // Sector label from the selected lookup row, when available.
    sector: Option<String>,
    // Exchange label from the selected lookup row, when available.
    exchange: Option<String>,
    // Populated by the script when `status == "error"`; absent otherwise.
    #[serde(default)]
    error_message: Option<String>,
    // Row-selection diagnostics; the script may omit this entirely.
    #[serde(default)]
    metadata: Option<ExtractionMetadata>,
}
|
|
|
|
impl YahooTickerResult {
|
|
pub fn to_tagged_string(&self) -> String {
|
|
match self {
|
|
YahooTickerResult::Found(ticker) => format!("YAHOO:{}", ticker),
|
|
YahooTickerResult::NotFound => "YAHOO:NOT_FOUND".to_string(),
|
|
YahooTickerResult::NoResults => "YAHOO:NO_RESULTS".to_string(),
|
|
YahooTickerResult::AmbiguousResults => "YAHOO:AMBIGUOUS".to_string(),
|
|
}
|
|
}
|
|
|
|
pub fn is_found(&self) -> bool {
|
|
matches!(self, YahooTickerResult::Found(_))
|
|
}
|
|
|
|
pub fn get_ticker(&self) -> Option<&str> {
|
|
match self {
|
|
YahooTickerResult::Found(ticker) => Some(ticker),
|
|
_ => None,
|
|
}
|
|
}
|
|
}
|
|
|
|
/// UPDATED: Scrape company details with full validation and shutdown support
|
|
pub async fn scrape_company_details_by_isin(
|
|
pool: &Arc<ChromeDriverPool>,
|
|
isin: &str,
|
|
shutdown_flag: &Arc<AtomicBool>,
|
|
) -> anyhow::Result<Option<YahooCompanyDetails>> {
|
|
// Check shutdown before starting
|
|
if shutdown_flag.load(Ordering::SeqCst) {
|
|
logger::log_warn(&format!("Shutdown detected, skipping ISIN: {}", isin)).await;
|
|
return Ok(None);
|
|
}
|
|
|
|
if pool.should_perform_hard_reset() {
|
|
logger::log_warn("HARD_RESET_REQUIRED detected before starting ISIN scrape").await;
|
|
return Err(anyhow!("HARD_RESET_REQUIRED"));
|
|
}
|
|
|
|
let isin_owned = isin.to_string();
|
|
let shutdown_clone = Arc::clone(shutdown_flag);
|
|
let url = format!("https://finance.yahoo.com/lookup/?s={}", isin);
|
|
|
|
pool.execute(url.clone(), move |client| {
|
|
let isin = isin_owned.clone();
|
|
let shutdown = shutdown_clone.clone();
|
|
|
|
Box::pin(async move {
|
|
// Check shutdown during task execution
|
|
if shutdown.load(Ordering::SeqCst) {
|
|
return Err(anyhow!("Task aborted due to shutdown"));
|
|
}
|
|
|
|
// Random delay
|
|
let delay = rand::rng().random_range(800..1500);
|
|
sleep(TokioDuration::from_millis(delay)).await;
|
|
|
|
// Reject cookies
|
|
reject_yahoo_cookies(&client).await?;
|
|
|
|
// Check shutdown again
|
|
if shutdown.load(Ordering::SeqCst) {
|
|
return Err(anyhow!("Task aborted due to shutdown"));
|
|
}
|
|
|
|
// CRITICAL: Validate navigation succeeded
|
|
let expected_fragment = format!("lookup/?s={}", isin);
|
|
match verify_navigation(&client, &expected_fragment, 5).await {
|
|
Ok(_) => {
|
|
logger::log_info(&format!("✓ Navigation validated for ISIN: {}", isin)).await;
|
|
}
|
|
Err(e) => {
|
|
logger::log_error(&format!(
|
|
"Navigation verification failed for ISIN {}: {}",
|
|
isin, e
|
|
)).await;
|
|
// Clear browser state before returning error
|
|
clear_browser_state(&client).await.ok();
|
|
return Err(e);
|
|
}
|
|
}
|
|
|
|
// Additional content validation
|
|
let page_ready: bool = client
|
|
.execute(
|
|
r#"
|
|
const table = document.querySelector('#main-content-wrapper > section > section.container.yf-1omxedn > div.tableContainer.yf-1omxedn > div > table');
|
|
const noData = document.querySelector('#main-content-wrapper > section > div.noData.yf-1omxedn');
|
|
return !!(table || noData);
|
|
"#,
|
|
vec![],
|
|
)
|
|
.await?
|
|
.as_bool()
|
|
.unwrap_or(false);
|
|
|
|
if !page_ready {
|
|
logger::log_error(&format!(
|
|
"Page content not ready for ISIN {} - neither table nor no-data element found",
|
|
isin
|
|
)).await;
|
|
clear_browser_state(&client).await.ok();
|
|
return Err(anyhow!("Page content not ready"));
|
|
}
|
|
|
|
logger::log_info(&format!("✓ Page content validated for ISIN: {}", isin)).await;
|
|
|
|
// Check shutdown before extraction
|
|
if shutdown.load(Ordering::SeqCst) {
|
|
return Err(anyhow!("Task aborted due to shutdown"));
|
|
}
|
|
|
|
// Random delay before extraction
|
|
let delay = rand::rng().random_range(800..1500);
|
|
sleep(TokioDuration::from_millis(delay)).await;
|
|
|
|
// Now safe to extract
|
|
extract_company_details_validated(&client, &isin).await
|
|
})
|
|
}).await
|
|
}
|
|
|
|
/// UPDATED: Extract with additional URL validation
|
|
async fn extract_company_details_validated(
|
|
client: &Client,
|
|
isin: &str,
|
|
) -> Result<Option<YahooCompanyDetails>> {
|
|
// Double-check URL is still correct before extraction
|
|
let current_url = client.current_url().await?;
|
|
if !current_url.as_str().contains(isin) {
|
|
logger::log_error(&format!(
|
|
"URL mismatch before extraction: expected ISIN '{}' in URL, got '{}'",
|
|
isin,
|
|
current_url.as_str()
|
|
)).await;
|
|
clear_browser_state(client).await.ok();
|
|
return Err(anyhow!("URL mismatch - possible stale page"));
|
|
}
|
|
|
|
// Run extraction
|
|
let result = extract_company_details(client, isin).await?;
|
|
|
|
// Validate extraction result
|
|
if let Some(ref details) = result {
|
|
logger::log_info(&format!(
|
|
"✓ Extracted ticker '{}' for ISIN {} (sector: {:?}, exchange: {:?})",
|
|
details.ticker, isin, details.sector, details.exchange
|
|
)).await;
|
|
} else {
|
|
logger::log_info(&format!(
|
|
"No ticker found for ISIN {} (legitimately not found)",
|
|
isin
|
|
)).await;
|
|
}
|
|
|
|
Ok(result)
|
|
}
|
|
|
|
/// Execute the bundled JS extraction script against the currently loaded
/// Yahoo lookup page and convert its status payload into
/// `Option<YahooCompanyDetails>`.
///
/// Returns `Ok(None)` for `no_results` (or an unrecognized status),
/// `Ok(Some(..))` for `found`, and an error for script failures, malformed
/// JSON, or a null script result.
pub async fn extract_company_details(
    client: &Client,
    _isin: &str,
) -> Result<Option<YahooCompanyDetails>> {
    // Wait for page to load - look for either the table or the no-data element.
    // The outer 30 s timeout backstops the inner 60 x 500 ms poll loop (whose
    // sleeps alone span ~30 s, excluding `execute` round-trip time).
    let wait_result: Result<Result<bool, anyhow::Error>> = timeout(
        TokioDuration::from_secs(30),
        async {
            for _ in 0..60 {
                let has_content: bool = client
                    .execute(
                        r#"
                        const table = document.querySelector('#main-content-wrapper > section > section.container.yf-1omxedn > div.tableContainer.yf-1omxedn > div > table');
                        const noData = document.querySelector('#main-content-wrapper > section > div.noData.yf-1omxedn');
                        return !!(table || noData);
                        "#,
                        vec![],
                    )
                    .await
                    .map_err(|e| anyhow!("Execute error: {}", e))?
                    .as_bool()
                    .unwrap_or(false);

                if has_content {
                    return Ok(true);
                }

                sleep(TokioDuration::from_millis(500)).await;
            }
            // Poll budget exhausted without finding either element.
            Ok(false)
        },
    )
    .await
    .map_err(|_| anyhow!("Timeout waiting for Yahoo Finance page to load"));

    match wait_result {
        // Outer timeout elapsed before the poll loop finished.
        Err(_) => {
            return Err(anyhow!("Timeout waiting for Yahoo Finance page to load"));
        },
        // The WebDriver `execute` call itself failed while polling.
        Ok(Err(e)) => {
            return Err(anyhow!("Error checking page content: {}", e));
        },
        // Polling exhausted; extraction may still work, so only warn.
        Ok(Ok(false)) => {
            logger::log_warn("Page content not found after waiting, attempting extraction anyway").await;
        },
        Ok(Ok(true)) => {
            logger::log_info("Page content detected, proceeding with extraction").await;
        }
    }

    // Execute the JavaScript extraction script
    let result = client.execute(YAHOO_COMPANY_EXTRACTION_JS, vec![]).await?;

    // Log the raw result for debugging
    logger::log_info(&format!("JavaScript extraction raw result: {:?}", result)).await;

    // A null result means the script produced nothing usable.
    if result.is_null() {
        return Err(anyhow!("JavaScript returned null - page may not be fully loaded or script failed"));
    }

    // Parse the JSON result; include the raw payload in the error for debugging.
    let extraction: ExtractionResult = serde_json::from_value(result.clone())
        .map_err(|e| {
            let result_str = serde_json::to_string_pretty(&result).unwrap_or_else(|_| format!("{:?}", result));
            anyhow!("Failed to parse extraction result: {}. Raw result: {}", e, result_str)
        })?;

    match extraction.status.as_str() {
        "found" => {
            if let Some(ticker) = extraction.ticker {
                // Diagnostics about which row the script picked, if provided.
                if let Some(ref metadata) = extraction.metadata {
                    logger::log_info(&format!(
                        "Selected row {} with {} valid fields out of {} total rows",
                        metadata.selected_row_index,
                        metadata.valid_field_count,
                        metadata.total_rows
                    )).await;
                }

                Ok(Some(YahooCompanyDetails {
                    ticker,
                    sector: extraction.sector,
                    exchange: extraction.exchange,
                }))
            } else {
                // Script contract violation: "found" must carry a ticker.
                Err(anyhow!("Status 'found' but no ticker present"))
            }
        },
        "no_results" => Ok(None),
        "error" => {
            let error_msg = extraction.error_message.unwrap_or_else(|| "Unknown error".to_string());
            Err(anyhow!("JavaScript extraction error: {}", error_msg))
        },
        // Unknown status strings are treated the same as no result.
        _ => Ok(None),
    }
}
|
|
|
|
pub async fn get_all_tickers_from_companies_jsonl(paths: &DataPaths) -> anyhow::Result<Vec<String>> {
|
|
let corporate_path = paths.data_dir().join("corporate").join("by_name");
|
|
let companies_file = corporate_path.join("companies.jsonl");
|
|
let content = tokio::fs::read_to_string(companies_file).await?;
|
|
let mut tickers = Vec::new();
|
|
for line in content.lines() {
|
|
let company: CompanyCrossPlatformInfo = serde_json::from_str(line)?;
|
|
for (_isin, ticker_vec) in company.isin_tickers_map {
|
|
tickers.extend(ticker_vec);
|
|
}
|
|
}
|
|
Ok(tickers)
|
|
}
|
|
|
|
pub async fn fetch_earnings_with_pool(
|
|
pool: &Arc<ChromeDriverPool>,
|
|
ticker: &str,
|
|
) -> anyhow::Result<Vec<CompanyEvent>> {
|
|
let ticker = ticker.to_string();
|
|
let url = format!("https://finance.yahoo.com/calendar/earnings?symbol={}&offset=0&size=100", ticker);
|
|
|
|
let ticker_cloned = ticker.clone();
|
|
|
|
pool.execute(url, move |client| {
|
|
let ticker = ticker_cloned.clone();
|
|
Box::pin(async move {
|
|
reject_yahoo_cookies(&client).await?;
|
|
extract_earnings_events(&client, &ticker).await
|
|
})
|
|
}).await
|
|
}
|
|
|
|
/// Scrape the earnings-calendar table on the currently loaded Yahoo page into
/// `CompanyEvent`s tagged with `ticker`.
///
/// Rows with fewer than 5 cells are skipped. Revenue fields are not present
/// in this table and are always `None`. An empty result is logged as a
/// warning but is not an error.
pub async fn extract_earnings_events(client: &Client, ticker: &str) -> Result<Vec<CompanyEvent>> {
    // Wait for the table to load
    let table = client
        .wait()
        .for_element(Locator::Css(r#"table[data-test="cal-table"]"#))
        .await
        .map_err(|e| anyhow!("Failed to find earnings table: {}", e))?;

    // Find all rows in tbody
    let rows = table
        .find_all(Locator::Css("tbody tr"))
        .await
        .map_err(|e| anyhow!("Failed to find table rows: {}", e))?;

    let mut events = Vec::with_capacity(rows.len());

    for row in rows {
        let cells = row
            .find_all(Locator::Css("td"))
            .await
            .map_err(|e| anyhow!("Failed to find cells in row: {}", e))?;

        if cells.len() < 5 {
            continue; // Skip incomplete rows
        }

        // Extract and parse date (cell 0), normalized to YYYY-MM-DD.
        let date_str = cells[0]
            .text()
            .await
            .map_err(|e| anyhow!("Failed to get date text: {}", e))?;
        let date = parse_yahoo_date(&date_str)
            .map_err(|e| anyhow!("Failed to parse date '{}': {}", date_str, e))?
            .format("%Y-%m-%d")
            .to_string();

        // Extract time (cell 1), replace "Time Not Supplied" with empty
        let time = cells[1]
            .text()
            .await
            .map_err(|e| anyhow!("Failed to get time text: {}", e))?
            .replace("Time Not Supplied", "");

        // Extract period label (cell 2) — presumably a fiscal-quarter string.
        let period = cells[2]
            .text()
            .await
            .map_err(|e| anyhow!("Failed to get period text: {}", e))?;

        // Parse EPS forecast (cell 3); `parse_float` returns an Option.
        let eps_forecast_str = cells[3]
            .text()
            .await
            .map_err(|e| anyhow!("Failed to get EPS forecast text: {}", e))?;
        let eps_forecast = parse_float(&eps_forecast_str);

        // Parse EPS actual (cell 4)
        let eps_actual_str = cells[4]
            .text()
            .await
            .map_err(|e| anyhow!("Failed to get EPS actual text: {}", e))?;
        let eps_actual = parse_float(&eps_actual_str);

        // Parse surprise % (cell 5) only when the row has that column.
        let surprise_pct = if cells.len() > 5 {
            let surprise_str = cells[5]
                .text()
                .await
                .map_err(|e| anyhow!("Failed to get surprise text: {}", e))?;
            parse_float(&surprise_str)
        } else {
            None
        };

        events.push(CompanyEvent {
            ticker: ticker.to_string(),
            date,
            time,
            period,
            eps_forecast,
            eps_actual,
            // Revenue columns are not scraped from this table.
            revenue_forecast: None,
            revenue_actual: None,
            surprise_pct,
            source: "Yahoo".to_string(),
        });
    }

    if events.is_empty() {
        logger::log_warn(&format!("Warning: No earnings events extracted for ticker {}", ticker)).await;
    } else {
        logger::log_info(&format!("Extracted {} earnings events for {}", events.len(), ticker)).await;
    }

    Ok(events)
}
|
|
|
|
/// Rejecting Yahoo Cookies
|
|
async fn reject_yahoo_cookies(client: &Client) -> anyhow::Result<()> {
|
|
for _ in 0..10 {
|
|
let clicked: bool = client
|
|
.execute(
|
|
r#"(() => {
|
|
const btn = document.querySelector('#consent-page .reject-all');
|
|
if (btn) {
|
|
btn.click();
|
|
return true;
|
|
}
|
|
return false;
|
|
})()"#,
|
|
vec![],
|
|
)
|
|
.await?
|
|
.as_bool()
|
|
.unwrap_or(false);
|
|
|
|
if clicked { break; }
|
|
sleep(TokioDuration::from_millis(500)).await;
|
|
}
|
|
|
|
logger::log_info("Rejected Yahoo cookies if button existed").await;
|
|
Ok(())
|
|
} |