Added parallelized scraping instances for company Yahoo ticker seeding

2025-12-18 13:05:23 +01:00
parent d26e833d93
commit 9c66f0d361
7 changed files with 842 additions and 68 deletions
--- a/src/corporate/yahoo.rs
+++ b/src/corporate/yahoo.rs
@@ -4,7 +4,7 @@ use crate::{scraper::webdriver::*, util::{directories::DataPaths}};
 use event_backtest_engine::logger;
 use fantoccini::{Client, Locator};
 use serde::{Deserialize, Serialize};
-use tokio::{time::{Duration as TokioDuration, sleep}};
+use tokio::time::{Duration as TokioDuration, sleep, timeout};
 use std::{sync::Arc};
 use anyhow::{anyhow, Result};

@@ -21,6 +21,16 @@ pub enum YahooTickerResult {
    AmbiguousResults,
 }

+#[derive(Debug, Deserialize)] // deserialize-only: filled from the optional `metadata` field of `ExtractionResult`
+pub struct ExtractionMetadata { // row-selection statistics reported alongside an extraction result
+    #[serde(rename = "selectedRowIndex")] // JSON key is camelCase; Rust field is snake_case
+    pub selected_row_index: usize, // index of the row that was selected (logged as "Selected row {}")
+    #[serde(rename = "validFieldCount")] // JSON key is camelCase; Rust field is snake_case
+    pub valid_field_count: usize, // number of valid fields in the selected row (logged as "{} valid fields")
+    #[serde(rename = "totalRows")] // JSON key is camelCase; Rust field is snake_case
+    pub total_rows: usize, // total number of rows considered (logged as "out of {} total rows")
+} // NOTE(review): whether selected_row_index is zero-based is decided by the JS script, not visible here; confirm
+
 #[derive(Debug, Deserialize)]
 pub struct ExtractionResult {
    status: String,
@@ -29,6 +39,8 @@ pub struct ExtractionResult {
    exchange: Option<String>,
    #[serde(default)]
    error_message: Option<String>,
+    #[serde(default)]
+    metadata: Option<ExtractionMetadata>,
 }

 impl YahooTickerResult {
@@ -73,28 +85,99 @@ pub async fn extract_company_details(
    client: &Client,
    _isin: &str,
 ) -> Result<Option<YahooCompanyDetails>> {
+    // Wait for page to load - look for either the table or the no-data element
+    let wait_result: Result<Result<bool, anyhow::Error>> = timeout(
+        TokioDuration::from_secs(30),
+        async {
+            for _ in 0..60 {
+                let has_content: bool = client
+                    .execute(
+                        r#"
+                        const table = document.querySelector('#main-content-wrapper > section > section.container.yf-1omxedn > div.tableContainer.yf-1omxedn > div > table');
+                        const noData = document.querySelector('#main-content-wrapper > section > div.noData.yf-1omxedn');
+                        return !!(table || noData);
+                        "#,
+                        vec![],
+                    )
+                    .await
+                    .map_err(|e| anyhow!("Execute error: {}", e))?
+                    .as_bool()
+                    .unwrap_or(false);
+
+                if has_content {
+                    return Ok(true);
+                }
+
+                sleep(TokioDuration::from_millis(500)).await;
+            }
+            Ok(false)
+        },
+    )
+    .await
+    .map_err(|_| anyhow!("Timeout waiting for Yahoo Finance page to load"));
+    
+    match wait_result {
+        Err(_) => {
+            return Err(anyhow!("Timeout waiting for Yahoo Finance page to load"));
+        },
+        Ok(Err(e)) => {
+            return Err(anyhow!("Error checking page content: {}", e));
+        },
+        Ok(Ok(false)) => {
+            logger::log_warn("Page content not found after waiting, attempting extraction anyway").await;
+        },
+        Ok(Ok(true)) => {
+            logger::log_info("Page content detected, proceeding with extraction").await;
+        }
+    }
+    
    // Execute the JavaScript extraction script
    let result = client.execute(YAHOO_COMPANY_EXTRACTION_JS, vec![]).await?;
    
+    // Log the raw result for debugging
+    logger::log_info(&format!("JavaScript extraction raw result: {:?}", result)).await;
+    
+    // Check if result is null
+    if result.is_null() {
+        return Err(anyhow!("JavaScript returned null - page may not be fully loaded or script failed"));
+    }
+    
    // Parse the JSON result
-    let extraction: ExtractionResult = serde_json::from_value(result)
-        .map_err(|e| anyhow!("Failed to parse extraction result: {}", e))?;
+    let extraction: ExtractionResult = serde_json::from_value(result.clone())
+        .map_err(|e| {
+            // Log the problematic result value for debugging
+            let result_str = serde_json::to_string_pretty(&result).unwrap_or_else(|_| format!("{:?}", result));
+            anyhow!("Failed to parse extraction result: {}. Raw result: {}", e, result_str)
+        })?;
    
    match extraction.status.as_str() {
        "found" => {
+            // Ticker is guaranteed to be present when status is "found"
+            // Sector and exchange are optional
            if let Some(ticker) = extraction.ticker {
+                // Log metadata if available
+                if let Some(ref metadata) = extraction.metadata {
+                    logger::log_info(&format!(
+                        "Selected row {} with {} valid fields out of {} total rows",
+                        metadata.selected_row_index,
+                        metadata.valid_field_count,
+                        metadata.total_rows
+                    )).await;
+                }
+                
                Ok(Some(YahooCompanyDetails {
                    ticker,
                    sector: extraction.sector,
                    exchange: extraction.exchange,
                }))
            } else {
-                Ok(None)
+                // This shouldn't happen if JS script is working correctly
+                Err(anyhow!("Status 'found' but no ticker present"))
            }
        },
        "no_results" => Ok(None),
-        "not_found" => Ok(None),
        "error" => {
+            // Error status means ticker was not found or extraction failed
            let error_msg = extraction.error_message.unwrap_or_else(|| "Unknown error".to_string());
            Err(anyhow!("JavaScript extraction error: {}", error_msg))
        },