Add parallelized scraping instances for company Yahoo ticker seeding

This commit is contained in:
2025-12-18 13:05:23 +01:00
parent d26e833d93
commit 9c66f0d361
7 changed files with 842 additions and 68 deletions

View File

@@ -4,7 +4,7 @@ use crate::{scraper::webdriver::*, util::{directories::DataPaths}};
use event_backtest_engine::logger;
use fantoccini::{Client, Locator};
use serde::{Deserialize, Serialize};
use tokio::{time::{Duration as TokioDuration, sleep}};
use tokio::time::{Duration as TokioDuration, sleep, timeout};
use std::{sync::Arc};
use anyhow::{anyhow, Result};
@@ -21,6 +21,16 @@ pub enum YahooTickerResult {
AmbiguousResults,
}
/// Optional bookkeeping that accompanies an extraction result.
///
/// The fields are deserialized from camelCase keys (`selectedRowIndex`,
/// `validFieldCount`, `totalRows`); the container-level `rename_all`
/// attribute produces exactly those names from the snake_case fields.
// NOTE(review): presumably emitted by the in-page JavaScript extraction
// script — confirm against the script's return value.
#[derive(Debug, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct ExtractionMetadata {
    /// Index of the row that was selected (wire key: `selectedRowIndex`).
    pub selected_row_index: usize,
    /// Count of valid fields for the selected row (wire key: `validFieldCount`).
    pub valid_field_count: usize,
    /// Total number of rows considered (wire key: `totalRows`).
    pub total_rows: usize,
}
#[derive(Debug, Deserialize)]
pub struct ExtractionResult {
status: String,
@@ -29,6 +39,8 @@ pub struct ExtractionResult {
exchange: Option<String>,
#[serde(default)]
error_message: Option<String>,
#[serde(default)]
metadata: Option<ExtractionMetadata>,
}
impl YahooTickerResult {
@@ -73,28 +85,99 @@ pub async fn extract_company_details(
client: &Client,
_isin: &str,
) -> Result<Option<YahooCompanyDetails>> {
// Wait for page to load - look for either the table or the no-data element
let wait_result: Result<Result<bool, anyhow::Error>> = timeout(
TokioDuration::from_secs(30),
async {
for _ in 0..60 {
let has_content: bool = client
.execute(
r#"
const table = document.querySelector('#main-content-wrapper > section > section.container.yf-1omxedn > div.tableContainer.yf-1omxedn > div > table');
const noData = document.querySelector('#main-content-wrapper > section > div.noData.yf-1omxedn');
return !!(table || noData);
"#,
vec![],
)
.await
.map_err(|e| anyhow!("Execute error: {}", e))?
.as_bool()
.unwrap_or(false);
if has_content {
return Ok(true);
}
sleep(TokioDuration::from_millis(500)).await;
}
Ok(false)
},
)
.await
.map_err(|_| anyhow!("Timeout waiting for Yahoo Finance page to load"));
match wait_result {
Err(_) => {
return Err(anyhow!("Timeout waiting for Yahoo Finance page to load"));
},
Ok(Err(e)) => {
return Err(anyhow!("Error checking page content: {}", e));
},
Ok(Ok(false)) => {
logger::log_warn("Page content not found after waiting, attempting extraction anyway").await;
},
Ok(Ok(true)) => {
logger::log_info("Page content detected, proceeding with extraction").await;
}
}
// Execute the JavaScript extraction script
let result = client.execute(YAHOO_COMPANY_EXTRACTION_JS, vec![]).await?;
// Log the raw result for debugging
logger::log_info(&format!("JavaScript extraction raw result: {:?}", result)).await;
// Check if result is null
if result.is_null() {
return Err(anyhow!("JavaScript returned null - page may not be fully loaded or script failed"));
}
// Parse the JSON result
let extraction: ExtractionResult = serde_json::from_value(result)
.map_err(|e| anyhow!("Failed to parse extraction result: {}", e))?;
let extraction: ExtractionResult = serde_json::from_value(result.clone())
.map_err(|e| {
// Log the problematic result value for debugging
let result_str = serde_json::to_string_pretty(&result).unwrap_or_else(|_| format!("{:?}", result));
anyhow!("Failed to parse extraction result: {}. Raw result: {}", e, result_str)
})?;
match extraction.status.as_str() {
"found" => {
// Ticker is guaranteed to be present when status is "found"
// Sector and exchange are optional
if let Some(ticker) = extraction.ticker {
// Log metadata if available
if let Some(ref metadata) = extraction.metadata {
logger::log_info(&format!(
"Selected row {} with {} valid fields out of {} total rows",
metadata.selected_row_index,
metadata.valid_field_count,
metadata.total_rows
)).await;
}
Ok(Some(YahooCompanyDetails {
ticker,
sector: extraction.sector,
exchange: extraction.exchange,
}))
} else {
Ok(None)
// This shouldn't happen if JS script is working correctly
Err(anyhow!("Status 'found' but no ticker present"))
}
},
"no_results" => Ok(None),
"not_found" => Ok(None),
"error" => {
// Error status means ticker was not found or extraction failed
let error_msg = extraction.error_message.unwrap_or_else(|| "Unknown error".to_string());
Err(anyhow!("JavaScript extraction error: {}", error_msg))
},