Added parallelized scraping instances for company Yahoo ticker seeding
This commit is contained in:
@@ -4,7 +4,7 @@ use crate::{scraper::webdriver::*, util::{directories::DataPaths}};
|
||||
use event_backtest_engine::logger;
|
||||
use fantoccini::{Client, Locator};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tokio::{time::{Duration as TokioDuration, sleep}};
|
||||
use tokio::time::{Duration as TokioDuration, sleep, timeout};
|
||||
use std::{sync::Arc};
|
||||
use anyhow::{anyhow, Result};
|
||||
|
||||
@@ -21,6 +21,16 @@ pub enum YahooTickerResult {
|
||||
AmbiguousResults,
|
||||
}
|
||||
|
||||
/// Row-selection diagnostics deserialized from the JavaScript extraction output.
///
/// Carried as the optional `metadata` field of `ExtractionResult`; when present
/// on a "found" result, its three counters are written to the info log as
/// "Selected row {} with {} valid fields out of {} total rows".
/// Each field is read from a camelCase JSON key via `#[serde(rename = ...)]`.
#[derive(Debug, Deserialize)]
|
||||
pub struct ExtractionMetadata {
|
||||
/// Read from the `selectedRowIndex` key; logged as the selected row.
// NOTE(review): whether this index is zero- or one-based is decided by the JS script — confirm there.
#[serde(rename = "selectedRowIndex")]
|
||||
pub selected_row_index: usize,
|
||||
/// Read from the `validFieldCount` key; logged as the number of valid fields.
#[serde(rename = "validFieldCount")]
|
||||
pub valid_field_count: usize,
|
||||
/// Read from the `totalRows` key; logged as the total number of rows.
#[serde(rename = "totalRows")]
|
||||
pub total_rows: usize,
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserialize)]
|
||||
pub struct ExtractionResult {
|
||||
status: String,
|
||||
@@ -29,6 +39,8 @@ pub struct ExtractionResult {
|
||||
exchange: Option<String>,
|
||||
#[serde(default)]
|
||||
error_message: Option<String>,
|
||||
#[serde(default)]
|
||||
metadata: Option<ExtractionMetadata>,
|
||||
}
|
||||
|
||||
impl YahooTickerResult {
|
||||
@@ -73,28 +85,99 @@ pub async fn extract_company_details(
|
||||
client: &Client,
|
||||
_isin: &str,
|
||||
) -> Result<Option<YahooCompanyDetails>> {
|
||||
// Wait for page to load - look for either the table or the no-data element
|
||||
let wait_result: Result<Result<bool, anyhow::Error>> = timeout(
|
||||
TokioDuration::from_secs(30),
|
||||
async {
|
||||
for _ in 0..60 {
|
||||
let has_content: bool = client
|
||||
.execute(
|
||||
r#"
|
||||
const table = document.querySelector('#main-content-wrapper > section > section.container.yf-1omxedn > div.tableContainer.yf-1omxedn > div > table');
|
||||
const noData = document.querySelector('#main-content-wrapper > section > div.noData.yf-1omxedn');
|
||||
return !!(table || noData);
|
||||
"#,
|
||||
vec![],
|
||||
)
|
||||
.await
|
||||
.map_err(|e| anyhow!("Execute error: {}", e))?
|
||||
.as_bool()
|
||||
.unwrap_or(false);
|
||||
|
||||
if has_content {
|
||||
return Ok(true);
|
||||
}
|
||||
|
||||
sleep(TokioDuration::from_millis(500)).await;
|
||||
}
|
||||
Ok(false)
|
||||
},
|
||||
)
|
||||
.await
|
||||
.map_err(|_| anyhow!("Timeout waiting for Yahoo Finance page to load"));
|
||||
|
||||
match wait_result {
|
||||
Err(_) => {
|
||||
return Err(anyhow!("Timeout waiting for Yahoo Finance page to load"));
|
||||
},
|
||||
Ok(Err(e)) => {
|
||||
return Err(anyhow!("Error checking page content: {}", e));
|
||||
},
|
||||
Ok(Ok(false)) => {
|
||||
logger::log_warn("Page content not found after waiting, attempting extraction anyway").await;
|
||||
},
|
||||
Ok(Ok(true)) => {
|
||||
logger::log_info("Page content detected, proceeding with extraction").await;
|
||||
}
|
||||
}
|
||||
|
||||
// Execute the JavaScript extraction script
|
||||
let result = client.execute(YAHOO_COMPANY_EXTRACTION_JS, vec![]).await?;
|
||||
|
||||
// Log the raw result for debugging
|
||||
logger::log_info(&format!("JavaScript extraction raw result: {:?}", result)).await;
|
||||
|
||||
// Check if result is null
|
||||
if result.is_null() {
|
||||
return Err(anyhow!("JavaScript returned null - page may not be fully loaded or script failed"));
|
||||
}
|
||||
|
||||
// Parse the JSON result
|
||||
let extraction: ExtractionResult = serde_json::from_value(result)
|
||||
.map_err(|e| anyhow!("Failed to parse extraction result: {}", e))?;
|
||||
let extraction: ExtractionResult = serde_json::from_value(result.clone())
|
||||
.map_err(|e| {
|
||||
// Log the problematic result value for debugging
|
||||
let result_str = serde_json::to_string_pretty(&result).unwrap_or_else(|_| format!("{:?}", result));
|
||||
anyhow!("Failed to parse extraction result: {}. Raw result: {}", e, result_str)
|
||||
})?;
|
||||
|
||||
match extraction.status.as_str() {
|
||||
"found" => {
|
||||
// Ticker is guaranteed to be present when status is "found"
|
||||
// Sector and exchange are optional
|
||||
if let Some(ticker) = extraction.ticker {
|
||||
// Log metadata if available
|
||||
if let Some(ref metadata) = extraction.metadata {
|
||||
logger::log_info(&format!(
|
||||
"Selected row {} with {} valid fields out of {} total rows",
|
||||
metadata.selected_row_index,
|
||||
metadata.valid_field_count,
|
||||
metadata.total_rows
|
||||
)).await;
|
||||
}
|
||||
|
||||
Ok(Some(YahooCompanyDetails {
|
||||
ticker,
|
||||
sector: extraction.sector,
|
||||
exchange: extraction.exchange,
|
||||
}))
|
||||
} else {
|
||||
Ok(None)
|
||||
// This shouldn't happen if JS script is working correctly
|
||||
Err(anyhow!("Status 'found' but no ticker present"))
|
||||
}
|
||||
},
|
||||
"no_results" => Ok(None),
|
||||
"not_found" => Ok(None),
|
||||
"error" => {
|
||||
// Error status means ticker was not found or extraction failed
|
||||
let error_msg = extraction.error_message.unwrap_or_else(|| "Unknown error".to_string());
|
||||
Err(anyhow!("JavaScript extraction error: {}", error_msg))
|
||||
},
|
||||
|
||||
Reference in New Issue
Block a user