Re-added Yahoo

This commit is contained in:
2026-01-05 17:00:42 +01:00
parent 86944a9c58
commit 3d16475b79
14 changed files with 2717 additions and 49 deletions

7
Cargo.lock generated
View File

@@ -3454,6 +3454,12 @@ dependencies = [
"serde", "serde",
] ]
[[package]]
name = "urlencoding"
version = "2.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "daf8dba3b7eb870caf1ddeed7bc9d2a049f3cfdfae7cb521b087cc33ae4c49da"
[[package]] [[package]]
name = "utf-8" name = "utf-8"
version = "0.7.6" version = "0.7.6"
@@ -3631,6 +3637,7 @@ dependencies = [
"tracing", "tracing",
"tracing-subscriber", "tracing-subscriber",
"url", "url",
"urlencoding",
"walkdir", "walkdir",
"yfinance-rs", "yfinance-rs",
"zip", "zip",

View File

@@ -17,11 +17,12 @@ categories = ["finance", "data-structures", "asynchronous"]
tokio = { version = "1.38", features = ["full"] } tokio = { version = "1.38", features = ["full"] }
# Web scraping & HTTP # Web scraping & HTTP
reqwest = { version = "0.12", features = ["json", "gzip", "brotli", "deflate", "blocking"] } reqwest = { version = "0.12", features = ["json", "gzip", "brotli", "deflate", "blocking", "socks", "cookies"] }
scraper = "0.19" # HTML parsing for Yahoo earnings pages scraper = "0.19" # HTML parsing for Yahoo earnings pages
fantoccini = { version = "0.20", features = ["rustls-tls"] } # Headless Chrome for finanzen.net fantoccini = { version = "0.20", features = ["rustls-tls"] } # Headless Chrome for finanzen.net
yfinance-rs = "0.7.2" yfinance-rs = "0.7.2"
url = "2.5.7" url = "2.5.7"
urlencoding = "2.1"
# Serialization # Serialization
serde = { version = "1.0", features = ["derive"] } serde = { version = "1.0", features = ["derive"] }

View File

@@ -1,15 +1,55 @@
// src/corporate/update.rs - UPDATED WITH DATA INTEGRITY FIXES // src/corporate/update.rs - WITH ABORT-SAFE INCREMENTAL PERSISTENCE
use super::{scraper::*, storage::*, helpers::*, types::*, openfigi::*}; use super::{scraper::*, storage::*, helpers::*, types::*, openfigi::*, atomic_writer::*};
use crate::config::Config; use crate::config::Config;
use crate::corporate::update_parallel::build_companies_jsonl_streaming_parallel; use crate::corporate::update_parallel::build_companies_jsonl_streaming_parallel;
use crate::util::directories::DataPaths; use crate::util::directories::DataPaths;
use crate::util::logger; use crate::util::logger;
use crate::scraper::webdriver::ChromeDriverPool; use crate::scraper::webdriver::ChromeDriverPool;
use crate::scraper::yahoo::{YahooClientPool, QuoteSummaryModule};
use chrono::Local; use std::result::Result::Ok;
use chrono::{Local, Utc};
use std::collections::HashMap; use std::collections::HashMap;
use std::sync::Arc; use std::sync::Arc;
use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
use tokio::fs::{File, OpenOptions};
use tokio::io::{AsyncBufReadExt, AsyncWriteExt, BufReader};
use futures::stream::{FuturesUnordered, StreamExt};
use serde_json::json;
use serde::{Deserialize, Serialize};
use tokio::sync::mpsc;
use std::time::Duration;
/// Outcome of validating a single company, as produced by
/// `process_company_with_validation`.
#[derive(Debug, Clone)]
pub enum CompanyProcessResult {
/// The company passed every check; carries the record to persist.
Valid(CompanyCrossPlatformInfo),
/// The market cap computed by `extract_market_cap` was below the minimum threshold.
FilteredLowCap { name: String, market_cap: f64 },
/// `check_recent_price_activity` reported no recent price data.
FilteredNoPrice { name: String },
/// Validation could not be completed. `is_transient` is the classification from
/// `is_transient_error` (or `false` when the company has no usable Yahoo ticker).
Failed { company: CompanyCrossPlatformInfo, error: String, is_transient: bool },
}
/// Command sent over the mpsc channel to the single log-writer task, so that all
/// file writes are serialized through one place.
enum LogCommand {
/// Append this company to the update log and record it in the in-memory map.
Write(CompanyCrossPlatformInfo),
/// Sync the log, rewrite the checkpoint file from the in-memory map, then clear the log.
Checkpoint,
/// Stop the writer loop.
Shutdown,
}
/// Payload returned by a spawned validation task.
/// In this file it is only constructed for companies that passed validation.
struct CompanyTaskResult {
// The validated company record.
company: CompanyCrossPlatformInfo,
// The full processing outcome for that company.
result: CompanyProcessResult,
}
/// Decide whether `company` still has to be validated.
///
/// Returns `false` when `existing_companies` already holds an entry keyed by the
/// company's `name` (i.e. it is present in the cleaned output), `true` otherwise.
fn company_needs_processing(
    company: &CompanyCrossPlatformInfo,
    existing_companies: &HashMap<String, CompanyCrossPlatformInfo>,
) -> bool {
    existing_companies.get(&company.name).is_none()
}
/// Main corporate update entry point with shutdown awareness /// Main corporate update entry point with shutdown awareness
pub async fn run_full_update( pub async fn run_full_update(
@@ -87,47 +127,46 @@ pub async fn run_full_update(
} }
logger::log_info("Step 6: Cleansing up companies with missing essential data...").await; logger::log_info("Step 6: Cleansing up companies with missing essential data...").await;
let cleansed_count = companies_yahoo_jsonl(&paths).await?; let cleansed_count = companies_yahoo_cleansed_no_data(&paths).await?;
logger::log_info(&format!("{} companies found on Yahoo ready for further use in companies_yahoo.jsonl", cleansed_count)).await; logger::log_info(&format!("{} companies found on Yahoo ready for further use in companies_yahoo.jsonl", cleansed_count)).await;
if shutdown_flag.load(Ordering::SeqCst) {
logger::log_warn("Shutdown detected after companies.jsonl build").await;
return Ok(());
}
logger::log_info("Step 7: Cleansing up companies with too low profile (with abort-safe persistence)...").await;
let proxy_pool = pool.get_proxy_pool()
.ok_or_else(|| anyhow::anyhow!("ChromeDriverPool must be created with VPN proxy rotation enabled"))?;
let cleansed_count = companies_yahoo_cleansed_low_profile(&paths, _config, proxy_pool, shutdown_flag).await?;
logger::log_info(&format!("{} companies with sufficient profile ready for analytics", cleansed_count)).await;
if !shutdown_flag.load(Ordering::SeqCst) { if !shutdown_flag.load(Ordering::SeqCst) {
logger::log_info("Step 7: Processing events (using index)...").await; logger::log_info("Step 8: Processing events (using index)...").await;
let _event_index = build_event_index(&paths).await?; let _event_index = build_event_index(&paths).await?;
logger::log_info(" ✓ Event index built").await; logger::log_info(" ✓ Event index built").await;
} else { } else {
logger::log_warn("Shutdown detected, skipping event index build").await; logger::log_warn("Shutdown detected, skipping event index build").await;
} }
logger::log_info(" Corporate update complete").await; logger::log_info(" Corporate update complete").await;
Ok(()) Ok(())
} }
/// Cleansing function to remove companies with missing essential yahoo data for integrity /// Cleansing function to remove companies with missing essential yahoo data for integrity
/// Has to contain a ticker with 'YAHOO:'; Entries with 'YAHOO:NO_RESULTS' and 'YAHOO:ERROR' are removed pub async fn companies_yahoo_cleansed_no_data(paths: &DataPaths) -> Result<usize, anyhow::Error> {
/// The rest stays unchanged
///
/// Uses state.jsonl to track completion and avoid re-running the cleansing operation
/// The '.jsonl' will be saved in the same path but 'companies_yahoo.jsonl'
/// Only execute when 'companies.jsonl' is present
pub async fn companies_yahoo_jsonl(paths: &DataPaths) -> anyhow::Result<usize> {
use tokio::fs::File;
use tokio::io::{AsyncBufReadExt, AsyncWriteExt, BufReader};
use serde_json::json;
let data_path = paths.data_dir(); let data_path = paths.data_dir();
let input_path = data_path.join("companies.jsonl"); let input_path = data_path.join("companies.jsonl");
let output_path = data_path.join("companies_yahoo.jsonl"); let output_path = data_path.join("companies_yahoo.jsonl");
let state_path = data_path.join("state.jsonl"); let state_path = data_path.join("state.jsonl");
// Check if input file exists
if !input_path.exists() { if !input_path.exists() {
logger::log_warn("companies.jsonl not found, skipping cleansing").await; logger::log_warn("companies.jsonl not found, skipping cleansing").await;
return Ok(0); return Ok(0);
} }
// Check if state file exists and cleansing was already completed
if state_path.exists() { if state_path.exists() {
let state_content = tokio::fs::read_to_string(&state_path).await?; let state_content = tokio::fs::read_to_string(&state_path).await?;
@@ -137,10 +176,9 @@ pub async fn companies_yahoo_jsonl(paths: &DataPaths) -> anyhow::Result<usize> {
} }
if let Ok(state) = serde_json::from_str::<serde_json::Value>(line) { if let Ok(state) = serde_json::from_str::<serde_json::Value>(line) {
if state.get("yahoo_companies").and_then(|v| v.as_bool()).unwrap_or(false) { if state.get("yahoo_companies_cleansed_no_data").and_then(|v| v.as_bool()).unwrap_or(false) {
logger::log_info(" Yahoo companies cleansing already completed, reading existing file...").await; logger::log_info(" Yahoo companies cleansing already completed, reading existing file...").await;
// Count lines in existing output file
if output_path.exists() { if output_path.exists() {
let output_content = tokio::fs::read_to_string(&output_path).await?; let output_content = tokio::fs::read_to_string(&output_path).await?;
let count = output_content.lines() let count = output_content.lines()
@@ -185,8 +223,6 @@ pub async fn companies_yahoo_jsonl(paths: &DataPaths) -> anyhow::Result<usize> {
} }
}; };
// Check if company has at least one valid YAHOO ticker
// Valid means: starts with "YAHOO:" but is NOT "YAHOO:NO_RESULTS" or "YAHOO:ERROR"
let has_valid_yahoo = company.isin_tickers_map let has_valid_yahoo = company.isin_tickers_map
.values() .values()
.flatten() .flatten()
@@ -197,7 +233,6 @@ pub async fn companies_yahoo_jsonl(paths: &DataPaths) -> anyhow::Result<usize> {
}); });
if has_valid_yahoo { if has_valid_yahoo {
// Write the company to the filtered output
let json_line = serde_json::to_string(&company)?; let json_line = serde_json::to_string(&company)?;
output_file.write_all(json_line.as_bytes()).await?; output_file.write_all(json_line.as_bytes()).await?;
output_file.write_all(b"\n").await?; output_file.write_all(b"\n").await?;
@@ -205,12 +240,10 @@ pub async fn companies_yahoo_jsonl(paths: &DataPaths) -> anyhow::Result<usize> {
} else { } else {
removed_count += 1; removed_count += 1;
if removed_count <= 5 { if removed_count <= 5 {
// Log first few removals for debugging
logger::log_info(&format!(" Removed company '{}' (no valid Yahoo ticker)", company.name)).await; logger::log_info(&format!(" Removed company '{}' (no valid Yahoo ticker)", company.name)).await;
} }
} }
// Progress indicator for large files
if total_count % 1000 == 0 { if total_count % 1000 == 0 {
logger::log_info(&format!(" Processed {} companies...", total_count)).await; logger::log_info(&format!(" Processed {} companies...", total_count)).await;
} }
@@ -223,9 +256,8 @@ pub async fn companies_yahoo_jsonl(paths: &DataPaths) -> anyhow::Result<usize> {
total_count, valid_count, removed_count total_count, valid_count, removed_count
)).await; )).await;
// Write state file to mark completion
let yahoo_companies = json!({ let yahoo_companies = json!({
"yahoo_companies": true, "yahoo_companies_cleansed_no_data": true,
"completed_at": chrono::Utc::now().to_rfc3339(), "completed_at": chrono::Utc::now().to_rfc3339(),
}); });
@@ -240,6 +272,768 @@ pub async fn companies_yahoo_jsonl(paths: &DataPaths) -> anyhow::Result<usize> {
Ok(valid_count) Ok(valid_count)
} }
/// Filters `companies_yahoo.jsonl` down to companies that pass the Yahoo profile
/// validation, persisting progress incrementally so an aborted run can resume.
///
/// # Persistence strategy
/// - Checkpoint: `companies_yahoo_cleaned.jsonl`, replaced atomically (temp file + rename).
/// - Log: `companies_update.log`, append-only; one JSON line per validated company.
/// - On start the checkpoint is loaded and the log replayed on top of it; lines that
///   do not end in `}` are treated as torn writes and skipped.
/// - A checkpoint is requested after every `CHECKPOINT_INTERVAL` log writes and once
///   more at the end of the run. A successful checkpoint clears the log.
/// - The log is fsynced after `FSYNC_BATCH_SIZE` writes, or on the first write that
///   arrives more than `FSYNC_INTERVAL_SECS` after the previous fsync.
///
/// # Processing
/// Companies whose name is already in the recovered state are skipped. The rest are
/// validated by at most `CONCURRENCY_LIMIT` spawned tasks at a time; a failing or
/// panicking task is logged and does not stop the others. When `shutdown_flag` is
/// set, no further tasks are started and the writer is asked to checkpoint and stop.
///
/// # Returns
/// The total number of validated companies known after this run (recovered state plus
/// companies validated now), matching what the "nothing to do" early return reports.
/// Returns `Ok(0)` when the input file does not exist.
///
/// # Errors
/// Fails if the input, checkpoint or log cannot be read, the log cannot be opened,
/// the Yahoo client pool cannot be created, or the pool fails to shut down.
pub async fn companies_yahoo_cleansed_low_profile(
    paths: &DataPaths,
    config: &Config,
    proxy_pool: Arc<crate::scraper::docker_vpn_proxy::DockerVpnProxyPool>,
    shutdown_flag: &Arc<AtomicBool>,
) -> anyhow::Result<usize> {
    const CHECKPOINT_INTERVAL: usize = 50;
    const FSYNC_BATCH_SIZE: usize = 10;
    const FSYNC_INTERVAL_SECS: u64 = 10;
    const CONCURRENCY_LIMIT: usize = 50; // Limit parallel validation tasks

    let data_path = paths.data_dir();

    // File paths (reusing companies_update.log)
    let input_path = data_path.join("companies_yahoo.jsonl");
    let checkpoint_path = data_path.join("companies_yahoo_cleaned.jsonl");
    let log_path = data_path.join("companies_update.log");

    if !input_path.exists() {
        logger::log_warn(" companies_yahoo.jsonl not found, skipping low profile cleansing").await;
        return Ok(0);
    }

    // === RECOVERY PHASE: load checkpoint, then replay the log on top of it ===
    let mut existing_companies: HashMap<String, CompanyCrossPlatformInfo> = HashMap::new();

    if checkpoint_path.exists() {
        logger::log_info("Loading checkpoint from companies_yahoo_cleaned.jsonl...").await;
        let checkpoint_content = tokio::fs::read_to_string(&checkpoint_path).await?;

        for line in checkpoint_content.lines() {
            if line.trim().is_empty() || !line.ends_with('}') {
                continue; // Skip incomplete lines
            }

            match serde_json::from_str::<CompanyCrossPlatformInfo>(line) {
                Ok(company) => {
                    existing_companies.insert(company.name.clone(), company);
                }
                Err(e) => {
                    logger::log_warn(&format!("Skipping invalid checkpoint line: {}", e)).await;
                }
            }
        }
        logger::log_info(&format!("Loaded checkpoint with {} companies", existing_companies.len())).await;
    }

    if log_path.exists() {
        logger::log_info("Replaying update log...").await;
        let log_content = tokio::fs::read_to_string(&log_path).await?;
        let mut replayed = 0;

        for line in log_content.lines() {
            if line.trim().is_empty() || !line.ends_with('}') {
                continue; // Skip incomplete lines
            }

            match serde_json::from_str::<CompanyCrossPlatformInfo>(line) {
                Ok(company) => {
                    existing_companies.insert(company.name.clone(), company);
                    replayed += 1;
                }
                Err(e) => {
                    logger::log_warn(&format!("Skipping invalid log line: {}", e)).await;
                }
            }
        }
        if replayed > 0 {
            logger::log_info(&format!("Replayed {} updates from log", replayed)).await;
        }
    }

    // === LOAD INPUT COMPANIES ===
    logger::log_info(&format!("Loading companies from: {:?}", input_path)).await;
    let input_companies = load_companies_from_jsonl(&input_path).await?;
    logger::log_info(&format!("Loaded {} companies from input", input_companies.len())).await;

    // === BUILD PENDING LIST (skip everything already present in the recovered state) ===
    let mut pending: Vec<CompanyCrossPlatformInfo> = input_companies
        .into_iter()
        .filter(|company| company_needs_processing(company, &existing_companies))
        .collect();

    logger::log_info(&format!(
        "Initial scan: {} companies need processing ({} already complete)",
        pending.len(),
        existing_companies.len()
    )).await;

    if pending.is_empty() {
        logger::log_info(" ✓ All companies already processed").await;
        return Ok(existing_companies.len());
    }

    // === ACQUIRE FALLIBLE RESOURCES BEFORE SPAWNING THE WRITER ===
    // The writer task keeps a sender to its own channel, so it only ends on an explicit
    // `Shutdown`. Everything that can fail with `?` therefore has to happen before the
    // task is spawned, otherwise an early return would leave it running forever.
    let log_file_init = OpenOptions::new()
        .create(true)
        .append(true)
        .open(&log_path)
        .await?;

    logger::log_info("Creating YahooClientPool with proxy rotation...").await;
    let yahoo_pool = Arc::new(YahooClientPool::new(proxy_pool, config, None).await?);
    logger::log_info(&format!("✓ YahooClientPool ready with {} clients", yahoo_pool.num_clients().await)).await;

    // Owned copy of the paths so spawned tasks can share them.
    let shared_paths = Arc::new((*paths).clone());

    // === SETUP LOG WRITER TASK ===
    let (write_tx, mut write_rx) = mpsc::channel::<LogCommand>(1000);

    // The recovered map is not needed by this function any more, so it is moved (not
    // cloned) into the state shared with the writer.
    let existing_companies_writer = Arc::new(tokio::sync::Mutex::new(existing_companies));
    let existing_companies_writer_for_task = Arc::clone(&existing_companies_writer);
    let write_tx_for_writer = write_tx.clone();

    let writer_task = tokio::spawn(async move {
        let mut log_file = log_file_init;
        let mut writes_since_fsync: usize = 0;
        let mut last_fsync = std::time::Instant::now();
        let mut updates_since_checkpoint: usize = 0;
        let mut count: usize = 0;
        let mut new_count: usize = 0;
        let mut updated_count: usize = 0;

        while let Some(cmd) = write_rx.recv().await {
            match cmd {
                LogCommand::Write(company) => {
                    // A record that cannot be serialized is reported and skipped instead
                    // of panicking and taking the whole writer down.
                    let mut line = match serde_json::to_string(&company) {
                        Ok(line) => line,
                        Err(e) => {
                            logger::log_error(&format!(
                                "Failed to serialize '{}' for log: {}",
                                company.name, e
                            )).await;
                            continue;
                        }
                    };
                    // Record and newline go out in one write so a failure cannot leave
                    // a complete-looking line without its terminator.
                    line.push('\n');

                    if let Err(e) = log_file.write_all(line.as_bytes()).await {
                        logger::log_error(&format!("Failed to write to log: {}", e)).await;
                        break;
                    }

                    writes_since_fsync += 1;
                    updates_since_checkpoint += 1;
                    count += 1;

                    // Update in-memory state
                    let is_update = {
                        let mut known = existing_companies_writer_for_task.lock().await;
                        let is_update = known.contains_key(&company.name);
                        known.insert(company.name.clone(), company);
                        is_update
                    };

                    if is_update {
                        updated_count += 1;
                    } else {
                        new_count += 1;
                    }

                    // Batched + time-based fsync
                    let should_fsync = writes_since_fsync >= FSYNC_BATCH_SIZE
                        || last_fsync.elapsed().as_secs() >= FSYNC_INTERVAL_SECS;

                    if should_fsync {
                        if let Err(e) = log_file.flush().await {
                            logger::log_error(&format!("Failed to flush: {}", e)).await;
                            break;
                        }
                        if let Err(e) = log_file.sync_data().await {
                            logger::log_error(&format!("Failed to fsync: {}", e)).await;
                            break;
                        }
                        writes_since_fsync = 0;
                        last_fsync = std::time::Instant::now();
                    }
                }
                LogCommand::Checkpoint => {
                    if let Err(e) = log_file.flush().await {
                        logger::log_error(&format!("Failed to flush before checkpoint: {}", e)).await;
                        break;
                    }
                    if let Err(e) = log_file.sync_data().await {
                        logger::log_error(&format!("Failed to fsync before checkpoint: {}", e)).await;
                        break;
                    }

                    // Snapshot under the lock, write without holding it.
                    let companies_vec: Vec<_> = {
                        let known = existing_companies_writer_for_task.lock().await;
                        known.values().cloned().collect()
                    };

                    match write_low_profile_checkpoint(&checkpoint_path, &companies_vec).await {
                        Ok(()) => {
                            // The checkpoint now contains everything the log held, so
                            // the log can be started afresh.
                            match tokio::fs::remove_file(&log_path).await {
                                Ok(()) => {
                                    logger::log_info(&format!(
                                        "✓ Checkpoint created ({} companies), log cleared",
                                        companies_vec.len()
                                    )).await;

                                    match OpenOptions::new()
                                        .create(true)
                                        .append(true)
                                        .open(&log_path)
                                        .await
                                    {
                                        Ok(new_log) => log_file = new_log,
                                        Err(e) => {
                                            // The old handle points at the removed file;
                                            // writing to it would lose data silently.
                                            logger::log_error(&format!(
                                                "Failed to reopen log after checkpoint: {}",
                                                e
                                            )).await;
                                            break;
                                        }
                                    }
                                }
                                Err(e) => {
                                    // Harmless: replaying the stale log over the new
                                    // checkpoint re-inserts the same records.
                                    logger::log_warn(&format!(
                                        "Checkpoint written but log could not be cleared: {}",
                                        e
                                    )).await;
                                }
                            }
                        }
                        Err(e) => {
                            logger::log_error(&format!("Failed to write checkpoint: {}", e)).await;
                        }
                    }
                    updates_since_checkpoint = 0;
                }
                LogCommand::Shutdown => {
                    logger::log_info("Writer shutting down...").await;
                    break;
                }
            }

            // Periodic checkpoint trigger. `try_send` is used because this task is the
            // only consumer of the channel: awaiting a full channel here could never
            // complete. The counter is reset once the request is queued so that the
            // commands still ahead of it do not queue further checkpoints.
            if updates_since_checkpoint >= CHECKPOINT_INTERVAL
                && write_tx_for_writer.try_send(LogCommand::Checkpoint).is_ok()
            {
                updates_since_checkpoint = 0;
            }
        }

        // Final fsync
        let _ = log_file.flush().await;
        let _ = log_file.sync_data().await;

        logger::log_info(&format!(
            "Writer finished: {} total ({} new, {} updated)",
            count, new_count, updated_count
        )).await;

        (count, new_count, updated_count)
    });

    // === MAIN PROCESSING LOOP WITH TASK PANIC ISOLATION ===
    let total = pending.len();
    let mut tasks = FuturesUnordered::new();

    // Counters
    let processed = Arc::new(AtomicUsize::new(0));
    let valid_count = Arc::new(AtomicUsize::new(0));
    let filtered_low_cap = Arc::new(AtomicUsize::new(0));
    let filtered_no_price = Arc::new(AtomicUsize::new(0));
    let failed_count = Arc::new(AtomicUsize::new(0));

    // Spawn initial batch
    for _ in 0..CONCURRENCY_LIMIT.min(pending.len()) {
        if let Some(company) = pending.pop() {
            spawn_validation_task(
                company,
                &yahoo_pool,
                &shared_paths,
                &write_tx,
                shutdown_flag,
                &processed,
                &valid_count,
                &filtered_low_cap,
                &filtered_no_price,
                &failed_count,
                total,
                &mut tasks,
            );
        }
    }

    // Every finished task (whatever its outcome) frees a slot for the next company.
    while let Some(task_result) = tasks.next().await {
        if shutdown_flag.load(Ordering::SeqCst) {
            logger::log_warn("Shutdown signal received, stopping processing").await;
            break;
        }

        match task_result {
            // Validated, filtered or failed-and-reported: nothing more to do here.
            Ok(Ok(_)) => {}
            Ok(Err(e)) => {
                logger::log_error(&format!("Company processing error: {}", e)).await;
            }
            Err(e) => {
                // Task panic (isolated - doesn't crash entire process)
                logger::log_error(&format!("Task panic: {}", e)).await;
            }
        }

        if let Some(company) = pending.pop() {
            spawn_validation_task(
                company,
                &yahoo_pool,
                &shared_paths,
                &write_tx,
                shutdown_flag,
                &processed,
                &valid_count,
                &filtered_low_cap,
                &filtered_no_price,
                &failed_count,
                total,
                &mut tasks,
            );
        }
    }

    logger::log_info("Main processing loop completed").await;

    // Signal writer to finish
    let _ = write_tx.send(LogCommand::Checkpoint).await;
    let _ = write_tx.send(LogCommand::Shutdown).await;
    drop(write_tx);

    // Wait for writer to finish
    let (final_count, final_new, final_updated) = match writer_task.await {
        Ok(stats) => stats,
        Err(e) => {
            logger::log_error(&format!("Log writer task failed: {}", e)).await;
            (0, 0, 0)
        }
    };

    let final_valid = valid_count.load(Ordering::SeqCst);
    let final_filtered_low_cap = filtered_low_cap.load(Ordering::SeqCst);
    let final_filtered_no_price = filtered_no_price.load(Ordering::SeqCst);
    let final_failed = failed_count.load(Ordering::SeqCst);

    // Recovered companies plus everything the writer recorded during this run.
    let total_ready = existing_companies_writer.lock().await.len();

    logger::log_info(&format!(
        "✅ Completed: {} total companies ({} new, {} updated)",
        final_count, final_new, final_updated
    )).await;
    logger::log_info(&format!(
        " Valid: {}, Filtered (low cap): {}, Filtered (no price): {}, Failed: {}",
        final_valid, final_filtered_low_cap, final_filtered_no_price, final_failed
    )).await;

    // Shutdown Yahoo pool
    yahoo_pool.shutdown().await?;

    Ok(total_ready)
}

/// Atomically replaces the checkpoint at `checkpoint_path` with a JSONL snapshot of
/// `companies`: the data is written to a sibling `.tmp` file, synced to disk, and
/// then renamed over the checkpoint.
async fn write_low_profile_checkpoint(
    checkpoint_path: &std::path::Path,
    companies: &[CompanyCrossPlatformInfo],
) -> anyhow::Result<()> {
    let temp_path = checkpoint_path.with_extension("tmp");
    let mut temp_file = tokio::fs::File::create(&temp_path).await?;

    for company in companies {
        // A record that cannot be serialized is left out of the snapshot rather than
        // aborting the whole checkpoint.
        if let Ok(line) = serde_json::to_string(company) {
            temp_file.write_all(line.as_bytes()).await?;
            temp_file.write_all(b"\n").await?;
        }
    }

    temp_file.flush().await?;
    temp_file.sync_data().await?;
    drop(temp_file);

    tokio::fs::rename(&temp_path, checkpoint_path).await?;
    Ok(())
}
/// Spawns one Tokio task that validates `company` and pushes its join handle onto
/// `tasks`.
///
/// The task:
/// - returns `Ok(None)` immediately if `shutdown_flag` is already set;
/// - runs `process_company_with_validation`;
/// - for a valid company, sends it to the log writer through `write_tx` and returns
///   `Ok(Some(..))`; if the writer is no longer receiving, the company is counted as
///   failed and the task returns `Err`, because the result was not persisted;
/// - for filtered or failed companies, bumps the matching counter and returns
///   `Ok(None)` (only the first 10 filtered companies of each kind are logged);
/// - logs a progress line every 100 processed companies (`total` is the denominator).
fn spawn_validation_task(
    company: CompanyCrossPlatformInfo,
    yahoo_pool: &Arc<YahooClientPool>,
    paths: &Arc<DataPaths>,
    write_tx: &mpsc::Sender<LogCommand>,
    shutdown_flag: &Arc<AtomicBool>,
    processed: &Arc<AtomicUsize>,
    valid_count: &Arc<AtomicUsize>,
    filtered_low_cap: &Arc<AtomicUsize>,
    filtered_no_price: &Arc<AtomicUsize>,
    failed_count: &Arc<AtomicUsize>,
    total: usize,
    tasks: &mut FuturesUnordered<tokio::task::JoinHandle<anyhow::Result<Option<CompanyTaskResult>>>>,
) {
    // Owned handles for the spawned task.
    let yahoo_pool = Arc::clone(yahoo_pool);
    let paths = Arc::clone(paths);
    let shutdown_flag = Arc::clone(shutdown_flag);
    let write_tx = write_tx.clone();
    let processed = Arc::clone(processed);
    let valid_count = Arc::clone(valid_count);
    let filtered_low_cap = Arc::clone(filtered_low_cap);
    let filtered_no_price = Arc::clone(filtered_no_price);
    let failed_count = Arc::clone(failed_count);

    let task = tokio::spawn(async move {
        // Check shutdown at start
        if shutdown_flag.load(Ordering::SeqCst) {
            return Ok::<_, anyhow::Error>(None);
        }

        let result = process_company_with_validation(&company, &yahoo_pool, &*paths).await;

        let task_result = match result {
            CompanyProcessResult::Valid(validated_company) => {
                // A send error means the writer has stopped, so this company will not
                // reach the log. Report it instead of counting it as valid.
                if write_tx
                    .send(LogCommand::Write(validated_company.clone()))
                    .await
                    .is_err()
                {
                    failed_count.fetch_add(1, Ordering::SeqCst);
                    processed.fetch_add(1, Ordering::SeqCst);
                    return Err(anyhow::anyhow!(
                        "Log writer is not running; validated company '{}' was not persisted",
                        validated_company.name
                    ));
                }
                valid_count.fetch_add(1, Ordering::SeqCst);
                Some(CompanyTaskResult {
                    company: validated_company.clone(),
                    result: CompanyProcessResult::Valid(validated_company),
                })
            }
            CompanyProcessResult::FilteredLowCap { name, market_cap } => {
                // `fetch_add` returns the previous value, so exactly the first ten
                // increments log, even when tasks run concurrently.
                let seen_before = filtered_low_cap.fetch_add(1, Ordering::SeqCst);
                if seen_before < 10 {
                    logger::log_info(&format!(" Filtered {} - low market cap: {:.0} EUR", name, market_cap)).await;
                }
                None
            }
            CompanyProcessResult::FilteredNoPrice { name } => {
                let seen_before = filtered_no_price.fetch_add(1, Ordering::SeqCst);
                if seen_before < 10 {
                    logger::log_info(&format!(" Filtered {} - no recent price data", name)).await;
                }
                None
            }
            CompanyProcessResult::Failed { company: failed_company, error, is_transient: _ } => {
                failed_count.fetch_add(1, Ordering::SeqCst);
                logger::log_warn(&format!(" Failed to process '{}': {}", failed_company.name, error)).await;
                None
            }
        };

        // Progress reporting
        let current = processed.fetch_add(1, Ordering::SeqCst) + 1;
        if current % 100 == 0 {
            logger::log_info(&format!(
                "Progress: {}/{} ({} valid, {} low cap, {} no price, {} failed)",
                current, total,
                valid_count.load(Ordering::SeqCst),
                filtered_low_cap.load(Ordering::SeqCst),
                filtered_no_price.load(Ordering::SeqCst),
                failed_count.load(Ordering::SeqCst)
            )).await;
        }

        Ok(task_result)
    });

    tasks.push(task);
}
/// Validates one company against Yahoo data and classifies the outcome.
///
/// Steps, in order:
/// 1. pick a Yahoo ticker (`Failed`, non-transient, if there is none);
/// 2. fetch the core quote-summary modules (`Failed` on API error);
/// 3. reject market caps below 1,000,000 (`FilteredLowCap`);
/// 4. require recent price activity (`FilteredNoPrice`, or `Failed` on API error);
/// 5. store the summary on disk - a failure here is only logged as a warning;
/// 6. return `Valid` with a clone of the company.
async fn process_company_with_validation(
    company: &CompanyCrossPlatformInfo,
    yahoo_pool: &Arc<YahooClientPool>,
    paths: &DataPaths,
) -> CompanyProcessResult {
    /// Builds the `Failed` variant for `company`.
    fn failed(
        company: &CompanyCrossPlatformInfo,
        error: String,
        is_transient: bool,
    ) -> CompanyProcessResult {
        CompanyProcessResult::Failed {
            company: company.clone(),
            error,
            is_transient,
        }
    }

    // Without a ticker there is nothing to query, so this failure is permanent.
    let ticker = if let Some(symbol) = extract_first_yahoo_ticker(company) {
        symbol
    } else {
        return failed(company, "No valid Yahoo ticker found".to_string(), false);
    };

    // Core quote-summary modules.
    let summary = match yahoo_pool
        .get_quote_summary(&ticker, &QuoteSummaryModule::core_modules())
        .await
    {
        Ok(fetched) => fetched,
        Err(e) => {
            let error_msg = e.to_string();
            let is_transient = is_transient_error(&error_msg);
            return failed(
                company,
                format!("API error fetching summary: {}", error_msg),
                is_transient,
            );
        }
    };

    // Market-cap gate.
    let market_cap = extract_market_cap(&summary);
    if market_cap < 1_000_000.0 {
        return CompanyProcessResult::FilteredLowCap {
            name: company.name.clone(),
            market_cap,
        };
    }

    // Price-activity gate.
    match check_recent_price_activity(yahoo_pool, &ticker).await {
        Ok(true) => {}
        Ok(false) => {
            return CompanyProcessResult::FilteredNoPrice {
                name: company.name.clone(),
            };
        }
        Err(e) => {
            let error_msg = e.to_string();
            let is_transient = is_transient_error(&error_msg);
            return failed(
                company,
                format!("API error fetching price history: {}", error_msg),
                is_transient,
            );
        }
    }

    // Persisting the summary is best-effort: the company stays valid even if it fails.
    if let Err(e) = save_company_core_data(paths, &company.name, &summary).await {
        logger::log_warn(&format!(
            " Failed to save core data for {}: {}",
            company.name, e
        )).await;
    }

    CompanyProcessResult::Valid(company.clone())
}
/// Classifies an error message as transient (worth retrying) or permanent.
///
/// Matching is a case-insensitive substring search. Transient markers take
/// precedence over permanent ones, and a message that matches neither list is
/// treated as transient.
fn is_transient_error(error: &str) -> bool {
    // Network trouble, rate limiting, timeouts, server-side errors.
    const TRANSIENT_MARKERS: [&str; 12] = [
        "timeout",
        "timed out",
        "connection",
        "network",
        "rate limit",
        "too many requests",
        "429",
        "503",
        "502",
        "500",
        "temporarily",
        "unavailable",
    ];
    // Invalid ticker, missing data, parse problems, client-side errors.
    const PERMANENT_MARKERS: [&str; 8] = [
        "404",
        "not found",
        "invalid",
        "no data",
        "parse error",
        "400",
        "401",
        "403",
    ];

    /// True when `haystack` contains at least one of `markers`.
    fn contains_any(haystack: &str, markers: &[&str]) -> bool {
        markers.iter().any(|marker| haystack.contains(*marker))
    }

    let message = error.to_lowercase();
    contains_any(&message, &TRANSIENT_MARKERS) || !contains_any(&message, &PERMANENT_MARKERS)
}
/// Load companies from JSONL file
async fn load_companies_from_jsonl(path: &std::path::Path) -> anyhow::Result<Vec<CompanyCrossPlatformInfo>> {
let content = tokio::fs::read_to_string(path).await?;
let mut companies = Vec::new();
for line in content.lines() {
if line.trim().is_empty() {
continue;
}
if let Ok(company) = serde_json::from_str::<CompanyCrossPlatformInfo>(line) {
companies.push(company);
}
}
Ok(companies)
}
/// Returns the symbol of one usable Yahoo ticker of `company`, without the `YAHOO:`
/// prefix, or `None` if there is none.
///
/// A ticker is usable when it starts with `YAHOO:`, is not one of the placeholders
/// `YAHOO:NO_RESULTS` / `YAHOO:ERROR`, and has a non-empty symbol after the prefix.
/// Exactly one prefix is removed. Tickers are visited in the iteration order of
/// `isin_tickers_map`.
fn extract_first_yahoo_ticker(company: &CompanyCrossPlatformInfo) -> Option<String> {
    company
        .isin_tickers_map
        .values()
        .flatten()
        .filter(|ticker| {
            let ticker = ticker.as_str();
            ticker != "YAHOO:NO_RESULTS" && ticker != "YAHOO:ERROR"
        })
        // `strip_prefix` removes the prefix once; `trim_start_matches` would also eat
        // a symbol that itself begins with "YAHOO:".
        .filter_map(|ticker| ticker.strip_prefix("YAHOO:"))
        // A bare "YAHOO:" carries no symbol and must not be sent to the API.
        .find(|symbol| !symbol.is_empty())
        .map(|symbol| symbol.to_string())
}
/// Reads the market capitalisation from the `price` module of `summary` and converts
/// it to EUR using fixed, hard-coded exchange rates.
///
/// Returns `0.0` when the `price` module or `marketCap.raw` is missing. A missing
/// currency is treated as `USD`, and currencies without a dedicated rate use the
/// USD rate as well.
fn extract_market_cap(summary: &crate::scraper::yahoo::QuoteSummary) -> f64 {
    let price = if let Some(module) = summary.modules.get("price") {
        module
    } else {
        return 0.0;
    };

    let raw_cap = price
        .get("marketCap")
        .and_then(|cap| cap.get("raw"))
        .and_then(|raw| raw.as_f64())
        .unwrap_or(0.0);

    let currency_code = price
        .get("currency")
        .and_then(|code| code.as_str())
        .unwrap_or("USD");

    // EUR value of one unit of the quoted currency.
    let eur_per_unit = match currency_code {
        "EUR" => 1.0,
        "GBP" => 1.17,
        "JPY" => 0.0061,
        "CHF" => 1.05,
        // "USD" and every currency without its own rate.
        _ => 0.92,
    };

    raw_cap * eur_per_unit
}
/// Reports whether `ticker` has recent price data on Yahoo.
///
/// Requests daily (`"1d"`) chart data for the last 60 days and returns `Ok(true)`
/// when at least one quote exists and the newest quote is no older than 365 days.
/// Returns `Ok(false)` when no quotes come back.
///
/// NOTE(review): the request window is 60 days but the cutoff is one year, so the
/// cutoff presumably never rejects anything the API returns for that window -
/// confirm which of the two periods is intended.
///
/// # Errors
/// Propagates any error from the chart-data request.
async fn check_recent_price_activity(
    yahoo_pool: &Arc<YahooClientPool>,
    ticker: &str,
) -> anyhow::Result<bool> {
    const SECONDS_PER_DAY: i64 = 24 * 60 * 60;

    let now = Utc::now().timestamp();
    let staleness_cutoff = now - 365 * SECONDS_PER_DAY;
    let window_start = now - 60 * SECONDS_PER_DAY;

    let chart_data = yahoo_pool
        .get_chart_data(ticker, "1d", window_start, now)
        .await?;

    let newest_quote = chart_data.quotes.iter().map(|quote| quote.timestamp).max();

    Ok(match newest_quote {
        Some(timestamp) => timestamp >= staleness_cutoff,
        None => false,
    })
}
async fn save_company_core_data(
paths: &DataPaths,
company_name: &str,
summary: &crate::scraper::yahoo::QuoteSummary,
) -> anyhow::Result<()> {
use tokio::fs;
let safe_name = company_name
.replace("/", "_")
.replace("\\", "_")
.replace(":", "_")
.replace("*", "_")
.replace("?", "_")
.replace("\"", "_")
.replace("<", "_")
.replace(">", "_")
.replace("|", "_");
let company_dir = paths.corporate_dir().join(&safe_name).join("core");
fs::create_dir_all(&company_dir).await?;
let data_path = company_dir.join("data.jsonl");
let json_line = serde_json::to_string(summary)?;
let mut file = fs::File::create(&data_path).await?;
file.write_all(json_line.as_bytes()).await?;
file.write_all(b"\n").await?;
file.flush().await?;
Ok(())
}
async fn find_most_recent_figi_date_dir(paths: &DataPaths) -> anyhow::Result<Option<std::path::PathBuf>> { async fn find_most_recent_figi_date_dir(paths: &DataPaths) -> anyhow::Result<Option<std::path::PathBuf>> {
let map_cache_dir = paths.cache_gleif_openfigi_map_dir(); let map_cache_dir = paths.cache_gleif_openfigi_map_dir();

View File

@@ -316,7 +316,7 @@ pub async fn fetch_earnings_with_pool(
ticker: &str, ticker: &str,
) -> anyhow::Result<Vec<CompanyEvent>> { ) -> anyhow::Result<Vec<CompanyEvent>> {
let ticker = ticker.to_string(); let ticker = ticker.to_string();
let url = format!("https://finance.yahoo.com/calendar/earnings?symbol={}", ticker); let url = format!("https://finance.yahoo.com/calendar/earnings?symbol={}&offset=0&size=100", ticker);
let ticker_cloned = ticker.clone(); let ticker_cloned = ticker.clone();

View File

@@ -15,3 +15,7 @@ pub use monitoring::{init_monitoring, ConfigSnapshot, MonitoringEvent};
pub use config::Config; pub use config::Config;
pub use scraper::webdriver::{ChromeDriverPool, ChromeInstance, ScrapeTask}; pub use scraper::webdriver::{ChromeDriverPool, ChromeInstance, ScrapeTask};
pub use util::logger; pub use util::logger;
pub use scraper::yahoo::{
YahooClient, YahooClientPool, QuoteSummaryModule, QuoteSummary, ChartData,
OptionsData, SearchResult
};

View File

@@ -250,6 +250,35 @@
text-transform: uppercase; text-transform: uppercase;
} }
/* Yahoo Stats */
.yahoo-stats-grid {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(180px, 1fr));
gap: 12px;
margin-top: 10px;
}
.yahoo-stat-box {
background: #2a3a4a;
padding: 15px;
border-radius: 5px;
text-align: center;
border-left: 4px solid #FF9800;
}
.yahoo-stat-value {
font-size: 28px;
font-weight: bold;
color: #FF9800;
margin-bottom: 5px;
}
.yahoo-stat-label {
font-size: 11px;
color: #aaa;
text-transform: uppercase;
}
/* Logs */ /* Logs */
.log-container { .log-container {
max-height: 300px; max-height: 300px;
@@ -339,6 +368,31 @@
.pulse { .pulse {
animation: pulse 2s infinite; animation: pulse 2s infinite;
} }
/* Yahoo Client Box */
.yahoo-client-box {
background: #2a3a4a;
border: 2px solid #FF9800;
border-radius: 5px;
padding: 12px;
display: flex;
gap: 0;
overflow: hidden;
}
.yahoo-client-side {
flex: 1;
padding: 12px;
}
.yahoo-client-side.left {
background: #3a4a5a;
border-right: 1px solid #555;
}
.yahoo-client-side.right {
background: #2a3a4a;
}
</style> </style>
</head> </head>
<body> <body>
@@ -363,6 +417,13 @@
<div class="instance-grid" id="instances"></div> <div class="instance-grid" id="instances"></div>
</div> </div>
<!-- Yahoo API Section -->
<div class="section">
<div class="section-title">📈 YAHOO API METRICS</div>
<div class="yahoo-stats-grid" id="yahoo-stats"></div>
<div class="instance-grid" id="yahoo-clients"></div>
</div>
<!-- Global Metrics Section --> <!-- Global Metrics Section -->
<div class="section"> <div class="section">
<div class="section-title">📊 GLOBAL METRICS</div> <div class="section-title">📊 GLOBAL METRICS</div>
@@ -432,6 +493,8 @@
updateConfig(state.config); updateConfig(state.config);
updateInstances(state.instances); updateInstances(state.instances);
updateGlobalStats(state.global); updateGlobalStats(state.global);
updateYahooStats(state.global);
updateYahooClients(state.yahoo_clients);
updateLogs(state.logs); updateLogs(state.logs);
} }
@@ -480,6 +543,10 @@
? ((inst.success_count / inst.total_requests) * 100).toFixed(1) ? ((inst.success_count / inst.total_requests) * 100).toFixed(1)
: '0.0'; : '0.0';
const yahooSuccessRate = inst.yahoo_requests > 0
? ((inst.yahoo_success / inst.yahoo_requests) * 100).toFixed(1)
: '0.0';
return ` return `
<div class="instance-box ${statusClass}"> <div class="instance-box ${statusClass}">
<div class="instance-side"> <div class="instance-side">
@@ -511,6 +578,16 @@
${successRate}% ${successRate}%
</span> </span>
</div> </div>
<div class="metric-row">
<span class="metric-label">Yahoo Requests</span>
<span class="metric-value">${inst.yahoo_requests}</span>
</div>
<div class="metric-row">
<span class="metric-label">Yahoo Rate</span>
<span class="metric-value ${yahooSuccessRate < 50 ? 'danger' : yahooSuccessRate < 80 ? 'warning' : ''}">
${yahooSuccessRate}%
</span>
</div>
<div class="metric-row"> <div class="metric-row">
<span class="metric-label">Last Activity</span> <span class="metric-label">Last Activity</span>
<span class="metric-value">${inst.last_activity}</span> <span class="metric-value">${inst.last_activity}</span>
@@ -556,6 +633,115 @@
}).join(''); }).join('');
} }
/**
 * Render the global Yahoo API statistic boxes into the `#yahoo-stats` element.
 *
 * Reads `total_yahoo_requests`, `successful_yahoo_requests`,
 * `failed_yahoo_requests`, `yahoo_client_count`, `yahoo_batch_requests` and
 * `yahoo_session_renewals` from `global` and replaces the container's content
 * with seven stat boxes. Counters that are missing or falsy are shown as 0.
 *
 * @param {Object} global - Global metrics object (passed as `state.global` by the caller).
 */
function updateYahooStats(global) {
const container = document.getElementById('yahoo-stats');
// Success percentage with one decimal; '0.0' when no request has been counted
// (this also avoids dividing by zero). Note that toFixed() yields a string.
const yahooSuccessRate = global.total_yahoo_requests > 0
? ((global.successful_yahoo_requests / global.total_yahoo_requests) * 100).toFixed(1)
: '0.0';
// The whole container is rebuilt on every call.
container.innerHTML = `
<div class="yahoo-stat-box">
<div class="yahoo-stat-value">${global.total_yahoo_requests || 0}</div>
<div class="yahoo-stat-label">Total Requests</div>
</div>
<div class="yahoo-stat-box">
<div class="yahoo-stat-value">${yahooSuccessRate}%</div>
<div class="yahoo-stat-label">Success Rate</div>
</div>
<div class="yahoo-stat-box">
<div class="yahoo-stat-value">${global.successful_yahoo_requests || 0}</div>
<div class="yahoo-stat-label">Successful</div>
</div>
<div class="yahoo-stat-box">
<div class="yahoo-stat-value">${global.failed_yahoo_requests || 0}</div>
<div class="yahoo-stat-label">Failed</div>
</div>
<div class="yahoo-stat-box">
<div class="yahoo-stat-value">${global.yahoo_client_count || 0}</div>
<div class="yahoo-stat-label">Active Clients</div>
</div>
<div class="yahoo-stat-box">
<div class="yahoo-stat-value">${global.yahoo_batch_requests || 0}</div>
<div class="yahoo-stat-label">Batch Requests</div>
</div>
<div class="yahoo-stat-box">
<div class="yahoo-stat-value">${global.yahoo_session_renewals || 0}</div>
<div class="yahoo-stat-label">Session Renewals</div>
</div>
`;
}
/**
 * Render one card per Yahoo client into the `#yahoo-clients` element.
 *
 * Each card has a left side with request counters and a right side that shows
 * either the client's proxy details (`client.proxy_info`) or a placeholder.
 * When the list is missing or empty, a "No Yahoo clients available" message
 * is rendered instead.
 *
 * NOTE(review): field values are interpolated into `innerHTML` without
 * escaping; this presumably relies on the backend only sending trusted
 * values — confirm.
 *
 * @param {Array<Object>} yahooClients - Client metrics (passed as `state.yahoo_clients` by the caller).
 */
function updateYahooClients(yahooClients) {
const container = document.getElementById('yahoo-clients');
// Empty state: nothing to map over, show a placeholder and stop.
if (!yahooClients || yahooClients.length === 0) {
container.innerHTML = '<div style="text-align: center; padding: 40px; color: #666;">No Yahoo clients available</div>';
return;
}
container.innerHTML = yahooClients.map(client => {
// Success percentage with one decimal; '0.0' when the client has no requests.
// toFixed() returns a string; the `< 50` / `< 80` checks below rely on
// JavaScript coercing it back to a number.
const successRate = client.requests_total > 0
? ((client.requests_successful / client.requests_total) * 100).toFixed(1)
: '0.0';
return `
<div class="yahoo-client-box">
<div class="yahoo-client-side left">
<div class="side-header">
📊 Yahoo Client #${client.instance_id}
${client.has_proxy ? '🔗' : '🌐'}
</div>
<div class="metric-row">
<span class="metric-label">Total Requests</span>
<span class="metric-value">${client.requests_total}</span>
</div>
<div class="metric-row">
<span class="metric-label">Success / Fail</span>
<span class="metric-value">${client.requests_successful} / ${client.requests_failed}</span>
</div>
<div class="metric-row">
<span class="metric-label">Success Rate</span>
<span class="metric-value ${successRate < 50 ? 'danger' : successRate < 80 ? 'warning' : ''}">
${successRate}%
</span>
</div>
<div class="metric-row">
<span class="metric-label">Current / Max</span>
<span class="metric-value ${client.current_requests >= client.max_requests ? 'danger' : ''}">
${client.current_requests} / ${client.max_requests}
</span>
</div>
<div class="metric-row">
<span class="metric-label">Last Activity</span>
<span class="metric-value">${client.last_activity}</span>
</div>
</div>
<div class="yahoo-client-side right">
${client.proxy_info ? `
<div class="side-header">🔗 ${client.proxy_info.container_name}</div>
<div class="metric-row">
<span class="metric-label">IP Address</span>
<span class="metric-value">${client.proxy_info.ip_address}</span>
</div>
<div class="metric-row">
<span class="metric-label">Port</span>
<span class="metric-value">${client.proxy_info.port}</span>
</div>
<div class="metric-row">
<span class="metric-label">Status</span>
<span class="metric-value">${client.proxy_info.status}</span>
</div>
` : `
<div class="no-proxy">
${client.has_proxy ? '⚠️' : '🌐'}<br>
${client.has_proxy ? 'Proxy Not Connected' : 'Direct Connection'}
</div>
`}
</div>
</div>
`;
}).join('');
}
function updateGlobalStats(global) { function updateGlobalStats(global) {
const container = document.getElementById('global-stats'); const container = document.getElementById('global-stats');

View File

@@ -92,6 +92,45 @@ pub enum MonitoringEvent {
reason: String, reason: String,
}, },
// Yahoo API events
YahooRequestStarted {
instance_id: usize,
endpoint: String,
symbol: Option<String>,
},
YahooRequestCompleted {
instance_id: usize,
success: bool,
duration_ms: u64,
error: Option<String>,
},
YahooBatchRequestStarted {
count: usize,
symbols: Vec<String>,
endpoint: String,
},
YahooBatchRequestCompleted {
successful: usize,
failed: usize,
total: usize,
duration_ms: u64,
},
YahooClientCreated {
instance_id: usize,
has_proxy: bool,
max_requests: u32,
},
YahooClientReset {
instance_id: usize,
previous_requests: u32,
reason: String,
},
// Logging // Logging
LogMessage { LogMessage {
level: LogLevel, level: LogLevel,

View File

@@ -9,6 +9,7 @@ pub struct DashboardState {
pub config: ConfigSnapshot, pub config: ConfigSnapshot,
pub instances: Vec<InstanceMetrics>, pub instances: Vec<InstanceMetrics>,
pub proxies: Vec<ProxyMetrics>, pub proxies: Vec<ProxyMetrics>,
pub yahoo_clients: Vec<YahooClientMetrics>,
pub global: GlobalMetrics, pub global: GlobalMetrics,
pub logs: Vec<LogEntry>, pub logs: Vec<LogEntry>,
} }
@@ -38,6 +39,14 @@ pub struct InstanceMetrics {
pub failure_count: usize, pub failure_count: usize,
pub connected_proxy: Option<ProxyInfo>, pub connected_proxy: Option<ProxyInfo>,
pub last_activity: String, // Timestamp pub last_activity: String, // Timestamp
pub yahoo_requests: usize,
pub yahoo_success: usize,
pub yahoo_failures: usize,
pub yahoo_success_rate: f64,
pub yahoo_current_requests: u32,
pub yahoo_max_requests: u32,
pub yahoo_last_endpoint: Option<String>,
pub yahoo_last_symbol: Option<String>,
} }
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
@@ -75,6 +84,20 @@ pub struct ProxyMetrics {
pub instances_using: Vec<usize>, pub instances_using: Vec<usize>,
} }
/// Serializable metrics for a single Yahoo client, as exposed to the dashboard.
///
/// This is the snapshot counterpart of `YahooClientState`: the counters are
/// copied over unchanged and `last_activity` is rendered to a string.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct YahooClientMetrics {
/// Identifier of the client these metrics belong to.
pub instance_id: usize,
/// Number of requests counted as started for this client.
pub requests_total: usize,
/// Number of requests that completed successfully.
pub requests_successful: usize,
/// Number of requests that completed with a failure.
pub requests_failed: usize,
/// Current request counter; the dashboard shows it as "Current / Max".
pub current_requests: u32,
/// Request limit reported when the client was created.
pub max_requests: u32,
/// Whether the client was created with a proxy.
pub has_proxy: bool,
/// Display timestamp produced by `format_timestamp`.
pub last_activity: String,
/// Details of the associated proxy, if any are recorded.
pub proxy_info: Option<ProxyInfo>,
}
/// Global pool metrics /// Global pool metrics
#[derive(Debug, Clone, Serialize, Deserialize)] #[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GlobalMetrics { pub struct GlobalMetrics {
@@ -88,6 +111,13 @@ pub struct GlobalMetrics {
pub bot_detection_hits: usize, pub bot_detection_hits: usize,
pub proxy_failures: usize, pub proxy_failures: usize,
pub uptime_seconds: u64, pub uptime_seconds: u64,
pub total_yahoo_requests: usize,
pub successful_yahoo_requests: usize,
pub failed_yahoo_requests: usize,
pub yahoo_success_rate: f64,
pub yahoo_batch_requests: usize,
pub yahoo_session_renewals: usize,
pub yahoo_client_count: usize,
} }
/// Log entry for display in dashboard /// Log entry for display in dashboard
@@ -111,6 +141,7 @@ pub enum LogLevel {
pub struct MonitoringState { pub struct MonitoringState {
pub instances: HashMap<usize, InstanceState>, pub instances: HashMap<usize, InstanceState>,
pub proxies: HashMap<String, ProxyState>, pub proxies: HashMap<String, ProxyState>,
pub yahoo_clients: HashMap<usize, YahooClientState>,
pub global: GlobalState, pub global: GlobalState,
pub start_time: Instant, pub start_time: Instant,
} }
@@ -128,6 +159,13 @@ pub struct InstanceState {
pub failure_count: usize, pub failure_count: usize,
pub connected_proxy: Option<ProxyInfo>, pub connected_proxy: Option<ProxyInfo>,
pub last_activity: Instant, pub last_activity: Instant,
pub yahoo_requests: usize,
pub yahoo_success: usize,
pub yahoo_failures: usize,
pub yahoo_current_requests: u32,
pub yahoo_max_requests: u32,
pub yahoo_last_endpoint: Option<String>,
pub yahoo_last_symbol: Option<String>,
} }
#[derive(Debug, Clone)] #[derive(Debug, Clone)]
@@ -139,6 +177,19 @@ pub struct ProxyState {
pub instances_using: Vec<usize>, pub instances_using: Vec<usize>,
} }
/// Internal, mutable tracking state for a single Yahoo client.
///
/// An entry is inserted when a `YahooClientCreated` event is handled and is
/// updated by the other Yahoo request/reset events.
#[derive(Debug, Clone)]
pub struct YahooClientState {
/// Identifier of the client; also used as the key in `MonitoringState::yahoo_clients`.
pub instance_id: usize,
/// Incremented each time a `YahooRequestStarted` event is handled.
pub requests_total: usize,
/// Incremented when a `YahooRequestCompleted` event reports success.
pub requests_successful: usize,
/// Incremented when a `YahooRequestCompleted` event reports failure.
pub requests_failed: usize,
/// Incremented on request start, decremented (saturating) on completion and
/// set to 0 on `YahooClientReset`.
/// NOTE(review): as handled, this tracks in-flight requests rather than
/// requests used since the last reset — confirm which is intended, since the
/// dashboard compares it against `max_requests`.
pub current_requests: u32,
/// Limit taken from the `YahooClientCreated` event.
pub max_requests: u32,
/// Proxy flag taken from the `YahooClientCreated` event.
pub has_proxy: bool,
/// Updated to `Instant::now()` whenever an event touches this client.
pub last_activity: Instant,
/// Initialised to `None` on creation; presumably filled in elsewhere — TODO confirm.
pub proxy_info: Option<ProxyInfo>,
}
#[derive(Debug, Clone)] #[derive(Debug, Clone)]
pub struct GlobalState { pub struct GlobalState {
pub total_requests: usize, pub total_requests: usize,
@@ -149,6 +200,12 @@ pub struct GlobalState {
pub navigation_timeouts: usize, pub navigation_timeouts: usize,
pub bot_detection_hits: usize, pub bot_detection_hits: usize,
pub proxy_failures: usize, pub proxy_failures: usize,
pub total_yahoo_requests: usize,
pub successful_yahoo_requests: usize,
pub failed_yahoo_requests: usize,
pub yahoo_batch_requests: usize,
pub yahoo_session_renewals: usize,
pub yahoo_client_count: usize,
} }
impl MonitoringState { impl MonitoringState {
@@ -156,6 +213,7 @@ impl MonitoringState {
Self { Self {
instances: HashMap::new(), instances: HashMap::new(),
proxies: HashMap::new(), proxies: HashMap::new(),
yahoo_clients: HashMap::new(),
global: GlobalState { global: GlobalState {
total_requests: 0, total_requests: 0,
successful_requests: 0, successful_requests: 0,
@@ -165,6 +223,12 @@ impl MonitoringState {
navigation_timeouts: 0, navigation_timeouts: 0,
bot_detection_hits: 0, bot_detection_hits: 0,
proxy_failures: 0, proxy_failures: 0,
total_yahoo_requests: 0,
successful_yahoo_requests: 0,
failed_yahoo_requests: 0,
yahoo_batch_requests: 0,
yahoo_session_renewals: 0,
yahoo_client_count: 0,
}, },
start_time: Instant::now(), start_time: Instant::now(),
} }
@@ -175,7 +239,14 @@ impl MonitoringState {
let instances: Vec<InstanceMetrics> = self let instances: Vec<InstanceMetrics> = self
.instances .instances
.values() .values()
.map(|inst| InstanceMetrics { .map(|inst| {
let yahoo_success_rate = if inst.yahoo_success + inst.yahoo_failures > 0 {
(inst.yahoo_success as f64 / (inst.yahoo_success + inst.yahoo_failures) as f64) * 100.0
} else {
0.0
};
InstanceMetrics {
id: inst.id, id: inst.id,
status: inst.status.clone(), status: inst.status.clone(),
current_task: inst.current_task.clone(), current_task: inst.current_task.clone(),
@@ -187,6 +258,15 @@ impl MonitoringState {
failure_count: inst.failure_count, failure_count: inst.failure_count,
connected_proxy: inst.connected_proxy.clone(), connected_proxy: inst.connected_proxy.clone(),
last_activity: format_timestamp(inst.last_activity), last_activity: format_timestamp(inst.last_activity),
yahoo_requests: inst.yahoo_requests,
yahoo_success: inst.yahoo_success,
yahoo_failures: inst.yahoo_failures,
yahoo_success_rate,
yahoo_current_requests: inst.yahoo_current_requests,
yahoo_max_requests: inst.yahoo_max_requests,
yahoo_last_endpoint: inst.yahoo_last_endpoint.clone(),
yahoo_last_symbol: inst.yahoo_last_symbol.clone(),
}
}) })
.collect(); .collect();
@@ -202,12 +282,34 @@ impl MonitoringState {
}) })
.collect(); .collect();
let yahoo_clients: Vec<YahooClientMetrics> = self
.yahoo_clients
.values()
.map(|client| YahooClientMetrics {
instance_id: client.instance_id,
requests_total: client.requests_total,
requests_successful: client.requests_successful,
requests_failed: client.requests_failed,
current_requests: client.current_requests,
max_requests: client.max_requests,
has_proxy: client.has_proxy,
last_activity: format_timestamp(client.last_activity),
proxy_info: client.proxy_info.clone(),
})
.collect();
let success_rate = if self.global.total_requests > 0 { let success_rate = if self.global.total_requests > 0 {
(self.global.successful_requests as f64 / self.global.total_requests as f64) * 100.0 (self.global.successful_requests as f64 / self.global.total_requests as f64) * 100.0
} else { } else {
0.0 0.0
}; };
let yahoo_success_rate = if self.global.total_yahoo_requests > 0 {
(self.global.successful_yahoo_requests as f64 / self.global.total_yahoo_requests as f64) * 100.0
} else {
0.0
};
let global = GlobalMetrics { let global = GlobalMetrics {
total_requests: self.global.total_requests, total_requests: self.global.total_requests,
successful_requests: self.global.successful_requests, successful_requests: self.global.successful_requests,
@@ -219,12 +321,20 @@ impl MonitoringState {
bot_detection_hits: self.global.bot_detection_hits, bot_detection_hits: self.global.bot_detection_hits,
proxy_failures: self.global.proxy_failures, proxy_failures: self.global.proxy_failures,
uptime_seconds: self.start_time.elapsed().as_secs(), uptime_seconds: self.start_time.elapsed().as_secs(),
total_yahoo_requests: self.global.total_yahoo_requests,
successful_yahoo_requests: self.global.successful_yahoo_requests,
failed_yahoo_requests: self.global.failed_yahoo_requests,
yahoo_success_rate,
yahoo_batch_requests: self.global.yahoo_batch_requests,
yahoo_session_renewals: self.global.yahoo_session_renewals,
yahoo_client_count: self.global.yahoo_client_count,
}; };
DashboardState { DashboardState {
config, config,
instances, instances,
proxies, proxies,
yahoo_clients,
global, global,
logs, logs,
} }
@@ -233,7 +343,6 @@ impl MonitoringState {
fn format_timestamp(instant: Instant) -> String { fn format_timestamp(instant: Instant) -> String {
use chrono::Local; use chrono::Local;
// This is a placeholder - in real impl we'd track actual wall-clock time
Local::now().format("%H:%M:%S").to_string() Local::now().format("%H:%M:%S").to_string()
} }

View File

@@ -76,6 +76,13 @@ impl MonitoringService {
failure_count: 0, failure_count: 0,
connected_proxy: proxy.clone(), connected_proxy: proxy.clone(),
last_activity: Instant::now(), last_activity: Instant::now(),
yahoo_requests: 0,
yahoo_success: 0,
yahoo_failures: 0,
yahoo_current_requests: 0,
yahoo_max_requests: 0,
yahoo_last_endpoint: None,
yahoo_last_symbol: None,
}, },
); );
@@ -193,9 +200,9 @@ impl MonitoringService {
if let Some(inst) = state.instances.get(&instance_id) { if let Some(inst) = state.instances.get(&instance_id) {
Some(SessionSummary { Some(SessionSummary {
instance_id, instance_id,
session_start: "N/A".to_string(), // We'd need to track this session_start: "N/A".to_string(),
session_end: Local::now().format("%Y-%m-%d %H:%M:%S").to_string(), session_end: Local::now().format("%Y-%m-%d %H:%M:%S").to_string(),
duration_seconds: 0, // We'd need to track session start time duration_seconds: 0,
total_requests: old_request_count, total_requests: old_request_count,
successful_requests: inst.success_count, successful_requests: inst.success_count,
failed_requests: inst.failure_count, failed_requests: inst.failure_count,
@@ -283,6 +290,154 @@ impl MonitoringService {
self.log_info(format!("Pool rotation triggered: {}", reason)).await; self.log_info(format!("Pool rotation triggered: {}", reason)).await;
} }
// Yahoo API Events
MonitoringEvent::YahooRequestStarted { instance_id, endpoint, symbol } => {
let mut state = self.state.write().await;
// Update global Yahoo stats
state.global.total_yahoo_requests += 1;
// Update instance stats
if let Some(inst) = state.instances.get_mut(&instance_id) {
inst.yahoo_requests += 1;
inst.yahoo_current_requests += 1;
inst.yahoo_last_endpoint = Some(endpoint.clone());
inst.yahoo_last_symbol = symbol.clone();
inst.last_activity = Instant::now();
}
// Update Yahoo client stats
if let Some(client) = state.yahoo_clients.get_mut(&instance_id) {
client.requests_total += 1;
client.current_requests += 1;
client.last_activity = Instant::now();
}
self.log_info(format!(
"YahooClient[{}] started request: {} {}",
instance_id,
endpoint,
symbol.unwrap_or_else(|| "search".to_string())
)).await;
}
MonitoringEvent::YahooRequestCompleted { instance_id, success, duration_ms, error } => {
let mut state = self.state.write().await;
// Update global Yahoo stats
if success {
state.global.successful_yahoo_requests += 1;
} else {
state.global.failed_yahoo_requests += 1;
}
// Update instance stats
if let Some(inst) = state.instances.get_mut(&instance_id) {
inst.yahoo_current_requests = inst.yahoo_current_requests.saturating_sub(1);
if success {
inst.yahoo_success += 1;
} else {
inst.yahoo_failures += 1;
}
inst.last_activity = Instant::now();
}
// Update Yahoo client stats
if let Some(client) = state.yahoo_clients.get_mut(&instance_id) {
client.current_requests = client.current_requests.saturating_sub(1);
if success {
client.requests_successful += 1;
} else {
client.requests_failed += 1;
}
client.last_activity = Instant::now();
}
if success {
self.log_info(format!(
"YahooClient[{}] completed request in {}ms",
instance_id, duration_ms
)).await;
} else {
self.log_error(format!(
"YahooClient[{}] failed request in {}ms: {}",
instance_id,
duration_ms,
error.unwrap_or_else(|| "unknown error".to_string())
)).await;
}
}
MonitoringEvent::YahooBatchRequestStarted { count, symbols, endpoint } => {
let mut state = self.state.write().await;
state.global.yahoo_batch_requests += 1;
self.log_info(format!(
"Yahoo batch request started: {} symbols, endpoint: {}",
count, endpoint
)).await;
if !symbols.is_empty() {
self.log_debug(format!(
"Batch symbols: {}",
symbols.join(", ")
)).await;
}
}
MonitoringEvent::YahooBatchRequestCompleted { successful, failed, total, duration_ms } => {
let success_rate = if total > 0 {
(successful as f64 / total as f64) * 100.0
} else {
0.0
};
self.log_info(format!(
"Yahoo batch completed: {}/{} successful ({:.1}%) in {}ms",
successful, total, success_rate, duration_ms
)).await;
}
MonitoringEvent::YahooClientCreated { instance_id, has_proxy, max_requests } => {
let mut state = self.state.write().await;
state.global.yahoo_client_count += 1;
state.yahoo_clients.insert(
instance_id,
YahooClientState {
instance_id,
requests_total: 0,
requests_successful: 0,
requests_failed: 0,
current_requests: 0,
max_requests,
has_proxy,
last_activity: Instant::now(),
proxy_info: None,
},
);
self.log_info(format!(
"YahooClient[{}] created (proxy: {}, max requests: {})",
instance_id, has_proxy, max_requests
)).await;
}
MonitoringEvent::YahooClientReset { instance_id, previous_requests, reason } => {
let mut state = self.state.write().await;
state.global.yahoo_session_renewals += 1;
if let Some(client) = state.yahoo_clients.get_mut(&instance_id) {
client.current_requests = 0;
client.last_activity = Instant::now();
}
self.log_info(format!(
"YahooClient[{}] reset (had {} requests, reason: {})",
instance_id, previous_requests, reason
)).await;
}
MonitoringEvent::LogMessage { level, message } => { MonitoringEvent::LogMessage { level, message } => {
match level { match level {
crate::monitoring::events::LogLevel::Info => self.log_info(message).await, crate::monitoring::events::LogLevel::Info => self.log_info(message).await,
@@ -317,6 +472,17 @@ impl MonitoringService {
}).await; }).await;
} }
/// Append a debug line to the dashboard log, gated on the `DEBUG_LOGGING`
/// environment variable.
///
/// The message is recorded only when `std::env::var("DEBUG_LOGGING")`
/// succeeds, i.e. the variable is set to valid Unicode; its value is not
/// inspected. Entries are stored at `Info` level with a `[DEBUG] ` prefix.
async fn log_debug(&self, message: String) {
    // Gate: without the variable, debug output is dropped silently.
    if std::env::var("DEBUG_LOGGING").is_err() {
        return;
    }

    let entry = LogEntry {
        timestamp: Local::now().format("%H:%M:%S").to_string(),
        level: super::metrics::LogLevel::Info,
        message: format!("[DEBUG] {}", message),
    };
    self.add_log(entry).await;
}
async fn add_log(&self, entry: LogEntry) { async fn add_log(&self, entry: LogEntry) {
let mut logs = self.logs.write().await; let mut logs = self.logs.write().await;
if logs.len() >= MAX_LOGS { if logs.len() >= MAX_LOGS {

View File

@@ -355,7 +355,7 @@ impl DockerVpnProxyPool {
pub fn get_proxy_url(&self, index: usize) -> String { pub fn get_proxy_url(&self, index: usize) -> String {
let port = self.proxy_ports[index % self.proxy_ports.len()]; let port = self.proxy_ports[index % self.proxy_ports.len()];
format!("socks5://localhost:{}", port) format!("socks5h://localhost:{}", port)
} }
pub fn num_proxies(&self) -> usize { pub fn num_proxies(&self) -> usize {

View File

@@ -2,3 +2,4 @@ pub mod webdriver;
pub mod docker_vpn_proxy; pub mod docker_vpn_proxy;
pub mod helpers; pub mod helpers;
pub mod hard_reset; pub mod hard_reset;
pub mod yahoo;

View File

@@ -582,6 +582,9 @@ impl ChromeDriverPool {
self.instances.len() self.instances.len()
} }
} }
/// Returns a clone of this pool's optional `DockerVpnProxyPool` handle.
///
/// Cloning only copies the `Option<Arc<..>>`, so the returned handle shares
/// the same underlying proxy pool; the result is `None` when the
/// `proxy_pool` field holds no pool.
pub fn get_proxy_pool(&self) -> Option<Arc<DockerVpnProxyPool>> {
self.proxy_pool.clone()
}
} }
/// Represents a single instance of chromedriver process, optionally bound to a VPN. /// Represents a single instance of chromedriver process, optionally bound to a VPN.

1349
src/scraper/yahoo.rs Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -2,6 +2,7 @@ use std::path::{Path, PathBuf};
use std::fs; use std::fs;
/// Central configuration for all data paths /// Central configuration for all data paths
#[derive(Clone)]
pub struct DataPaths { pub struct DataPaths {
base_dir: PathBuf, base_dir: PathBuf,
data_dir: PathBuf, data_dir: PathBuf,
@@ -16,6 +17,7 @@ pub struct DataPaths {
economic_events_dir: PathBuf, economic_events_dir: PathBuf,
economic_changes_dir: PathBuf, economic_changes_dir: PathBuf,
// Corporate data subdirectories // Corporate data subdirectories
corporate_dir: PathBuf,
corporate_events_dir: PathBuf, corporate_events_dir: PathBuf,
corporate_changes_dir: PathBuf, corporate_changes_dir: PathBuf,
corporate_prices_dir: PathBuf, corporate_prices_dir: PathBuf,
@@ -56,6 +58,7 @@ impl DataPaths {
fs::create_dir_all(&cache_openvpn_dir)?; fs::create_dir_all(&cache_openvpn_dir)?;
fs::create_dir_all(&economic_events_dir)?; fs::create_dir_all(&economic_events_dir)?;
fs::create_dir_all(&economic_changes_dir)?; fs::create_dir_all(&economic_changes_dir)?;
fs::create_dir_all(&corporate_dir)?;
fs::create_dir_all(&corporate_events_dir)?; fs::create_dir_all(&corporate_events_dir)?;
fs::create_dir_all(&corporate_changes_dir)?; fs::create_dir_all(&corporate_changes_dir)?;
fs::create_dir_all(&corporate_prices_dir)?; fs::create_dir_all(&corporate_prices_dir)?;
@@ -71,6 +74,7 @@ impl DataPaths {
cache_openvpn_dir, cache_openvpn_dir,
economic_events_dir, economic_events_dir,
economic_changes_dir, economic_changes_dir,
corporate_dir,
corporate_events_dir, corporate_events_dir,
corporate_changes_dir, corporate_changes_dir,
corporate_prices_dir, corporate_prices_dir,
@@ -119,6 +123,11 @@ impl DataPaths {
&self.economic_changes_dir &self.economic_changes_dir
} }
/// Get the corporate directory (the `corporate_dir` path itself, not the
/// events subdirectory returned by `corporate_events_dir`).
pub fn corporate_dir(&self) -> &Path {
&self.corporate_dir
}
/// Get the corporate events directory /// Get the corporate events directory
pub fn corporate_events_dir(&self) -> &Path { pub fn corporate_events_dir(&self) -> &Path {
&self.corporate_events_dir &self.corporate_events_dir