// src/corporate/update_companies_enrich.rs - MERGED VERSION WITH GENERIC ENRICHMENT use super::{types::*, helpers::*}; use crate::config::Config; use crate::corporate::checkpoint_helpers; use crate::util::directories::DataPaths; use crate::util::integrity::{StateManager, directory_reference, DataStage}; use crate::util::logger; use crate::scraper::yahoo::{YahooClientPool, QuoteSummaryModule}; use std::result::Result::Ok; use chrono::{TimeZone, Utc}; use std::collections::{HashSet}; use std::sync::Arc; use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; use tokio::fs::{OpenOptions}; use tokio::io::{AsyncWriteExt}; use futures::stream::{FuturesUnordered, StreamExt}; use serde_json::json; use tokio::sync::mpsc; use std::future::Future; use std::pin::Pin; /// Log command enum (shared across all enrichment types) enum LogCommand { Write(serde_json::Value), Checkpoint, Shutdown, } /// Type alias for enrichment function type EnrichmentFn = Arc< dyn Fn(CompanyData, Arc, DataPaths) -> Pin> + Send>> + Send + Sync >; // ============================================================================ // EVENTS ENRICHMENT // ============================================================================ /// Yahoo Event enrichment per corporate company /// /// # Features /// - Graceful shutdown (abort-safe) /// - Task panic isolation (tasks fail independently) /// - Crash-safe persistence (checkpoint + log with fsync) /// - Smart skip logic (only process incomplete data) /// - Uses pending queue instead of retry mechanism /// - Content integrity validation with hash tracking /// /// # Persistence Strategy /// - Checkpoint: companies_yahoo_cleaned.jsonl (atomic state) /// - Log: companies_events_updates.log (append-only updates) /// - On restart: Load checkpoint + replay log /// - Periodic checkpoints (every 50 companies) /// - Batched fsync (every 10 writes or 10 seconds) /// - Hash validation of all event data directories pub async fn enrich_companies_with_events( paths: &DataPaths, _config: &Config, yahoo_pool: Arc, shutdown_flag: &Arc, ) -> anyhow::Result { // Configuration constants const CHECKPOINT_INTERVAL: usize = 50; const FSYNC_BATCH_SIZE: usize = 10; const FSYNC_INTERVAL_SECS: u64 = 10; const CONCURRENCY_LIMIT: usize = 50; let data_path = paths.data_dir(); // File paths let input_path = data_path.join("companies_yahoo_cleaned.jsonl"); let log_path = data_path.join("companies_events_updates.log"); // Check input exists if !input_path.exists() { logger::log_warn(" companies_yahoo_cleaned.jsonl not found, skipping event enrichment").await; return Ok(0); } let manager = StateManager::new(paths.integrity_dir()).await?; let step_name = "yahoo_events_enrichment_complete"; if manager.is_step_valid(step_name).await? { logger::log_info(" Yahoo events enrichment already completed and valid").await; // Count enriched companies let count = checkpoint_helpers::count_enriched_companies(paths, "events").await?; logger::log_info(&format!(" ✓ Found {} companies with valid event data", count)).await; return Ok(count); } logger::log_info(" Event data needs refresh - starting enrichment").await; // === RECOVERY PHASE: Track enriched companies === let enriched_companies: HashSet = checkpoint_helpers::load_enrichment_progress(&log_path).await?; // Load all companies from input logger::log_info("Loading companies from companies_yahoo_cleaned.jsonl...").await; let companies = load_companies_from_jsonl(&input_path).await?; let total_companies = companies.len(); logger::log_info(&format!("Found {} companies to process", total_companies)).await; // Filter companies that need enrichment let pending_companies: Vec = companies .into_iter() .filter(|company| !enriched_companies.contains(&company.name)) .collect(); let pending_count = pending_companies.len(); logger::log_info(&format!( " {} already enriched, {} pending", enriched_companies.len(), pending_count )).await; if pending_count == 0 { logger::log_info(" ✓ All companies already enriched").await; track_events_completion(&manager, paths, step_name).await?; return Ok(enriched_companies.len()); } // === PROCESSING PHASE: Enrich companies with events === // Create enrichment function let enrichment_fn: EnrichmentFn = Arc::new(move |company, pool, paths| { let company = company.clone(); let pool = Arc::clone(&pool); let paths = paths.clone(); Box::pin(async move { enrich_company_with_events(&company, &pool, &paths).await }) }); // Shared counters let processed_count = Arc::new(AtomicUsize::new(enriched_companies.len())); let success_count = Arc::new(AtomicUsize::new(enriched_companies.len())); let failed_count = Arc::new(AtomicUsize::new(0)); // Log writer channel with batching and fsync let (log_tx, log_rx) = mpsc::channel::(1000); // Spawn log writer task let log_writer_handle = spawn_log_writer( log_path, log_rx, Arc::clone(&processed_count), total_companies, FSYNC_BATCH_SIZE, FSYNC_INTERVAL_SECS, ); // Process companies concurrently with task panic isolation let mut tasks = FuturesUnordered::new(); let mut pending_iter = pending_companies.into_iter(); let semaphore = Arc::new(tokio::sync::Semaphore::new(CONCURRENCY_LIMIT)); // Initial batch of tasks for _ in 0..CONCURRENCY_LIMIT.min(pending_count) { if let Some(company) = pending_iter.next() { let task = spawn_enrichment_task( company, Arc::clone(&yahoo_pool), paths.clone(), Arc::clone(&processed_count), Arc::clone(&success_count), Arc::clone(&failed_count), log_tx.clone(), Arc::clone(&semaphore), Arc::clone(shutdown_flag), Arc::clone(&enrichment_fn), ); tasks.push(task); } } // Process results and spawn new tasks let mut checkpoint_counter = enriched_companies.len(); while let Some(result) = tasks.next().await { // Handle task result (even if panicked) match result { Ok(_) => { // Task completed successfully } Err(e) => { logger::log_warn(&format!("Task panicked: {}", e)).await; failed_count.fetch_add(1, Ordering::SeqCst); } } // Check for shutdown if shutdown_flag.load(Ordering::SeqCst) { logger::log_warn("Shutdown signal received, stopping event enrichment").await; break; } // Checkpoint periodically checkpoint_counter += 1; if checkpoint_counter % CHECKPOINT_INTERVAL == 0 { let _ = log_tx.send(LogCommand::Checkpoint).await; } // Spawn next task if available if let Some(company) = pending_iter.next() { let task = spawn_enrichment_task( company, Arc::clone(&yahoo_pool), paths.clone(), Arc::clone(&processed_count), Arc::clone(&success_count), Arc::clone(&failed_count), log_tx.clone(), Arc::clone(&semaphore), Arc::clone(shutdown_flag), Arc::clone(&enrichment_fn), ); tasks.push(task); } } // Signal log writer to shutdown let _ = log_tx.send(LogCommand::Shutdown).await; drop(log_tx); // Wait for log writer to finish let _ = log_writer_handle.await; let final_processed = processed_count.load(Ordering::SeqCst); let final_success = success_count.load(Ordering::SeqCst); let final_failed = failed_count.load(Ordering::SeqCst); logger::log_info(&format!( " Event enrichment summary: {} total, {} success, {} failed", final_processed, final_success, final_failed )).await; // Mark as complete if all companies processed if final_processed >= total_companies && !shutdown_flag.load(Ordering::SeqCst) { track_events_completion(&manager, paths, step_name).await?; logger::log_info(" ✓ Event enrichment marked as complete with integrity tracking").await; } Ok(final_success) } /// Track event enrichment completion with content hash verification async fn track_events_completion( manager: &StateManager, paths: &DataPaths, step_name: &str, ) -> anyhow::Result<()> { // Create content reference for all event data // This will hash ALL files matching the pattern: {company}/events/data.jsonl let content_reference = directory_reference( paths.corporate_dir(), Some(vec![ "*/events/*.jsonl".to_string(), // Main pattern for events data "*/events/data.jsonl".to_string(), // Specific pattern (more precise) ]), Some(vec![ "*.log".to_string(), // Exclude log files "*.tmp".to_string(), // Exclude temp files "*.bak".to_string(), // Exclude backup files ]), ); // Track completion with: // - Content reference: All event directories // - Data stage: Data (7-day TTL by default) // - Dependencies: Depends on cleaned companies data manager.update_entry( step_name.to_string(), content_reference, DataStage::Data, None, // Use default TTL (7 days for Data stage) ).await?; Ok(()) } /// Enrich a single company with event data async fn enrich_company_with_events( company: &CompanyData, yahoo_pool: &Arc, paths: &DataPaths, ) -> anyhow::Result<()> { use std::collections::HashMap; let ticker = match extract_first_yahoo_ticker(company) { Some(t) => t, None => { return Err(anyhow::anyhow!("No valid Yahoo ticker found")); } }; // Combined summary to accumulate data from all available modules let mut combined_modules: HashMap = HashMap::new(); let timestamp = chrono::Utc::now().timestamp(); // Try each event module individually let event_modules = QuoteSummaryModule::event_modules(); for module in event_modules { match yahoo_pool.get_quote_summary(&ticker, &[module]).await { Ok(summary) => { // Merge this module's data into combined summary for (key, value) in summary.modules { combined_modules.insert(key, value); } } Err(e) => { // Module not available - silently continue for expected errors let err_str = e.to_string(); if err_str.contains("500") || err_str.contains("404") || err_str.contains("Not Found") { // Expected for securities without this data - continue silently continue; } else { // Unexpected error - log but continue trying other modules logger::log_warn(&format!( " Unexpected error fetching event module for {}: {}", ticker, e )).await; } } } } // Only save if we got at least some data if combined_modules.is_empty() { return Err(anyhow::anyhow!("No event data available for any module")); } // Create combined summary with all available modules let combined_summary = crate::scraper::yahoo::QuoteSummary { symbol: ticker.clone(), modules: combined_modules, timestamp, }; // Save the combined event data save_company_event_data(paths, &company.name, &combined_summary).await?; Ok(()) } /// Save event data to company directory async fn save_company_event_data( paths: &DataPaths, company_name: &str, summary: &crate::scraper::yahoo::QuoteSummary, ) -> anyhow::Result<()> { use tokio::fs; let safe_name = sanitize_company_name(company_name); let company_dir = paths.corporate_dir().join(&safe_name).join("events"); fs::create_dir_all(&company_dir).await?; let data_path = company_dir.join("data.jsonl"); let json_line = serde_json::to_string(summary)?; let mut file = fs::File::create(&data_path).await?; file.write_all(json_line.as_bytes()).await?; file.write_all(b"\n").await?; file.flush().await?; file.sync_all().await?; // Ensure data is persisted Ok(()) } // ============================================================================ // OPTION ENRICHMENT // ============================================================================ /// Yahoo Option enrichment per corporate company /// /// # Features /// - Graceful shutdown (abort-safe) /// - Task panic isolation (tasks fail independently) /// - Crash-safe persistence (checkpoint + log with fsync) /// - Smart skip logic (only process incomplete data) /// - Uses pending queue instead of retry mechanism /// - Content integrity validation with hash tracking /// /// # Persistence Strategy /// - Checkpoint: companies_yahoo_cleaned.jsonl (atomic state) /// - Log: companies_option_updates.log (append-only updates) /// - On restart: Load checkpoint + replay log /// - Periodic checkpoints (every 50 companies) /// - Batched fsync (every 10 writes or 10 seconds) /// - Hash validation of all option data directories pub async fn enrich_companies_with_option( paths: &DataPaths, _config: &Config, yahoo_pool: Arc, shutdown_flag: &Arc, ) -> anyhow::Result { // Configuration constants const CHECKPOINT_INTERVAL: usize = 50; const FSYNC_BATCH_SIZE: usize = 10; const FSYNC_INTERVAL_SECS: u64 = 10; const CONCURRENCY_LIMIT: usize = 50; let data_path = paths.data_dir(); // File paths let input_path = data_path.join("companies_yahoo_cleaned.jsonl"); let log_path = data_path.join("companies_option_updates.log"); // Check input exists if !input_path.exists() { logger::log_warn(" companies_yahoo_cleaned.jsonl not found, skipping option enrichment").await; return Ok(0); } let manager = StateManager::new(paths.integrity_dir()).await?; let step_name = "yahoo_option_enrichment_complete"; if manager.is_step_valid(step_name).await? { logger::log_info(" Yahoo option enrichment already completed and valid").await; let count = checkpoint_helpers::count_enriched_companies(paths, "option").await?; logger::log_info(&format!(" ✓ Found {} companies with valid option data", count)).await; return Ok(count); } logger::log_info(" Option data needs refresh - starting enrichment").await; // === RECOVERY PHASE: Track enriched companies === let enriched_companies = checkpoint_helpers::load_enrichment_progress(&log_path).await?; // Load all companies from input logger::log_info("Loading companies from companies_yahoo_cleaned.jsonl...").await; let companies = load_companies_from_jsonl(&input_path).await?; let total_companies = companies.len(); logger::log_info(&format!("Found {} companies to process", total_companies)).await; // Filter companies that need enrichment let pending_companies: Vec = companies .into_iter() .filter(|company| !enriched_companies.contains(&company.name)) .collect(); let pending_count = pending_companies.len(); logger::log_info(&format!( " {} already enriched, {} pending", enriched_companies.len(), pending_count )).await; if pending_count == 0 { logger::log_info(" ✓ All companies already enriched").await; track_option_completion(&manager, paths, step_name).await?; return Ok(enriched_companies.len()); } // === PROCESSING PHASE: Enrich companies with option === // Create enrichment function let enrichment_fn: EnrichmentFn = Arc::new(move |company, pool, paths| { let company = company.clone(); let pool = Arc::clone(&pool); let paths = paths.clone(); Box::pin(async move { enrich_company_with_option(&company, &pool, &paths).await }) }); // Shared counters let processed_count = Arc::new(AtomicUsize::new(enriched_companies.len())); let success_count = Arc::new(AtomicUsize::new(enriched_companies.len())); let failed_count = Arc::new(AtomicUsize::new(0)); // Log writer channel with batching and fsync let (log_tx, log_rx) = mpsc::channel::(1000); // Spawn log writer task let log_writer_handle = spawn_log_writer( log_path, log_rx, Arc::clone(&processed_count), total_companies, FSYNC_BATCH_SIZE, FSYNC_INTERVAL_SECS, ); // Process companies concurrently with task panic isolation let mut tasks = FuturesUnordered::new(); let mut pending_iter = pending_companies.into_iter(); let semaphore = Arc::new(tokio::sync::Semaphore::new(CONCURRENCY_LIMIT)); // Initial batch of tasks for _ in 0..CONCURRENCY_LIMIT.min(pending_count) { if let Some(company) = pending_iter.next() { let task = spawn_enrichment_task( company, Arc::clone(&yahoo_pool), paths.clone(), Arc::clone(&processed_count), Arc::clone(&success_count), Arc::clone(&failed_count), log_tx.clone(), Arc::clone(&semaphore), Arc::clone(shutdown_flag), Arc::clone(&enrichment_fn), ); tasks.push(task); } } // Process tasks as they complete and spawn new ones let mut checkpoint_counter = 0; while let Some(_result) = tasks.next().await { // Check for shutdown if shutdown_flag.load(Ordering::SeqCst) { logger::log_warn("Shutdown signal received, stopping option enrichment").await; break; } // Checkpoint periodically checkpoint_counter += 1; if checkpoint_counter % CHECKPOINT_INTERVAL == 0 { let _ = log_tx.send(LogCommand::Checkpoint).await; } // Spawn next task if available if let Some(company) = pending_iter.next() { let task = spawn_enrichment_task( company, Arc::clone(&yahoo_pool), paths.clone(), Arc::clone(&processed_count), Arc::clone(&success_count), Arc::clone(&failed_count), log_tx.clone(), Arc::clone(&semaphore), Arc::clone(shutdown_flag), Arc::clone(&enrichment_fn), ); tasks.push(task); } } // Signal log writer to shutdown let _ = log_tx.send(LogCommand::Shutdown).await; drop(log_tx); // Wait for log writer to finish let _ = log_writer_handle.await; let final_processed = processed_count.load(Ordering::SeqCst); let final_success = success_count.load(Ordering::SeqCst); let final_failed = failed_count.load(Ordering::SeqCst); logger::log_info(&format!( " Option enrichment summary: {} total, {} success, {} failed", final_processed, final_success, final_failed )).await; // Mark as complete if all companies processed if final_processed >= total_companies && !shutdown_flag.load(Ordering::SeqCst) { track_option_completion(&manager, paths, step_name).await?; logger::log_info(" ✓ Option enrichment marked as complete with integrity tracking").await; } Ok(final_success) } /// Track option enrichment completion with content hash verification async fn track_option_completion( manager: &StateManager, paths: &DataPaths, step_name: &str, ) -> anyhow::Result<()> { // Create content reference for all option data // This will hash ALL files matching the pattern: {company}/option/data.jsonl let content_reference = directory_reference( paths.corporate_dir(), Some(vec![ "*/option/*.jsonl".to_string(), // Main pattern for option data "*/option/data.jsonl".to_string(), // Specific pattern (more precise) ]), Some(vec![ "*.log".to_string(), // Exclude log files "*.tmp".to_string(), // Exclude temp files "*.bak".to_string(), // Exclude backup files ]), ); // Track completion with: // - Content reference: All option directories // - Data stage: Data (7-day TTL by default) // - Dependencies: Depends on cleaned companies data manager.update_entry( step_name.to_string(), content_reference, DataStage::Data, None, // Use default TTL (7 days for Data stage) ).await?; Ok(()) } /// Enrich a single company with option data async fn enrich_company_with_option( company: &CompanyData, yahoo_pool: &Arc, paths: &DataPaths, ) -> anyhow::Result<()> { let ticker = match extract_first_yahoo_ticker(company) { Some(t) => t, None => { return Err(anyhow::anyhow!("No valid Yahoo ticker found")); } }; // Get option data for all available expiration dates let option_data = yahoo_pool.get_option_data(&ticker, None).await?; // Only save if we got meaningful data if option_data.option.is_empty() { return Err(anyhow::anyhow!("No option data available")); } // Save the option data save_company_data(paths, &company.name, &option_data, "option").await?; Ok(()) } // ============================================================================ // CHART ENRICHMENT // ============================================================================ /// Yahoo Chart enrichment per corporate company /// /// # Features /// - Graceful shutdown (abort-safe) /// - Task panic isolation (tasks fail independently) /// - Crash-safe persistence (checkpoint + log with fsync) /// - Smart skip logic (only process incomplete data) /// - Uses pending queue instead of retry mechanism /// - Content integrity validation with hash tracking /// /// # Persistence Strategy /// - Checkpoint: companies_yahoo_cleaned.jsonl (atomic state) /// - Log: companies_chart_updates.log (append-only updates) /// - On restart: Load checkpoint + replay log /// - Periodic checkpoints (every 50 companies) /// - Batched fsync (every 10 writes or 10 seconds) /// - Hash validation of all chart data directories pub async fn enrich_companies_with_chart( paths: &DataPaths, _config: &Config, yahoo_pool: Arc, shutdown_flag: &Arc, ) -> anyhow::Result { // Configuration constants const CHECKPOINT_INTERVAL: usize = 50; const FSYNC_BATCH_SIZE: usize = 10; const FSYNC_INTERVAL_SECS: u64 = 10; const CONCURRENCY_LIMIT: usize = 50; let data_path = paths.data_dir(); // File paths let input_path = data_path.join("companies_yahoo_cleaned.jsonl"); let log_path = data_path.join("companies_chart_updates.log"); // Check input exists if !input_path.exists() { logger::log_warn(" companies_yahoo_cleaned.jsonl not found, skipping chart enrichment").await; return Ok(0); } let manager = StateManager::new(paths.integrity_dir()).await?; let step_name = "yahoo_chart_enrichment_complete"; if manager.is_step_valid(step_name).await? { logger::log_info(" Yahoo chart enrichment already completed and valid").await; let count = checkpoint_helpers::count_enriched_companies(paths, "chart").await?; logger::log_info(&format!(" ✓ Found {} companies with valid chart data", count)).await; return Ok(count); } logger::log_info(" Chart data needs refresh - starting enrichment").await; // === RECOVERY PHASE: Track enriched companies === let enriched_companies = checkpoint_helpers::load_enrichment_progress(&log_path).await?; // Load all companies from input logger::log_info("Loading companies from companies_yahoo_cleaned.jsonl...").await; let companies = load_companies_from_jsonl(&input_path).await?; let total_companies = companies.len(); logger::log_info(&format!("Found {} companies to process", total_companies)).await; // Filter companies that need enrichment let pending_companies: Vec = companies .into_iter() .filter(|company| !enriched_companies.contains(&company.name)) .collect(); let pending_count = pending_companies.len(); logger::log_info(&format!( " {} already enriched, {} pending", enriched_companies.len(), pending_count )).await; if pending_count == 0 { logger::log_info(" ✓ All companies already enriched").await; track_chart_completion(&manager, paths, step_name).await?; return Ok(enriched_companies.len()); } // === PROCESSING PHASE: Enrich companies with chart === // Create enrichment function let enrichment_fn: EnrichmentFn = Arc::new(move |company, pool, paths| { let company = company.clone(); let pool = Arc::clone(&pool); let paths = paths.clone(); Box::pin(async move { enrich_company_with_chart(&company, &pool, &paths).await }) }); // Shared counters let processed_count = Arc::new(AtomicUsize::new(enriched_companies.len())); let success_count = Arc::new(AtomicUsize::new(enriched_companies.len())); let failed_count = Arc::new(AtomicUsize::new(0)); // Log writer channel with batching and fsync let (log_tx, log_rx) = mpsc::channel::(1000); // Spawn log writer task let log_writer_handle = spawn_log_writer( log_path, log_rx, Arc::clone(&processed_count), total_companies, FSYNC_BATCH_SIZE, FSYNC_INTERVAL_SECS, ); // Process companies concurrently with task panic isolation let mut tasks = FuturesUnordered::new(); let mut pending_iter = pending_companies.into_iter(); let semaphore = Arc::new(tokio::sync::Semaphore::new(CONCURRENCY_LIMIT)); // Initial batch of tasks for _ in 0..CONCURRENCY_LIMIT.min(pending_count) { if let Some(company) = pending_iter.next() { let task = spawn_enrichment_task( company, Arc::clone(&yahoo_pool), paths.clone(), Arc::clone(&processed_count), Arc::clone(&success_count), Arc::clone(&failed_count), log_tx.clone(), Arc::clone(&semaphore), Arc::clone(shutdown_flag), Arc::clone(&enrichment_fn), ); tasks.push(task); } } // Process tasks as they complete and spawn new ones let mut checkpoint_counter = 0; while let Some(_result) = tasks.next().await { // Check for shutdown if shutdown_flag.load(Ordering::SeqCst) { logger::log_warn("Shutdown signal received, stopping chart enrichment").await; break; } // Checkpoint periodically checkpoint_counter += 1; if checkpoint_counter % CHECKPOINT_INTERVAL == 0 { let _ = log_tx.send(LogCommand::Checkpoint).await; } // Spawn next task if available if let Some(company) = pending_iter.next() { let task = spawn_enrichment_task( company, Arc::clone(&yahoo_pool), paths.clone(), Arc::clone(&processed_count), Arc::clone(&success_count), Arc::clone(&failed_count), log_tx.clone(), Arc::clone(&semaphore), Arc::clone(shutdown_flag), Arc::clone(&enrichment_fn), ); tasks.push(task); } } // Signal log writer to shutdown let _ = log_tx.send(LogCommand::Shutdown).await; drop(log_tx); // Wait for log writer to finish let _ = log_writer_handle.await; let final_processed = processed_count.load(Ordering::SeqCst); let final_success = success_count.load(Ordering::SeqCst); let final_failed = failed_count.load(Ordering::SeqCst); logger::log_info(&format!( " Chart enrichment summary: {} total, {} success, {} failed", final_processed, final_success, final_failed )).await; // Mark as complete if all companies processed if final_processed >= total_companies && !shutdown_flag.load(Ordering::SeqCst) { track_chart_completion(&manager, paths, step_name).await?; logger::log_info(" ✓ Chart enrichment marked as complete with integrity tracking").await; } Ok(final_success) } /// Track chart enrichment completion with content hash verification async fn track_chart_completion( manager: &StateManager, paths: &DataPaths, step_name: &str, ) -> anyhow::Result<()> { // Create content reference for all chart data // This will hash ALL files matching the pattern: {company}/chart/data.jsonl let content_reference = directory_reference( paths.corporate_dir(), Some(vec![ "*/chart/*.jsonl".to_string(), // Main pattern for chart data "*/chart/data.jsonl".to_string(), // Specific pattern (more precise) ]), Some(vec![ "*.log".to_string(), // Exclude log files "*.tmp".to_string(), // Exclude temp files "*.bak".to_string(), // Exclude backup files ]), ); // Track completion with: // - Content reference: All chart directories // - Data stage: Data (7-day TTL by default) // - Dependencies: Depends on cleaned companies data manager.update_entry( step_name.to_string(), content_reference, DataStage::Data, None, // Use default TTL (7 days for Data stage) ).await?; Ok(()) } /// Enrich a single company with chart data async fn enrich_company_with_chart( company: &CompanyData, yahoo_pool: &Arc, paths: &DataPaths, ) -> anyhow::Result<()> { let ticker = match extract_first_yahoo_ticker(company) { Some(t) => t, None => { return Err(anyhow::anyhow!("No valid Yahoo ticker found")); } }; // Get 1 year of daily chart data let now = chrono::Utc::now().timestamp(); let start = chrono::Utc .with_ymd_and_hms(2000, 1, 1, 0, 0, 0) .unwrap() .timestamp(); let chart_data = yahoo_pool.get_chart_data(&ticker, "1d", start, now).await?; // Only save if we got meaningful data if chart_data.quotes.is_empty() { return Err(anyhow::anyhow!("No chart data available")); } // Save the chart data save_company_data(paths, &company.name, &chart_data, "chart").await?; Ok(()) } /// Save data to company directory (generic version) async fn save_company_data( paths: &DataPaths, company_name: &str, data: &T, data_type: &str, ) -> anyhow::Result<()> { use tokio::fs; let safe_name = sanitize_company_name(company_name); let company_dir = paths.corporate_dir().join(&safe_name).join(data_type); fs::create_dir_all(&company_dir).await?; let data_path = company_dir.join("data.jsonl"); let json_line = serde_json::to_string(data)?; let mut file = fs::File::create(&data_path).await?; file.write_all(json_line.as_bytes()).await?; file.write_all(b"\n").await?; file.flush().await?; file.sync_all().await?; // Ensure data is persisted Ok(()) } // ============================================================================ // GENERIC SHARED FUNCTIONS // ============================================================================ /// Spawn log writer task (shared across all enrichment types) fn spawn_log_writer( log_path: std::path::PathBuf, mut log_rx: mpsc::Receiver, processed_count: Arc, total_companies: usize, fsync_batch_size: usize, fsync_interval_secs: u64, ) -> tokio::task::JoinHandle<()> { tokio::spawn(async move { let mut log_file = OpenOptions::new() .create(true) .append(true) .open(&log_path) .await .expect("Failed to open log file"); let mut write_count = 0; let mut last_fsync = tokio::time::Instant::now(); while let Some(cmd) = log_rx.recv().await { match cmd { LogCommand::Write(entry) => { let json_line = serde_json::to_string(&entry).expect("Serialization failed"); log_file.write_all(json_line.as_bytes()).await.expect("Write failed"); log_file.write_all(b"\n").await.expect("Write failed"); write_count += 1; // Batched fsync if write_count >= fsync_batch_size || last_fsync.elapsed().as_secs() >= fsync_interval_secs { log_file.flush().await.expect("Flush failed"); log_file.sync_all().await.expect("Fsync failed"); write_count = 0; last_fsync = tokio::time::Instant::now(); } } LogCommand::Checkpoint => { // Force fsync on checkpoint log_file.flush().await.expect("Flush failed"); log_file.sync_all().await.expect("Fsync failed"); write_count = 0; last_fsync = tokio::time::Instant::now(); let current = processed_count.load(Ordering::SeqCst); logger::log_info(&format!( " Checkpoint: {}/{} companies processed", current, total_companies )).await; } LogCommand::Shutdown => { // Final fsync before shutdown log_file.flush().await.expect("Flush failed"); log_file.sync_all().await.expect("Fsync failed"); break; } } } }) } /// Spawn a single enrichment task with panic isolation (GENERIC VERSION) /// /// This generic version accepts an enrichment function as a parameter, /// allowing it to be reused for events, options, charts, or any other enrichment type. /// /// # Parameters /// - `company`: The company to enrich /// - `yahoo_pool`: Yahoo API client pool /// - `paths`: Data paths /// - `processed_count`: Counter for processed companies /// - `success_count`: Counter for successful enrichments /// - `failed_count`: Counter for failed enrichments /// - `log_tx`: Channel to send log commands /// - `semaphore`: Semaphore for concurrency control /// - `shutdown_flag`: Flag to signal shutdown /// - `enrichment_fn`: The specific enrichment function to call (events, option, chart, etc.) fn spawn_enrichment_task( company: CompanyData, yahoo_pool: Arc, paths: DataPaths, processed_count: Arc, success_count: Arc, failed_count: Arc, log_tx: mpsc::Sender, semaphore: Arc, shutdown_flag: Arc, enrichment_fn: EnrichmentFn, ) -> tokio::task::JoinHandle<()> { tokio::spawn(async move { // Acquire semaphore permit let _permit = semaphore.acquire().await.expect("Semaphore closed"); // Check shutdown before processing if shutdown_flag.load(Ordering::SeqCst) { return; } // Call the enrichment function (this is where the type-specific logic happens) let result = enrichment_fn(company.clone(), Arc::clone(&yahoo_pool), paths).await; // Update counters processed_count.fetch_add(1, Ordering::SeqCst); let status = match result { Ok(_) => { success_count.fetch_add(1, Ordering::SeqCst); "enriched" } Err(e) => { failed_count.fetch_add(1, Ordering::SeqCst); logger::log_warn(&format!( " Failed to enrich {}: {}", company.name, e )).await; "failed" } }; // Log result let log_entry = json!({ "company_name": company.name, "status": status, "timestamp": Utc::now().to_rfc3339(), }); let _ = log_tx.send(LogCommand::Write(log_entry)).await; }) }