Files
WebScraper/src/corporate/update_companies_enrich.rs

1058 lines
37 KiB
Rust

// src/corporate/update_companies_enrich.rs - MERGED VERSION WITH GENERIC ENRICHMENT
use super::{types::*, helpers::*};
use crate::config::Config;
use crate::corporate::checkpoint_helpers;
use crate::util::directories::DataPaths;
use crate::util::integrity::{StateManager, directory_reference, DataStage};
use crate::util::logger;
use crate::scraper::yahoo::{YahooClientPool, QuoteSummaryModule};
use std::result::Result::Ok;
use chrono::{TimeZone, Utc};
use std::collections::{HashSet};
use std::sync::Arc;
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
use tokio::fs::{OpenOptions};
use tokio::io::{AsyncWriteExt};
use futures::stream::{FuturesUnordered, StreamExt};
use serde_json::json;
use tokio::sync::mpsc;
use std::future::Future;
use std::pin::Pin;
/// Commands accepted by the log writer task (shared across all enrichment types).
///
/// Values of this type are sent over the `mpsc` channel whose receiving end is
/// consumed by `spawn_log_writer`.
enum LogCommand {
/// Serialize the JSON value and append it to the log file as one line.
Write(serde_json::Value),
/// Force a flush + fsync of the log file and emit a progress message.
Checkpoint,
/// Flush + fsync one last time, then stop the writer loop.
Shutdown,
}
/// Type alias for a shareable, type-erased enrichment callback.
///
/// The callback receives an owned `CompanyData`, a handle to the Yahoo client
/// pool and an owned `DataPaths`, and returns a boxed, pinned, `Send` future
/// resolving to `anyhow::Result<()>`. The `Send + Sync` bounds on the trait
/// object together with the `Arc` wrapper allow the same callback to be handed
/// to every task created by `spawn_enrichment_task`.
type EnrichmentFn = Arc<
dyn Fn(CompanyData, Arc<YahooClientPool>, DataPaths)
-> Pin<Box<dyn Future<Output = anyhow::Result<()>> + Send>>
+ Send
+ Sync
>;
// ============================================================================
// EVENTS ENRICHMENT
// ============================================================================
/// Enriches every company listed in `companies_yahoo_cleaned.jsonl` with Yahoo event data.
///
/// # Flow
/// 1. Returns `Ok(0)` immediately when the input file does not exist.
/// 2. When the `yahoo_events_enrichment_complete` step is still valid, returns the
///    count reported by `checkpoint_helpers::count_enriched_companies`.
/// 3. Otherwise loads the company names already recorded in
///    `companies_events_updates.log`, skips those companies and processes the
///    remaining ones concurrently.
///
/// # Concurrency and persistence
/// - At most `CONCURRENCY_LIMIT` (50) tasks are in flight; a replacement task is
///   spawned each time one finishes.
/// - Task outcomes are appended to the log by a dedicated writer task
///   (see `spawn_log_writer`).
/// - A `LogCommand::Checkpoint` is sent whenever the completion counter reaches a
///   multiple of `CHECKPOINT_INTERVAL` (50). The counter starts at the number of
///   companies already recorded in the log.
/// - A task whose join handle yields an error (for example after a panic) is
///   logged and counted as failed; the other tasks keep running.
/// - Once `shutdown_flag` is observed set after a task finishes, no further tasks
///   are spawned and the loop ends.
/// - The completion step is recorded only when the processed counter has reached
///   the number of input companies and no shutdown was requested.
///
/// # Returns
/// The final value of the success counter, which is initialised with the number
/// of companies already present in the log.
///
/// # Errors
/// Propagates failures from creating or querying the `StateManager`, loading the
/// progress log, loading the input companies and recording the completion step.
pub async fn enrich_companies_with_events(
    paths: &DataPaths,
    _config: &Config,
    yahoo_pool: Arc<YahooClientPool>,
    shutdown_flag: &Arc<AtomicBool>,
) -> anyhow::Result<usize> {
    const CHECKPOINT_INTERVAL: usize = 50;
    const FSYNC_BATCH_SIZE: usize = 10;
    const FSYNC_INTERVAL_SECS: u64 = 10;
    const CONCURRENCY_LIMIT: usize = 50;

    let data_dir = paths.data_dir();
    let input_path = data_dir.join("companies_yahoo_cleaned.jsonl");
    let log_path = data_dir.join("companies_events_updates.log");

    // Nothing to enrich without the cleaned company list.
    if !input_path.exists() {
        logger::log_warn(" companies_yahoo_cleaned.jsonl not found, skipping event enrichment").await;
        return Ok(0);
    }

    let manager = StateManager::new(paths.integrity_dir()).await?;
    let step_name = "yahoo_events_enrichment_complete";

    // A still-valid completion entry means there is nothing to do.
    if manager.is_step_valid(step_name).await? {
        logger::log_info(" Yahoo events enrichment already completed and valid").await;
        let count = checkpoint_helpers::count_enriched_companies(paths, "events").await?;
        logger::log_info(&format!(" ✓ Found {} companies with valid event data", count)).await;
        return Ok(count);
    }
    logger::log_info(" Event data needs refresh - starting enrichment").await;

    // --- Recovery: names already recorded in the progress log ---
    let done_names: HashSet<String> =
        checkpoint_helpers::load_enrichment_progress(&log_path).await?;
    let already_enriched = done_names.len();

    logger::log_info("Loading companies from companies_yahoo_cleaned.jsonl...").await;
    let all_companies = load_companies_from_jsonl(&input_path).await?;
    let total_companies = all_companies.len();
    logger::log_info(&format!("Found {} companies to process", total_companies)).await;

    let pending_companies: Vec<CompanyData> = all_companies
        .into_iter()
        .filter(|company| !done_names.contains(&company.name))
        .collect();
    let pending_count = pending_companies.len();
    logger::log_info(&format!(
        " {} already enriched, {} pending",
        already_enriched, pending_count
    ))
    .await;

    if pending_count == 0 {
        logger::log_info(" ✓ All companies already enriched").await;
        track_events_completion(&manager, paths, step_name).await?;
        return Ok(already_enriched);
    }

    // --- Processing: type-specific callback handed to the generic task spawner ---
    let enrichment_fn: EnrichmentFn = Arc::new(move |company, pool, paths| {
        Box::pin(async move { enrich_company_with_events(&company, &pool, &paths).await })
    });

    // Counters start at the number of companies recovered from the log.
    let processed_count = Arc::new(AtomicUsize::new(already_enriched));
    let success_count = Arc::new(AtomicUsize::new(already_enriched));
    let failed_count = Arc::new(AtomicUsize::new(0));

    let (log_tx, log_rx) = mpsc::channel::<LogCommand>(1000);
    let log_writer_handle = spawn_log_writer(
        log_path,
        log_rx,
        Arc::clone(&processed_count),
        total_companies,
        FSYNC_BATCH_SIZE,
        FSYNC_INTERVAL_SECS,
    );

    let mut tasks = FuturesUnordered::new();
    let mut pending_iter = pending_companies.into_iter();
    let semaphore = Arc::new(tokio::sync::Semaphore::new(CONCURRENCY_LIMIT));

    // Single place that knows how to start a task for one company.
    let spawn_for = |company: CompanyData| {
        spawn_enrichment_task(
            company,
            Arc::clone(&yahoo_pool),
            paths.clone(),
            Arc::clone(&processed_count),
            Arc::clone(&success_count),
            Arc::clone(&failed_count),
            log_tx.clone(),
            Arc::clone(&semaphore),
            Arc::clone(shutdown_flag),
            Arc::clone(&enrichment_fn),
        )
    };

    // Seed the pool with the first batch.
    for company in pending_iter.by_ref().take(CONCURRENCY_LIMIT) {
        tasks.push(spawn_for(company));
    }

    // Refill the pool as tasks complete.
    let mut checkpoint_counter = already_enriched;
    while let Some(joined) = tasks.next().await {
        if let Err(e) = joined {
            logger::log_warn(&format!("Task panicked: {}", e)).await;
            failed_count.fetch_add(1, Ordering::SeqCst);
        }

        if shutdown_flag.load(Ordering::SeqCst) {
            logger::log_warn("Shutdown signal received, stopping event enrichment").await;
            break;
        }

        checkpoint_counter += 1;
        if checkpoint_counter % CHECKPOINT_INTERVAL == 0 {
            let _ = log_tx.send(LogCommand::Checkpoint).await;
        }

        if let Some(company) = pending_iter.next() {
            tasks.push(spawn_for(company));
        }
    }

    // Stop the log writer and wait until it has finished.
    let _ = log_tx.send(LogCommand::Shutdown).await;
    drop(log_tx);
    let _ = log_writer_handle.await;

    let final_processed = processed_count.load(Ordering::SeqCst);
    let final_success = success_count.load(Ordering::SeqCst);
    let final_failed = failed_count.load(Ordering::SeqCst);
    logger::log_info(&format!(
        " Event enrichment summary: {} total, {} success, {} failed",
        final_processed, final_success, final_failed
    ))
    .await;

    if final_processed >= total_companies && !shutdown_flag.load(Ordering::SeqCst) {
        track_events_completion(&manager, paths, step_name).await?;
        logger::log_info(" ✓ Event enrichment marked as complete with integrity tracking").await;
    }

    Ok(final_success)
}
/// Records the events-enrichment step in the state manager.
///
/// A directory reference rooted at `paths.corporate_dir()` is built from two
/// pattern lists (per the naming used in this file: patterns to include and
/// patterns to exclude) and stored under `step_name` with `DataStage::Data`.
/// No explicit TTL is supplied (`None`), so whatever default the state manager
/// applies to that stage is used.
///
/// # Errors
/// Propagates any error returned by `StateManager::update_entry`.
async fn track_events_completion(
    manager: &StateManager,
    paths: &DataPaths,
    step_name: &str,
) -> anyhow::Result<()> {
    // Event data files; the second pattern is a more specific subset of the first.
    let include_patterns = vec![
        "*/events/*.jsonl".to_string(),
        "*/events/data.jsonl".to_string(),
    ];
    // Log, temp and backup files are left out of the reference.
    let exclude_patterns = vec![
        "*.log".to_string(),
        "*.tmp".to_string(),
        "*.bak".to_string(),
    ];

    let content_reference = directory_reference(
        paths.corporate_dir(),
        Some(include_patterns),
        Some(exclude_patterns),
    );

    manager
        .update_entry(
            step_name.to_string(),
            content_reference,
            DataStage::Data,
            None,
        )
        .await?;
    Ok(())
}
/// Fetches and stores Yahoo event data for one company.
///
/// Every module returned by `QuoteSummaryModule::event_modules()` is requested
/// on its own, and the modules of each successful response are merged into a
/// single map (a later response overwrites an earlier entry with the same key).
/// A failed request never aborts the loop: errors whose text contains `500`,
/// `404` or `Not Found` are skipped silently, any other error is logged as a
/// warning.
///
/// # Errors
/// - The company has no Yahoo ticker (`extract_first_yahoo_ticker` returned `None`).
/// - No module returned any data.
/// - Saving the combined summary failed.
async fn enrich_company_with_events(
    company: &CompanyData,
    yahoo_pool: &Arc<YahooClientPool>,
    paths: &DataPaths,
) -> anyhow::Result<()> {
    use std::collections::HashMap;

    let ticker = extract_first_yahoo_ticker(company)
        .ok_or_else(|| anyhow::anyhow!("No valid Yahoo ticker found"))?;

    let mut combined_modules: HashMap<String, serde_json::Value> = HashMap::new();
    // Taken before the requests start, so it marks the beginning of the fetch.
    let timestamp = chrono::Utc::now().timestamp();

    for module in QuoteSummaryModule::event_modules() {
        match yahoo_pool.get_quote_summary(&ticker, &[module]).await {
            Ok(summary) => combined_modules.extend(summary.modules),
            Err(e) => {
                let err_str = e.to_string();
                // These responses are treated as "module not available for this security".
                let expected = err_str.contains("500")
                    || err_str.contains("404")
                    || err_str.contains("Not Found");
                if !expected {
                    logger::log_warn(&format!(
                        " Unexpected error fetching event module for {}: {}",
                        ticker, e
                    ))
                    .await;
                }
            }
        }
    }

    // Nothing worth persisting if every module came back empty or failed.
    if combined_modules.is_empty() {
        return Err(anyhow::anyhow!("No event data available for any module"));
    }

    let combined_summary = crate::scraper::yahoo::QuoteSummary {
        symbol: ticker.clone(),
        modules: combined_modules,
        timestamp,
    };
    save_company_event_data(paths, &company.name, &combined_summary).await?;
    Ok(())
}
/// Save event data to company directory
async fn save_company_event_data(
paths: &DataPaths,
company_name: &str,
summary: &crate::scraper::yahoo::QuoteSummary,
) -> anyhow::Result<()> {
use tokio::fs;
let safe_name = sanitize_company_name(company_name);
let company_dir = paths.corporate_dir().join(&safe_name).join("events");
fs::create_dir_all(&company_dir).await?;
let data_path = company_dir.join("data.jsonl");
let json_line = serde_json::to_string(summary)?;
let mut file = fs::File::create(&data_path).await?;
file.write_all(json_line.as_bytes()).await?;
file.write_all(b"\n").await?;
file.flush().await?;
file.sync_all().await?; // Ensure data is persisted
Ok(())
}
// ============================================================================
// OPTION ENRICHMENT
// ============================================================================
/// Enriches every company listed in `companies_yahoo_cleaned.jsonl` with Yahoo option data.
///
/// # Flow
/// 1. Returns `Ok(0)` immediately when the input file does not exist.
/// 2. When the `yahoo_option_enrichment_complete` step is still valid, returns the
///    count reported by `checkpoint_helpers::count_enriched_companies`.
/// 3. Otherwise loads the company names already recorded in
///    `companies_option_updates.log`, skips those companies and processes the
///    remaining ones concurrently.
///
/// # Concurrency and persistence
/// - At most `CONCURRENCY_LIMIT` (50) tasks are in flight; a replacement task is
///   spawned each time one finishes.
/// - Task outcomes are appended to the log by a dedicated writer task
///   (see `spawn_log_writer`).
/// - A `LogCommand::Checkpoint` is sent after every `CHECKPOINT_INTERVAL` (50)
///   completed tasks of this run.
/// - A task whose join handle yields an error (for example after a panic) is
///   logged and counted as failed instead of being dropped silently, matching
///   the events enrichment.
/// - Once `shutdown_flag` is observed set after a task finishes, no further tasks
///   are spawned and the loop ends.
/// - The completion step is recorded only when the processed counter has reached
///   the number of input companies and no shutdown was requested.
///
/// # Returns
/// The final value of the success counter, which is initialised with the number
/// of companies already present in the log.
///
/// # Errors
/// Propagates failures from creating or querying the `StateManager`, loading the
/// progress log, loading the input companies and recording the completion step.
pub async fn enrich_companies_with_option(
    paths: &DataPaths,
    _config: &Config,
    yahoo_pool: Arc<YahooClientPool>,
    shutdown_flag: &Arc<AtomicBool>,
) -> anyhow::Result<usize> {
    const CHECKPOINT_INTERVAL: usize = 50;
    const FSYNC_BATCH_SIZE: usize = 10;
    const FSYNC_INTERVAL_SECS: u64 = 10;
    const CONCURRENCY_LIMIT: usize = 50;

    let data_dir = paths.data_dir();
    let input_path = data_dir.join("companies_yahoo_cleaned.jsonl");
    let log_path = data_dir.join("companies_option_updates.log");

    // Nothing to enrich without the cleaned company list.
    if !input_path.exists() {
        logger::log_warn(" companies_yahoo_cleaned.jsonl not found, skipping option enrichment").await;
        return Ok(0);
    }

    let manager = StateManager::new(paths.integrity_dir()).await?;
    let step_name = "yahoo_option_enrichment_complete";

    // A still-valid completion entry means there is nothing to do.
    if manager.is_step_valid(step_name).await? {
        logger::log_info(" Yahoo option enrichment already completed and valid").await;
        let count = checkpoint_helpers::count_enriched_companies(paths, "option").await?;
        logger::log_info(&format!(" ✓ Found {} companies with valid option data", count)).await;
        return Ok(count);
    }
    logger::log_info(" Option data needs refresh - starting enrichment").await;

    // --- Recovery: names already recorded in the progress log ---
    let done_names = checkpoint_helpers::load_enrichment_progress(&log_path).await?;
    let already_enriched = done_names.len();

    logger::log_info("Loading companies from companies_yahoo_cleaned.jsonl...").await;
    let all_companies = load_companies_from_jsonl(&input_path).await?;
    let total_companies = all_companies.len();
    logger::log_info(&format!("Found {} companies to process", total_companies)).await;

    let pending_companies: Vec<CompanyData> = all_companies
        .into_iter()
        .filter(|company| !done_names.contains(&company.name))
        .collect();
    let pending_count = pending_companies.len();
    logger::log_info(&format!(
        " {} already enriched, {} pending",
        already_enriched, pending_count
    ))
    .await;

    if pending_count == 0 {
        logger::log_info(" ✓ All companies already enriched").await;
        track_option_completion(&manager, paths, step_name).await?;
        return Ok(already_enriched);
    }

    // --- Processing: type-specific callback handed to the generic task spawner ---
    let enrichment_fn: EnrichmentFn = Arc::new(move |company, pool, paths| {
        Box::pin(async move { enrich_company_with_option(&company, &pool, &paths).await })
    });

    // Counters start at the number of companies recovered from the log.
    let processed_count = Arc::new(AtomicUsize::new(already_enriched));
    let success_count = Arc::new(AtomicUsize::new(already_enriched));
    let failed_count = Arc::new(AtomicUsize::new(0));

    let (log_tx, log_rx) = mpsc::channel::<LogCommand>(1000);
    let log_writer_handle = spawn_log_writer(
        log_path,
        log_rx,
        Arc::clone(&processed_count),
        total_companies,
        FSYNC_BATCH_SIZE,
        FSYNC_INTERVAL_SECS,
    );

    let mut tasks = FuturesUnordered::new();
    let mut pending_iter = pending_companies.into_iter();
    let semaphore = Arc::new(tokio::sync::Semaphore::new(CONCURRENCY_LIMIT));

    // Single place that knows how to start a task for one company.
    let spawn_for = |company: CompanyData| {
        spawn_enrichment_task(
            company,
            Arc::clone(&yahoo_pool),
            paths.clone(),
            Arc::clone(&processed_count),
            Arc::clone(&success_count),
            Arc::clone(&failed_count),
            log_tx.clone(),
            Arc::clone(&semaphore),
            Arc::clone(shutdown_flag),
            Arc::clone(&enrichment_fn),
        )
    };

    // Seed the pool with the first batch.
    for company in pending_iter.by_ref().take(CONCURRENCY_LIMIT) {
        tasks.push(spawn_for(company));
    }

    // Refill the pool as tasks complete.
    let mut checkpoint_counter = 0;
    while let Some(joined) = tasks.next().await {
        // A join error means the task did not run to completion (e.g. it panicked);
        // surface it and count it rather than discarding the result.
        if let Err(e) = joined {
            logger::log_warn(&format!("Task panicked: {}", e)).await;
            failed_count.fetch_add(1, Ordering::SeqCst);
        }

        if shutdown_flag.load(Ordering::SeqCst) {
            logger::log_warn("Shutdown signal received, stopping option enrichment").await;
            break;
        }

        checkpoint_counter += 1;
        if checkpoint_counter % CHECKPOINT_INTERVAL == 0 {
            let _ = log_tx.send(LogCommand::Checkpoint).await;
        }

        if let Some(company) = pending_iter.next() {
            tasks.push(spawn_for(company));
        }
    }

    // Stop the log writer and wait until it has finished.
    let _ = log_tx.send(LogCommand::Shutdown).await;
    drop(log_tx);
    let _ = log_writer_handle.await;

    let final_processed = processed_count.load(Ordering::SeqCst);
    let final_success = success_count.load(Ordering::SeqCst);
    let final_failed = failed_count.load(Ordering::SeqCst);
    logger::log_info(&format!(
        " Option enrichment summary: {} total, {} success, {} failed",
        final_processed, final_success, final_failed
    ))
    .await;

    if final_processed >= total_companies && !shutdown_flag.load(Ordering::SeqCst) {
        track_option_completion(&manager, paths, step_name).await?;
        logger::log_info(" ✓ Option enrichment marked as complete with integrity tracking").await;
    }

    Ok(final_success)
}
/// Records the option-enrichment step in the state manager.
///
/// A directory reference rooted at `paths.corporate_dir()` is built from two
/// pattern lists (per the naming used in this file: patterns to include and
/// patterns to exclude) and stored under `step_name` with `DataStage::Data`.
/// No explicit TTL is supplied (`None`), so whatever default the state manager
/// applies to that stage is used.
///
/// # Errors
/// Propagates any error returned by `StateManager::update_entry`.
async fn track_option_completion(
    manager: &StateManager,
    paths: &DataPaths,
    step_name: &str,
) -> anyhow::Result<()> {
    // Option data files; the second pattern is a more specific subset of the first.
    let include_patterns = vec![
        "*/option/*.jsonl".to_string(),
        "*/option/data.jsonl".to_string(),
    ];
    // Log, temp and backup files are left out of the reference.
    let exclude_patterns = vec![
        "*.log".to_string(),
        "*.tmp".to_string(),
        "*.bak".to_string(),
    ];

    let content_reference = directory_reference(
        paths.corporate_dir(),
        Some(include_patterns),
        Some(exclude_patterns),
    );

    manager
        .update_entry(
            step_name.to_string(),
            content_reference,
            DataStage::Data,
            None,
        )
        .await?;
    Ok(())
}
/// Fetches and stores Yahoo option data for one company.
///
/// # Errors
/// - The company has no Yahoo ticker (`extract_first_yahoo_ticker` returned `None`).
/// - The request to the Yahoo client pool failed.
/// - The response contained no option entries.
/// - Saving the data failed.
async fn enrich_company_with_option(
    company: &CompanyData,
    yahoo_pool: &Arc<YahooClientPool>,
    paths: &DataPaths,
) -> anyhow::Result<()> {
    let ticker = extract_first_yahoo_ticker(company)
        .ok_or_else(|| anyhow::anyhow!("No valid Yahoo ticker found"))?;

    // NOTE(review): `None` presumably requests every available expiration date —
    // confirm against `YahooClientPool::get_option_data`.
    let option_data = yahoo_pool.get_option_data(&ticker, None).await?;

    // An empty result is reported as a failure rather than written to disk.
    if option_data.option.is_empty() {
        return Err(anyhow::anyhow!("No option data available"));
    }

    save_company_data(paths, &company.name, &option_data, "option").await?;
    Ok(())
}
// ============================================================================
// CHART ENRICHMENT
// ============================================================================
/// Enriches every company listed in `companies_yahoo_cleaned.jsonl` with Yahoo chart data.
///
/// # Flow
/// 1. Returns `Ok(0)` immediately when the input file does not exist.
/// 2. When the `yahoo_chart_enrichment_complete` step is still valid, returns the
///    count reported by `checkpoint_helpers::count_enriched_companies`.
/// 3. Otherwise loads the company names already recorded in
///    `companies_chart_updates.log`, skips those companies and processes the
///    remaining ones concurrently.
///
/// # Concurrency and persistence
/// - At most `CONCURRENCY_LIMIT` (50) tasks are in flight; a replacement task is
///   spawned each time one finishes.
/// - Task outcomes are appended to the log by a dedicated writer task
///   (see `spawn_log_writer`).
/// - A `LogCommand::Checkpoint` is sent after every `CHECKPOINT_INTERVAL` (50)
///   completed tasks of this run.
/// - A task whose join handle yields an error (for example after a panic) is
///   logged and counted as failed instead of being dropped silently, matching
///   the events enrichment.
/// - Once `shutdown_flag` is observed set after a task finishes, no further tasks
///   are spawned and the loop ends.
/// - The completion step is recorded only when the processed counter has reached
///   the number of input companies and no shutdown was requested.
///
/// # Returns
/// The final value of the success counter, which is initialised with the number
/// of companies already present in the log.
///
/// # Errors
/// Propagates failures from creating or querying the `StateManager`, loading the
/// progress log, loading the input companies and recording the completion step.
pub async fn enrich_companies_with_chart(
    paths: &DataPaths,
    _config: &Config,
    yahoo_pool: Arc<YahooClientPool>,
    shutdown_flag: &Arc<AtomicBool>,
) -> anyhow::Result<usize> {
    const CHECKPOINT_INTERVAL: usize = 50;
    const FSYNC_BATCH_SIZE: usize = 10;
    const FSYNC_INTERVAL_SECS: u64 = 10;
    const CONCURRENCY_LIMIT: usize = 50;

    let data_dir = paths.data_dir();
    let input_path = data_dir.join("companies_yahoo_cleaned.jsonl");
    let log_path = data_dir.join("companies_chart_updates.log");

    // Nothing to enrich without the cleaned company list.
    if !input_path.exists() {
        logger::log_warn(" companies_yahoo_cleaned.jsonl not found, skipping chart enrichment").await;
        return Ok(0);
    }

    let manager = StateManager::new(paths.integrity_dir()).await?;
    let step_name = "yahoo_chart_enrichment_complete";

    // A still-valid completion entry means there is nothing to do.
    if manager.is_step_valid(step_name).await? {
        logger::log_info(" Yahoo chart enrichment already completed and valid").await;
        let count = checkpoint_helpers::count_enriched_companies(paths, "chart").await?;
        logger::log_info(&format!(" ✓ Found {} companies with valid chart data", count)).await;
        return Ok(count);
    }
    logger::log_info(" Chart data needs refresh - starting enrichment").await;

    // --- Recovery: names already recorded in the progress log ---
    let done_names = checkpoint_helpers::load_enrichment_progress(&log_path).await?;
    let already_enriched = done_names.len();

    logger::log_info("Loading companies from companies_yahoo_cleaned.jsonl...").await;
    let all_companies = load_companies_from_jsonl(&input_path).await?;
    let total_companies = all_companies.len();
    logger::log_info(&format!("Found {} companies to process", total_companies)).await;

    let pending_companies: Vec<CompanyData> = all_companies
        .into_iter()
        .filter(|company| !done_names.contains(&company.name))
        .collect();
    let pending_count = pending_companies.len();
    logger::log_info(&format!(
        " {} already enriched, {} pending",
        already_enriched, pending_count
    ))
    .await;

    if pending_count == 0 {
        logger::log_info(" ✓ All companies already enriched").await;
        track_chart_completion(&manager, paths, step_name).await?;
        return Ok(already_enriched);
    }

    // --- Processing: type-specific callback handed to the generic task spawner ---
    let enrichment_fn: EnrichmentFn = Arc::new(move |company, pool, paths| {
        Box::pin(async move { enrich_company_with_chart(&company, &pool, &paths).await })
    });

    // Counters start at the number of companies recovered from the log.
    let processed_count = Arc::new(AtomicUsize::new(already_enriched));
    let success_count = Arc::new(AtomicUsize::new(already_enriched));
    let failed_count = Arc::new(AtomicUsize::new(0));

    let (log_tx, log_rx) = mpsc::channel::<LogCommand>(1000);
    let log_writer_handle = spawn_log_writer(
        log_path,
        log_rx,
        Arc::clone(&processed_count),
        total_companies,
        FSYNC_BATCH_SIZE,
        FSYNC_INTERVAL_SECS,
    );

    let mut tasks = FuturesUnordered::new();
    let mut pending_iter = pending_companies.into_iter();
    let semaphore = Arc::new(tokio::sync::Semaphore::new(CONCURRENCY_LIMIT));

    // Single place that knows how to start a task for one company.
    let spawn_for = |company: CompanyData| {
        spawn_enrichment_task(
            company,
            Arc::clone(&yahoo_pool),
            paths.clone(),
            Arc::clone(&processed_count),
            Arc::clone(&success_count),
            Arc::clone(&failed_count),
            log_tx.clone(),
            Arc::clone(&semaphore),
            Arc::clone(shutdown_flag),
            Arc::clone(&enrichment_fn),
        )
    };

    // Seed the pool with the first batch.
    for company in pending_iter.by_ref().take(CONCURRENCY_LIMIT) {
        tasks.push(spawn_for(company));
    }

    // Refill the pool as tasks complete.
    let mut checkpoint_counter = 0;
    while let Some(joined) = tasks.next().await {
        // A join error means the task did not run to completion (e.g. it panicked);
        // surface it and count it rather than discarding the result.
        if let Err(e) = joined {
            logger::log_warn(&format!("Task panicked: {}", e)).await;
            failed_count.fetch_add(1, Ordering::SeqCst);
        }

        if shutdown_flag.load(Ordering::SeqCst) {
            logger::log_warn("Shutdown signal received, stopping chart enrichment").await;
            break;
        }

        checkpoint_counter += 1;
        if checkpoint_counter % CHECKPOINT_INTERVAL == 0 {
            let _ = log_tx.send(LogCommand::Checkpoint).await;
        }

        if let Some(company) = pending_iter.next() {
            tasks.push(spawn_for(company));
        }
    }

    // Stop the log writer and wait until it has finished.
    let _ = log_tx.send(LogCommand::Shutdown).await;
    drop(log_tx);
    let _ = log_writer_handle.await;

    let final_processed = processed_count.load(Ordering::SeqCst);
    let final_success = success_count.load(Ordering::SeqCst);
    let final_failed = failed_count.load(Ordering::SeqCst);
    logger::log_info(&format!(
        " Chart enrichment summary: {} total, {} success, {} failed",
        final_processed, final_success, final_failed
    ))
    .await;

    if final_processed >= total_companies && !shutdown_flag.load(Ordering::SeqCst) {
        track_chart_completion(&manager, paths, step_name).await?;
        logger::log_info(" ✓ Chart enrichment marked as complete with integrity tracking").await;
    }

    Ok(final_success)
}
/// Records the chart-enrichment step in the state manager.
///
/// A directory reference rooted at `paths.corporate_dir()` is built from two
/// pattern lists (per the naming used in this file: patterns to include and
/// patterns to exclude) and stored under `step_name` with `DataStage::Data`.
/// No explicit TTL is supplied (`None`), so whatever default the state manager
/// applies to that stage is used.
///
/// # Errors
/// Propagates any error returned by `StateManager::update_entry`.
async fn track_chart_completion(
    manager: &StateManager,
    paths: &DataPaths,
    step_name: &str,
) -> anyhow::Result<()> {
    // Chart data files; the second pattern is a more specific subset of the first.
    let include_patterns = vec![
        "*/chart/*.jsonl".to_string(),
        "*/chart/data.jsonl".to_string(),
    ];
    // Log, temp and backup files are left out of the reference.
    let exclude_patterns = vec![
        "*.log".to_string(),
        "*.tmp".to_string(),
        "*.bak".to_string(),
    ];

    let content_reference = directory_reference(
        paths.corporate_dir(),
        Some(include_patterns),
        Some(exclude_patterns),
    );

    manager
        .update_entry(
            step_name.to_string(),
            content_reference,
            DataStage::Data,
            None,
        )
        .await?;
    Ok(())
}
/// Fetches and stores Yahoo chart data for one company.
///
/// The requested range runs from 2000-01-01 00:00:00 UTC up to the current time
/// and is requested with the `"1d"` argument (presumably the bar interval —
/// confirm against `YahooClientPool::get_chart_data`).
///
/// # Errors
/// - The company has no Yahoo ticker (`extract_first_yahoo_ticker` returned `None`).
/// - The request to the Yahoo client pool failed.
/// - The response contained no quotes.
/// - Saving the data failed.
async fn enrich_company_with_chart(
    company: &CompanyData,
    yahoo_pool: &Arc<YahooClientPool>,
    paths: &DataPaths,
) -> anyhow::Result<()> {
    let ticker = extract_first_yahoo_ticker(company)
        .ok_or_else(|| anyhow::anyhow!("No valid Yahoo ticker found"))?;

    // Range end: now. Range start: fixed at the beginning of the year 2000 (UTC).
    let now = chrono::Utc::now().timestamp();
    let range_start = chrono::Utc.with_ymd_and_hms(2000, 1, 1, 0, 0, 0).unwrap();
    let start = range_start.timestamp();

    let chart_data = yahoo_pool.get_chart_data(&ticker, "1d", start, now).await?;

    // An empty result is reported as a failure rather than written to disk.
    if chart_data.quotes.is_empty() {
        return Err(anyhow::anyhow!("No chart data available"));
    }

    save_company_data(paths, &company.name, &chart_data, "chart").await?;
    Ok(())
}
/// Serializes `data` as a single JSON line and writes it to
/// `<corporate_dir>/<sanitized company name>/<data_type>/data.jsonl`.
///
/// The target directory is created if needed. An existing file is replaced
/// (`File::create` truncates), and the file is flushed and fsynced before the
/// function returns.
///
/// # Errors
/// Propagates directory-creation, serialization and file I/O errors.
async fn save_company_data<T: serde::Serialize>(
    paths: &DataPaths,
    company_name: &str,
    data: &T,
    data_type: &str,
) -> anyhow::Result<()> {
    use tokio::fs;

    let target_dir = paths
        .corporate_dir()
        .join(sanitize_company_name(company_name))
        .join(data_type);
    fs::create_dir_all(&target_dir).await?;

    // One JSON document followed by a newline terminator.
    let mut line = serde_json::to_string(data)?;
    line.push('\n');

    let mut file = fs::File::create(target_dir.join("data.jsonl")).await?;
    file.write_all(line.as_bytes()).await?;
    file.flush().await?;
    // Make sure the bytes have reached the disk before reporting success.
    file.sync_all().await?;
    Ok(())
}
// ============================================================================
// GENERIC SHARED FUNCTIONS
// ============================================================================
/// Spawn log writer task (shared across all enrichment types)
fn spawn_log_writer(
log_path: std::path::PathBuf,
mut log_rx: mpsc::Receiver<LogCommand>,
processed_count: Arc<AtomicUsize>,
total_companies: usize,
fsync_batch_size: usize,
fsync_interval_secs: u64,
) -> tokio::task::JoinHandle<()> {
tokio::spawn(async move {
let mut log_file = OpenOptions::new()
.create(true)
.append(true)
.open(&log_path)
.await
.expect("Failed to open log file");
let mut write_count = 0;
let mut last_fsync = tokio::time::Instant::now();
while let Some(cmd) = log_rx.recv().await {
match cmd {
LogCommand::Write(entry) => {
let json_line = serde_json::to_string(&entry).expect("Serialization failed");
log_file.write_all(json_line.as_bytes()).await.expect("Write failed");
log_file.write_all(b"\n").await.expect("Write failed");
write_count += 1;
// Batched fsync
if write_count >= fsync_batch_size
|| last_fsync.elapsed().as_secs() >= fsync_interval_secs
{
log_file.flush().await.expect("Flush failed");
log_file.sync_all().await.expect("Fsync failed");
write_count = 0;
last_fsync = tokio::time::Instant::now();
}
}
LogCommand::Checkpoint => {
// Force fsync on checkpoint
log_file.flush().await.expect("Flush failed");
log_file.sync_all().await.expect("Fsync failed");
write_count = 0;
last_fsync = tokio::time::Instant::now();
let current = processed_count.load(Ordering::SeqCst);
logger::log_info(&format!(
" Checkpoint: {}/{} companies processed",
current, total_companies
)).await;
}
LogCommand::Shutdown => {
// Final fsync before shutdown
log_file.flush().await.expect("Flush failed");
log_file.sync_all().await.expect("Fsync failed");
break;
}
}
}
})
}
/// Spawns one enrichment task for a single company (generic over the enrichment type).
///
/// The task:
/// 1. waits for a permit from `semaphore`;
/// 2. returns without touching any counter or the log when `shutdown_flag` is
///    already set;
/// 3. runs `enrichment_fn` for the company;
/// 4. increments `processed_count`, then either `success_count` or `failed_count`
///    (a failure is also logged as a warning);
/// 5. sends a `LogCommand::Write` entry with `company_name`, `status`
///    (`"enriched"` or `"failed"`) and an RFC 3339 `timestamp`. A failed send is
///    ignored.
///
/// Only the company name is cloned up front for the log entry; the company, the
/// pool handle and the paths are moved into `enrichment_fn` rather than copied.
///
/// # Parameters
/// - `company`: The company to enrich
/// - `yahoo_pool`: Yahoo API client pool
/// - `paths`: Data paths
/// - `processed_count`: Counter for processed companies
/// - `success_count`: Counter for successful enrichments
/// - `failed_count`: Counter for failed enrichments
/// - `log_tx`: Channel to send log commands
/// - `semaphore`: Semaphore for concurrency control
/// - `shutdown_flag`: Flag to signal shutdown
/// - `enrichment_fn`: The specific enrichment function to call (events, option, chart, etc.)
///
/// # Panics
/// The spawned task panics if the semaphore has been closed. The panic is
/// confined to that task and surfaces as an error on the returned join handle.
fn spawn_enrichment_task(
    company: CompanyData,
    yahoo_pool: Arc<YahooClientPool>,
    paths: DataPaths,
    processed_count: Arc<AtomicUsize>,
    success_count: Arc<AtomicUsize>,
    failed_count: Arc<AtomicUsize>,
    log_tx: mpsc::Sender<LogCommand>,
    semaphore: Arc<tokio::sync::Semaphore>,
    shutdown_flag: Arc<AtomicBool>,
    enrichment_fn: EnrichmentFn,
) -> tokio::task::JoinHandle<()> {
    tokio::spawn(async move {
        // Held until the task ends, bounding the number of concurrent enrichments.
        let _permit = semaphore.acquire().await.expect("Semaphore closed");

        // Skip the work entirely if shutdown was requested while waiting.
        if shutdown_flag.load(Ordering::SeqCst) {
            return;
        }

        // The name is all that is needed after the company has been handed over.
        let company_name = company.name.clone();
        let result = enrichment_fn(company, yahoo_pool, paths).await;

        processed_count.fetch_add(1, Ordering::SeqCst);
        let status = match result {
            Ok(_) => {
                success_count.fetch_add(1, Ordering::SeqCst);
                "enriched"
            }
            Err(e) => {
                failed_count.fetch_add(1, Ordering::SeqCst);
                logger::log_warn(&format!(
                    " Failed to enrich {}: {}",
                    company_name, e
                ))
                .await;
                "failed"
            }
        };

        let log_entry = json!({
            "company_name": company_name,
            "status": status,
            "timestamp": Utc::now().to_rfc3339(),
        });
        // The writer may already be gone (e.g. during shutdown); nothing more to do then.
        let _ = log_tx.send(LogCommand::Write(log_entry)).await;
    })
}