added integrity check to enrichment functions

This commit is contained in:
2026-01-10 17:40:16 +01:00
parent 151c96e35f
commit 766eb803f1
9 changed files with 942 additions and 105 deletions

View File

@@ -1,8 +1,9 @@
// src/corporate/update_companies_enrich_events.rs
// src/corporate/update_companies_enrich_events.rs - WITH INTEGRITY MODULE
use super::{types::*, helpers::*};
use crate::config::Config;
use crate::corporate::checkpoint_helpers;
use crate::util::directories::DataPaths;
use crate::util::integrity::{StateManager, directory_reference, DataStage};
use crate::util::logger;
use crate::scraper::yahoo::{YahooClientPool, QuoteSummaryModule};
@@ -25,7 +26,7 @@ use tokio::sync::mpsc;
/// - Crash-safe persistence (checkpoint + log with fsync)
/// - Smart skip logic (only process incomplete data)
/// - Uses pending queue instead of retry mechanism
/// - Reuses companies_update.log for persistence
/// - Content integrity validation with hash tracking
///
/// # Persistence Strategy
/// - Checkpoint: companies_yahoo_cleaned.jsonl (atomic state)
@@ -33,6 +34,7 @@ use tokio::sync::mpsc;
/// - On restart: Load checkpoint + replay log
/// - Periodic checkpoints (every 50 companies)
/// - Batched fsync (every 10 writes or 10 seconds)
/// - Hash validation of all event data directories
pub async fn enrich_companies_with_events(
paths: &DataPaths,
_config: &Config,
@@ -43,7 +45,7 @@ pub async fn enrich_companies_with_events(
const CHECKPOINT_INTERVAL: usize = 50;
const FSYNC_BATCH_SIZE: usize = 10;
const FSYNC_INTERVAL_SECS: u64 = 10;
const CONCURRENCY_LIMIT: usize = 50; // Limit parallel enrichment tasks
const CONCURRENCY_LIMIT: usize = 50;
let data_path = paths.data_dir();
@@ -57,29 +59,21 @@ pub async fn enrich_companies_with_events(
logger::log_warn(" companies_yahoo_cleaned.jsonl not found, skipping event enrichment").await;
return Ok(0);
}
// Check if already completed
if state_path.exists() {
let state_content = tokio::fs::read_to_string(&state_path).await?;
let manager = StateManager::new(&state_path, &data_path.to_path_buf());
let step_name = "yahoo_events_enrichment_complete";
if manager.is_step_valid(step_name).await? {
logger::log_info(" Yahoo events enrichment already completed and valid").await;
for line in state_content.lines() {
if line.trim().is_empty() {
continue;
}
if let Ok(state) = serde_json::from_str::<serde_json::Value>(line) {
if state.get("yahoo_events_enrichment_complete").and_then(|v| v.as_bool()).unwrap_or(false) {
logger::log_info(" Yahoo events enrichment already completed").await;
// Count enriched companies
let count = checkpoint_helpers::count_enriched_companies(paths, "events").await?;
logger::log_info(&format!(" ✓ Found {} companies with event data", count)).await;
return Ok(count);
}
}
}
// Count enriched companies
let count = checkpoint_helpers::count_enriched_companies(paths, "events").await?;
logger::log_info(&format!(" ✓ Found {} companies with valid event data", count)).await;
return Ok(count);
}
logger::log_info(" Event data needs refresh - starting enrichment").await;
// === RECOVERY PHASE: Track enriched companies ===
let enriched_companies: HashSet<String> = checkpoint_helpers::load_enrichment_progress(&log_path).await?;
@@ -104,7 +98,9 @@ pub async fn enrich_companies_with_events(
if pending_count == 0 {
logger::log_info(" ✓ All companies already enriched").await;
checkpoint_helpers::mark_step_complete(&state_path, "yahoo_events_enrichment_complete").await?;
track_events_completion(&manager, paths, step_name).await?;
return Ok(enriched_companies.len());
}
@@ -263,13 +259,49 @@ pub async fn enrich_companies_with_events(
// Mark as complete if all companies processed
if final_processed >= total_companies && !shutdown_flag.load(Ordering::SeqCst) {
checkpoint_helpers::mark_step_complete(&state_path, "yahoo_events_enrichment_complete").await?;
logger::log_info(" ✓ Event enrichment marked as complete").await;
track_events_completion(&manager, paths, step_name).await?;
logger::log_info(" ✓ Event enrichment marked as complete with integrity tracking").await;
}
Ok(final_success)
}
/// Records the event-enrichment step as complete, attaching a content
/// reference over the event data files so the step can be validated later.
///
/// # Arguments
/// - `manager`: state manager that stores the step entry.
/// - `paths`: supplies the corporate data directory the reference is built on.
/// - `step_name`: name under which the completed step is recorded.
///
/// # Errors
/// Propagates any error returned by `StateManager::update_entry`.
async fn track_events_completion(
    manager: &StateManager,
    paths: &DataPaths,
    step_name: &str,
) -> anyhow::Result<()> {
    let corporate_root = paths.corporate_dir();

    // Files that take part in the content reference: every `.jsonl` file in a
    // per-company `events` directory, with `data.jsonl` also listed explicitly.
    let include_patterns: Vec<String> = ["*/events/*.jsonl", "*/events/data.jsonl"]
        .iter()
        .map(|pattern| pattern.to_string())
        .collect();

    // Files left out of the reference: logs, temp files and backups.
    let exclude_patterns: Vec<String> = ["*.log", "*.tmp", "*.bak"]
        .iter()
        .map(|pattern| pattern.to_string())
        .collect();

    let events_reference = directory_reference(
        corporate_root,
        Some(include_patterns),
        Some(exclude_patterns),
    );

    // This step is registered as depending on the cleaned companies step.
    let depends_on = vec!["yahoo_companies_cleansed".to_string()];

    // Passing `None` as the last argument keeps the default TTL for the stage.
    // NOTE(review): the original comment states 7 days for `DataStage::Data` —
    // confirm against `util::integrity`.
    manager
        .update_entry(
            step_name.to_string(),
            events_reference,
            DataStage::Data,
            depends_on,
            None,
        )
        .await?;

    Ok(())
}
/// Spawn a single enrichment task with panic isolation
fn spawn_enrichment_task(
company: CompanyCrossPlatformInfo,