added integrity check to enrichment functions
@@ -1,8 +1,9 @@
-// src/corporate/update_companies_enrich_events.rs
+// src/corporate/update_companies_enrich_events.rs - WITH INTEGRITY MODULE
 use super::{types::*, helpers::*};
 use crate::config::Config;
 use crate::corporate::checkpoint_helpers;
 use crate::util::directories::DataPaths;
+use crate::util::integrity::{StateManager, directory_reference, DataStage};
 use crate::util::logger;
 use crate::scraper::yahoo::{YahooClientPool, QuoteSummaryModule};
 
@@ -25,7 +26,7 @@ use tokio::sync::mpsc;
 /// - Crash-safe persistence (checkpoint + log with fsync)
 /// - Smart skip logic (only process incomplete data)
 /// - Uses pending queue instead of retry mechanism
-/// - Reuses companies_update.log for persistence
+/// - Content integrity validation with hash tracking
 ///
 /// # Persistence Strategy
 /// - Checkpoint: companies_yahoo_cleaned.jsonl (atomic state)
@@ -33,6 +34,7 @@ use tokio::sync::mpsc;
 /// - On restart: Load checkpoint + replay log
 /// - Periodic checkpoints (every 50 companies)
 /// - Batched fsync (every 10 writes or 10 seconds)
+/// - Hash validation of all event data directories
 pub async fn enrich_companies_with_events(
     paths: &DataPaths,
     _config: &Config,
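The persistence strategy in the doc comment above (an atomic checkpoint plus an append-only log replayed on restart) is a standard crash-safety pattern. A minimal sketch of the recovery side, assuming one company ID per line in both files; the file layout and the function name are illustrative, not the actual checkpoint_helpers code:

// Sketch of checkpoint + replay-log recovery, under assumed file formats.
use std::collections::HashSet;

async fn recover_progress(
    checkpoint_path: &std::path::Path,
    log_path: &std::path::Path,
) -> anyhow::Result<HashSet<String>> {
    let mut done: HashSet<String> = HashSet::new();

    // 1. Load the last atomic checkpoint, if any (one ID per line).
    if checkpoint_path.exists() {
        let snapshot = tokio::fs::read_to_string(checkpoint_path).await?;
        done.extend(snapshot.lines().map(str::to_owned));
    }

    // 2. Replay the append-only log to pick up work finished after the
    //    checkpoint was written; duplicates are harmless in a set.
    if log_path.exists() {
        let log = tokio::fs::read_to_string(log_path).await?;
        done.extend(log.lines().filter(|l| !l.trim().is_empty()).map(str::to_owned));
    }

    Ok(done)
}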
@@ -43,7 +45,7 @@ pub async fn enrich_companies_with_events(
     const CHECKPOINT_INTERVAL: usize = 50;
     const FSYNC_BATCH_SIZE: usize = 10;
     const FSYNC_INTERVAL_SECS: u64 = 10;
-    const CONCURRENCY_LIMIT: usize = 50; // Limit parallel enrichment tasks
+    const CONCURRENCY_LIMIT: usize = 50;
 
     let data_path = paths.data_dir();
 
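FSYNC_BATCH_SIZE and FSYNC_INTERVAL_SECS drive the batched fsync mentioned in the doc comment: writes are appended immediately, but the file is synced only after 10 writes or 10 seconds, whichever comes first. A sketch of that pattern with tokio::select!; the mpsc-of-String protocol is an assumption, not the actual writer task in this file:

// Sketch of a batched-fsync writer: flush after N writes or T seconds.
use std::time::Duration;
use tokio::io::AsyncWriteExt;
use tokio::sync::mpsc;

async fn log_writer(
    mut rx: mpsc::Receiver<String>,
    mut file: tokio::fs::File,
) -> anyhow::Result<()> {
    const FSYNC_BATCH_SIZE: usize = 10;
    const FSYNC_INTERVAL_SECS: u64 = 10;

    let mut unsynced = 0usize;
    let mut tick = tokio::time::interval(Duration::from_secs(FSYNC_INTERVAL_SECS));

    loop {
        tokio::select! {
            maybe_line = rx.recv() => match maybe_line {
                Some(line) => {
                    file.write_all(line.as_bytes()).await?;
                    file.write_all(b"\n").await?;
                    unsynced += 1;
                    if unsynced >= FSYNC_BATCH_SIZE {
                        file.sync_data().await?; // count-based fsync
                        unsynced = 0;
                    }
                }
                None => { // channel closed: final fsync, then exit
                    file.sync_data().await?;
                    return Ok(());
                }
            },
            _ = tick.tick() => if unsynced > 0 {
                file.sync_data().await?; // time-based fsync
                unsynced = 0;
            },
        }
    }
}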
@@ -57,29 +59,21 @@ pub async fn enrich_companies_with_events(
         logger::log_warn(" companies_yahoo_cleaned.jsonl not found, skipping event enrichment").await;
         return Ok(0);
     }
 
     // Check if already completed
     if state_path.exists() {
-        let state_content = tokio::fs::read_to_string(&state_path).await?;
+        let manager = StateManager::new(&state_path, &data_path.to_path_buf());
+        let step_name = "yahoo_events_enrichment_complete";
 
-        for line in state_content.lines() {
-            if line.trim().is_empty() {
-                continue;
-            }
+        if manager.is_step_valid(step_name).await? {
+            logger::log_info(" Yahoo events enrichment already completed and valid").await;
 
-            if let Ok(state) = serde_json::from_str::<serde_json::Value>(line) {
-                if state.get("yahoo_events_enrichment_complete").and_then(|v| v.as_bool()).unwrap_or(false) {
-                    logger::log_info(" Yahoo events enrichment already completed").await;
-
-                    // Count enriched companies
-                    let count = checkpoint_helpers::count_enriched_companies(paths, "events").await?;
-                    logger::log_info(&format!(" ✓ Found {} companies with event data", count)).await;
-                    return Ok(count);
-                }
-            }
-        }
+            // Count enriched companies
+            let count = checkpoint_helpers::count_enriched_companies(paths, "events").await?;
+            logger::log_info(&format!(" ✓ Found {} companies with valid event data", count)).await;
+            return Ok(count);
+        }
+
+        logger::log_info(" Event data needs refresh - starting enrichment").await;
     }
 
     // === RECOVERY PHASE: Track enriched companies ===
     let enriched_companies: HashSet<String> = checkpoint_helpers::load_enrichment_progress(&log_path).await?;
@@ -104,7 +98,9 @@ pub async fn enrich_companies_with_events(
 
     if pending_count == 0 {
         logger::log_info(" ✓ All companies already enriched").await;
         checkpoint_helpers::mark_step_complete(&state_path, "yahoo_events_enrichment_complete").await?;
+
+        track_events_completion(&manager, paths, step_name).await?;
         return Ok(enriched_companies.len());
     }
 
@@ -263,13 +259,49 @@ pub async fn enrich_companies_with_events(
 
     // Mark as complete if all companies processed
     if final_processed >= total_companies && !shutdown_flag.load(Ordering::SeqCst) {
         checkpoint_helpers::mark_step_complete(&state_path, "yahoo_events_enrichment_complete").await?;
-        logger::log_info(" ✓ Event enrichment marked as complete").await;
+        track_events_completion(&manager, paths, step_name).await?;
+        logger::log_info(" ✓ Event enrichment marked as complete with integrity tracking").await;
     }
 
     Ok(final_success)
 }
+
+/// Track event enrichment completion with content hash verification
+async fn track_events_completion(
+    manager: &StateManager,
+    paths: &DataPaths,
+    step_name: &str,
+) -> anyhow::Result<()> {
+    // Create content reference for all event data
+    // This will hash ALL files matching the pattern: {company}/events/data.jsonl
+    let content_reference = directory_reference(
+        paths.corporate_dir(),
+        Some(vec![
+            "*/events/*.jsonl".to_string(),    // Main pattern for events data
+            "*/events/data.jsonl".to_string(), // Specific pattern (more precise)
+        ]),
+        Some(vec![
+            "*.log".to_string(), // Exclude log files
+            "*.tmp".to_string(), // Exclude temp files
+            "*.bak".to_string(), // Exclude backup files
+        ]),
+    );
+
+    // Track completion with:
+    // - Content reference: All event directories
+    // - Data stage: Data (7-day TTL by default)
+    // - Dependencies: Depends on cleaned companies data
+    manager.update_entry(
+        step_name.to_string(),
+        content_reference,
+        DataStage::Data,
+        vec!["yahoo_companies_cleansed".to_string()], // Dependency
+        None, // Use default TTL (7 days for Data stage)
+    ).await?;
+
+    Ok(())
+}
 
 /// Spawn a single enrichment task with panic isolation
 fn spawn_enrichment_task(
     company: CompanyCrossPlatformInfo,
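Taken together, the integrity flow this commit adds is: hash the produced files (directory_reference with include/exclude globs), record the step with update_entry (stage, dependencies, TTL), and let later runs short-circuit through is_step_valid. A condensed sketch of that round trip, using only the calls visible in the hunks above; the state-file path, directory layout, and exact argument types are inferred from this diff and may not match the crate exactly:

// Condensed view of the integrity round trip (paths and names illustrative).
use crate::util::integrity::{StateManager, directory_reference, DataStage};

async fn integrity_round_trip(data_dir: &std::path::Path) -> anyhow::Result<()> {
    let state_path = data_dir.join("state.jsonl");
    let manager = StateManager::new(&state_path, &data_dir.to_path_buf());

    // Writer side: hash the matching files and record the step.
    let reference = directory_reference(
        data_dir.join("corporate"),
        Some(vec!["*/events/*.jsonl".to_string()]), // include globs
        Some(vec!["*.tmp".to_string()]),            // exclude globs
    );
    manager.update_entry(
        "yahoo_events_enrichment_complete".to_string(),
        reference,
        DataStage::Data,                              // 7-day default TTL
        vec!["yahoo_companies_cleansed".to_string()], // upstream dependency
        None,                                         // keep the default TTL
    ).await?;

    // Reader side: later runs skip the work while hashes and TTL still hold.
    if manager.is_step_valid("yahoo_events_enrichment_complete").await? {
        // safe to reuse the enriched data
    }
    Ok(())
}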