added integrity check to cleanse functions
This commit is contained in:
@@ -3,6 +3,7 @@ use super::{helpers::*, types::*};
|
||||
use crate::config::Config;
|
||||
use crate::corporate::checkpoint_helpers;
|
||||
use crate::util::directories::DataPaths;
|
||||
use crate::util::integrity::{DataStage, StateManager, file_reference};
|
||||
use crate::util::logger;
|
||||
use crate::scraper::yahoo::{YahooClientPool, QuoteSummaryModule};
|
||||
|
||||
@@ -14,7 +15,6 @@ use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
|
||||
use tokio::fs::{File, OpenOptions};
|
||||
use tokio::io::{AsyncBufReadExt, AsyncWriteExt, BufReader};
|
||||
use futures::stream::{FuturesUnordered, StreamExt};
|
||||
use serde_json::json;
|
||||
use tokio::sync::mpsc;
|
||||
|
||||
/// Result of processing a single company
|
||||
@@ -33,11 +33,6 @@ enum LogCommand {
|
||||
Shutdown,
|
||||
}
|
||||
|
||||
/// Result from processing a single company with priority
|
||||
struct CompanyTaskResult {
|
||||
company: CompanyCrossPlatformInfo,
|
||||
result: CompanyProcessResult,
|
||||
}
|
||||
|
||||
/// Cleansing function to remove companies with missing essential yahoo data for integrity
|
||||
pub async fn companies_yahoo_cleansed_no_data(paths: &DataPaths) -> Result<usize, anyhow::Error> {
|
||||
@@ -51,36 +46,23 @@ pub async fn companies_yahoo_cleansed_no_data(paths: &DataPaths) -> Result<usize
|
||||
logger::log_warn("companies.jsonl not found, skipping cleansing").await;
|
||||
return Ok(0);
|
||||
}
|
||||
|
||||
let manager = StateManager::new(&state_path, &data_path.to_path_buf());
|
||||
let step_name = "yahoo_companies_cleansed_no_data";
|
||||
let content_reference = file_reference(&output_path);
|
||||
|
||||
if state_path.exists() {
|
||||
let state_content = tokio::fs::read_to_string(&state_path).await?;
|
||||
if manager.is_step_valid(step_name).await? {
|
||||
let output_content = tokio::fs::read_to_string(&output_path).await?;
|
||||
let count = output_content.lines()
|
||||
.filter(|line| !line.trim().is_empty())
|
||||
.count();
|
||||
|
||||
for line in state_content.lines() {
|
||||
if line.trim().is_empty() {
|
||||
continue;
|
||||
}
|
||||
|
||||
if let Ok(state) = serde_json::from_str::<serde_json::Value>(line) {
|
||||
if state.get("yahoo_companies_cleansed_no_data").and_then(|v| v.as_bool()).unwrap_or(false) {
|
||||
logger::log_info(" Yahoo companies cleansing already completed, reading existing file...").await;
|
||||
|
||||
if output_path.exists() {
|
||||
let output_content = tokio::fs::read_to_string(&output_path).await?;
|
||||
let count = output_content.lines()
|
||||
.filter(|line| !line.trim().is_empty())
|
||||
.count();
|
||||
|
||||
logger::log_info(&format!(" ✓ Found {} companies in companies_yahoo.jsonl", count)).await;
|
||||
return Ok(count);
|
||||
} else {
|
||||
logger::log_warn(" State indicates completion but companies_yahoo.jsonl not found, re-running...").await;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
logger::log_info(&format!(" ✓ Found {} companies in companies_yahoo.jsonl", count)).await;
|
||||
return Ok(count);
|
||||
}
|
||||
|
||||
logger::log_info(" Cleansing companies with missing Yahoo data...").await;
|
||||
|
||||
logger::log_info(&format!(" Reading from: {:?}", input_path)).await;
|
||||
logger::log_info(&format!(" Writing to: {:?}", output_path)).await;
|
||||
|
||||
@@ -141,18 +123,17 @@ pub async fn companies_yahoo_cleansed_no_data(paths: &DataPaths) -> Result<usize
|
||||
total_count, valid_count, removed_count
|
||||
)).await;
|
||||
|
||||
let yahoo_companies = json!({
|
||||
"yahoo_companies_cleansed_no_data": true,
|
||||
"completed_at": chrono::Utc::now().to_rfc3339(),
|
||||
});
|
||||
|
||||
let mut state_file = File::create(&state_path).await?;
|
||||
let state_line = serde_json::to_string(&yahoo_companies)?;
|
||||
state_file.write_all(state_line.as_bytes()).await?;
|
||||
state_file.write_all(b"\n").await?;
|
||||
state_file.flush().await?;
|
||||
|
||||
logger::log_info(&format!(" ✓ State file created at: {:?}", state_path)).await;
|
||||
// Track completion with:
|
||||
// - Content reference: All event directories
|
||||
// - Data stage: Data (7-day TTL by default)
|
||||
// - Dependencies: Depends on cleaned companies data
|
||||
manager.update_entry(
|
||||
step_name.to_string(),
|
||||
content_reference,
|
||||
DataStage::Data,
|
||||
vec!["yahoo_companies_cleansed".to_string()], // Dependency
|
||||
None, // Use default TTL (7 days for Data stage)
|
||||
).await?;
|
||||
|
||||
Ok(valid_count)
|
||||
}
|
||||
@@ -199,6 +180,22 @@ pub async fn companies_yahoo_cleansed_low_profile(
|
||||
return Ok(0);
|
||||
}
|
||||
|
||||
let manager = StateManager::new(&state_path, &data_path.to_path_buf());
|
||||
let step_name = "yahoo_companies_cleansed_no_data";
|
||||
let content_reference = file_reference(&checkpoint_path);
|
||||
|
||||
if manager.is_step_valid(step_name).await? {
|
||||
let checkpoint_content = tokio::fs::read_to_string(&checkpoint_path).await?;
|
||||
let count = checkpoint_content.lines()
|
||||
.filter(|line| !line.trim().is_empty())
|
||||
.count();
|
||||
|
||||
logger::log_info(&format!(" ✓ Found {} companies in companies_yahoo_cleaned.jsonl", count)).await;
|
||||
return Ok(count);
|
||||
}
|
||||
|
||||
logger::log_info(" Cleansing companies with low Yahoo profile...").await;
|
||||
|
||||
if state_path.exists() {
|
||||
let state_content = tokio::fs::read_to_string(&state_path).await?;
|
||||
|
||||
@@ -505,7 +502,7 @@ pub async fn companies_yahoo_cleansed_low_profile(
|
||||
}
|
||||
|
||||
match task_result {
|
||||
Ok(Ok(Some(_result))) => {
|
||||
Ok(Ok(_)) => {
|
||||
// Success - spawn next task
|
||||
if let Some(company) = pending.pop() {
|
||||
spawn_validation_task(
|
||||
@@ -524,25 +521,6 @@ pub async fn companies_yahoo_cleansed_low_profile(
|
||||
);
|
||||
}
|
||||
}
|
||||
Ok(Ok(None)) => {
|
||||
// Filtered or failed - spawn next task
|
||||
if let Some(company) = pending.pop() {
|
||||
spawn_validation_task(
|
||||
company,
|
||||
&yahoo_pool,
|
||||
&paths,
|
||||
&write_tx,
|
||||
shutdown_flag,
|
||||
&processed,
|
||||
&valid_count,
|
||||
&filtered_low_cap,
|
||||
&filtered_no_price,
|
||||
&failed_count,
|
||||
total,
|
||||
&mut tasks,
|
||||
);
|
||||
}
|
||||
}
|
||||
Ok(Err(e)) => {
|
||||
// Processing error
|
||||
logger::log_error(&format!("Company processing error: {}", e)).await;
|
||||
@@ -646,24 +624,17 @@ pub async fn companies_yahoo_cleansed_low_profile(
|
||||
// Shutdown Yahoo pool
|
||||
yahoo_pool.shutdown().await?;
|
||||
|
||||
// Write completion milestone to state.jsonl
|
||||
let state_path = data_path.join("state.jsonl");
|
||||
let yahoo_low_profile = json!({
|
||||
"yahoo_companies_cleansed_low_profile": true,
|
||||
"completed_at": chrono::Utc::now().to_rfc3339(),
|
||||
});
|
||||
|
||||
let mut state_file = OpenOptions::new()
|
||||
.create(true)
|
||||
.append(true)
|
||||
.open(&state_path)
|
||||
.await?;
|
||||
let state_line = serde_json::to_string(&yahoo_low_profile)?;
|
||||
state_file.write_all(state_line.as_bytes()).await?;
|
||||
state_file.write_all(b"\n").await?;
|
||||
state_file.flush().await?;
|
||||
|
||||
logger::log_info(&format!(" ✓ State milestone saved to: {:?}", state_path)).await;
|
||||
// Track completion with:
|
||||
// - Content reference: All event directories
|
||||
// - Data stage: Data (7-day TTL by default)
|
||||
// - Dependencies: Depends on cleaned companies data
|
||||
manager.update_entry(
|
||||
step_name.to_string(),
|
||||
content_reference,
|
||||
DataStage::Data,
|
||||
vec!["yahoo_companies_cleansed".to_string()], // Dependency
|
||||
None, // Use default TTL (7 days for Data stage)
|
||||
).await?;
|
||||
|
||||
Ok(final_count)
|
||||
}
|
||||
@@ -681,7 +652,7 @@ fn spawn_validation_task(
|
||||
filtered_no_price: &Arc<AtomicUsize>,
|
||||
failed_count: &Arc<AtomicUsize>,
|
||||
total: usize,
|
||||
tasks: &mut FuturesUnordered<tokio::task::JoinHandle<anyhow::Result<Option<CompanyTaskResult>>>>,
|
||||
tasks: &mut FuturesUnordered<tokio::task::JoinHandle<anyhow::Result<Option<()>>>>,
|
||||
) {
|
||||
let yahoo_pool_clone = Arc::clone(yahoo_pool);
|
||||
let paths_clone = Arc::clone(paths);
|
||||
@@ -705,36 +676,29 @@ fn spawn_validation_task(
|
||||
&*paths_clone,
|
||||
).await;
|
||||
|
||||
let task_result = match result {
|
||||
match result {
|
||||
CompanyProcessResult::Valid(validated_company) => {
|
||||
// Send to writer
|
||||
let _ = write_tx_clone.send(LogCommand::Write(validated_company.clone())).await;
|
||||
let _ = write_tx_clone.send(LogCommand::Write(validated_company)).await;
|
||||
valid_count_clone.fetch_add(1, Ordering::SeqCst);
|
||||
Some(CompanyTaskResult {
|
||||
company: validated_company.clone(),
|
||||
result: CompanyProcessResult::Valid(validated_company),
|
||||
})
|
||||
}
|
||||
CompanyProcessResult::FilteredLowCap { name, market_cap } => {
|
||||
filtered_low_cap_clone.fetch_add(1, Ordering::SeqCst);
|
||||
if filtered_low_cap_clone.load(Ordering::SeqCst) <= 10 {
|
||||
logger::log_info(&format!(" Filtered {} - low market cap: {:.0} EUR", name, market_cap)).await;
|
||||
}
|
||||
None
|
||||
}
|
||||
CompanyProcessResult::FilteredNoPrice { name } => {
|
||||
filtered_no_price_clone.fetch_add(1, Ordering::SeqCst);
|
||||
if filtered_no_price_clone.load(Ordering::SeqCst) <= 10 {
|
||||
logger::log_info(&format!(" Filtered {} - no recent price data", name)).await;
|
||||
}
|
||||
None
|
||||
}
|
||||
CompanyProcessResult::Failed { company: failed_company, error, is_transient: _ } => {
|
||||
failed_count_clone.fetch_add(1, Ordering::SeqCst);
|
||||
logger::log_warn(&format!(" Failed to process '{}': {}", failed_company.name, error)).await;
|
||||
None
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
// Progress reporting
|
||||
let current = processed_clone.fetch_add(1, Ordering::SeqCst) + 1;
|
||||
@@ -749,7 +713,7 @@ fn spawn_validation_task(
|
||||
)).await;
|
||||
}
|
||||
|
||||
Ok(task_result)
|
||||
Ok(None::<()>)
|
||||
});
|
||||
|
||||
tasks.push(task);
|
||||
|
||||
Reference in New Issue
Block a user