This commit is contained in:
2026-01-12 01:01:19 +01:00
parent bd74f36f4c
commit 659757482d
13 changed files with 526 additions and 93 deletions

View File

@@ -40,14 +40,13 @@ pub async fn companies_yahoo_cleansed_no_data(paths: &DataPaths) -> Result<usize
let input_path = data_path.join("companies.jsonl");
let output_path = data_path.join("companies_yahoo.jsonl");
let state_path = data_path.join("state.jsonl");
if !input_path.exists() {
logger::log_warn("companies.jsonl not found, skipping cleansing").await;
return Ok(0);
}
let manager = StateManager::new(paths.integrity_dir())?;
let manager = StateManager::new(paths.integrity_dir()).await?;
let step_name = "yahoo_companies_cleansed_no_data";
let content_reference = file_reference(&output_path);
@@ -171,7 +170,6 @@ pub async fn companies_yahoo_cleansed_low_profile(
let input_path = data_path.join("companies_yahoo.jsonl");
let checkpoint_path = data_path.join("companies_yahoo_cleaned.jsonl");
let log_path = data_path.join("companies_updates.log");
let state_path = data_path.join("state.jsonl");
// Check input exists
if !input_path.exists() {
@@ -179,7 +177,7 @@ pub async fn companies_yahoo_cleansed_low_profile(
return Ok(0);
}
let manager = StateManager::new(paths.integrity_dir())?;
let manager = StateManager::new(paths.integrity_dir()).await?;
let step_name = "yahoo_companies_cleansed_no_data";
let content_reference = file_reference(&checkpoint_path);
@@ -194,35 +192,6 @@ pub async fn companies_yahoo_cleansed_low_profile(
}
logger::log_info(" Cleansing companies with low Yahoo profile...").await;
if state_path.exists() {
let state_content = tokio::fs::read_to_string(&state_path).await?;
for line in state_content.lines() {
if line.trim().is_empty() {
continue;
}
if let Ok(state) = serde_json::from_str::<serde_json::Value>(line) {
if state.get("yahoo_companies_cleansed_low_profile").and_then(|v| v.as_bool()).unwrap_or(false) {
logger::log_info(" Yahoo low profile cleansing already completed, reading existing file...").await;
if checkpoint_path.exists() {
let checkpoint_content = tokio::fs::read_to_string(&checkpoint_path).await?;
let count = checkpoint_content.lines()
.filter(|line| !line.trim().is_empty())
.count();
logger::log_info(&format!(" ✓ Found {} companies in companies_yahoo_cleaned.jsonl", count)).await;
return Ok(count);
} else {
logger::log_warn(" State indicates completion but companies_yahoo_cleaned.jsonl not found, re-running...").await;
break;
}
}
}
}
}
// === RECOVERY PHASE: Load checkpoint + replay log ===
let mut existing_companies: HashMap<String, CompanyCrossPlatformInfo> = HashMap::new();