öi
This commit is contained in:
@@ -40,14 +40,13 @@ pub async fn companies_yahoo_cleansed_no_data(paths: &DataPaths) -> Result<usize
|
||||
|
||||
let input_path = data_path.join("companies.jsonl");
|
||||
let output_path = data_path.join("companies_yahoo.jsonl");
|
||||
let state_path = data_path.join("state.jsonl");
|
||||
|
||||
if !input_path.exists() {
|
||||
logger::log_warn("companies.jsonl not found, skipping cleansing").await;
|
||||
return Ok(0);
|
||||
}
|
||||
|
||||
let manager = StateManager::new(paths.integrity_dir())?;
|
||||
let manager = StateManager::new(paths.integrity_dir()).await?;
|
||||
let step_name = "yahoo_companies_cleansed_no_data";
|
||||
let content_reference = file_reference(&output_path);
|
||||
|
||||
@@ -171,7 +170,6 @@ pub async fn companies_yahoo_cleansed_low_profile(
|
||||
let input_path = data_path.join("companies_yahoo.jsonl");
|
||||
let checkpoint_path = data_path.join("companies_yahoo_cleaned.jsonl");
|
||||
let log_path = data_path.join("companies_updates.log");
|
||||
let state_path = data_path.join("state.jsonl");
|
||||
|
||||
// Check input exists
|
||||
if !input_path.exists() {
|
||||
@@ -179,7 +177,7 @@ pub async fn companies_yahoo_cleansed_low_profile(
|
||||
return Ok(0);
|
||||
}
|
||||
|
||||
let manager = StateManager::new(paths.integrity_dir())?;
|
||||
let manager = StateManager::new(paths.integrity_dir()).await?;
|
||||
let step_name = "yahoo_companies_cleansed_no_data";
|
||||
let content_reference = file_reference(&checkpoint_path);
|
||||
|
||||
@@ -194,35 +192,6 @@ pub async fn companies_yahoo_cleansed_low_profile(
|
||||
}
|
||||
|
||||
logger::log_info(" Cleansing companies with low Yahoo profile...").await;
|
||||
|
||||
if state_path.exists() {
|
||||
let state_content = tokio::fs::read_to_string(&state_path).await?;
|
||||
|
||||
for line in state_content.lines() {
|
||||
if line.trim().is_empty() {
|
||||
continue;
|
||||
}
|
||||
|
||||
if let Ok(state) = serde_json::from_str::<serde_json::Value>(line) {
|
||||
if state.get("yahoo_companies_cleansed_low_profile").and_then(|v| v.as_bool()).unwrap_or(false) {
|
||||
logger::log_info(" Yahoo low profile cleansing already completed, reading existing file...").await;
|
||||
|
||||
if checkpoint_path.exists() {
|
||||
let checkpoint_content = tokio::fs::read_to_string(&checkpoint_path).await?;
|
||||
let count = checkpoint_content.lines()
|
||||
.filter(|line| !line.trim().is_empty())
|
||||
.count();
|
||||
|
||||
logger::log_info(&format!(" ✓ Found {} companies in companies_yahoo_cleaned.jsonl", count)).await;
|
||||
return Ok(count);
|
||||
} else {
|
||||
logger::log_warn(" State indicates completion but companies_yahoo_cleaned.jsonl not found, re-running...").await;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// === RECOVERY PHASE: Load checkpoint + replay log ===
|
||||
let mut existing_companies: HashMap<String, CompanyCrossPlatformInfo> = HashMap::new();
|
||||
|
||||
Reference in New Issue
Block a user