added working hard reset

This commit is contained in:
2025-12-23 15:07:40 +01:00
parent fb0876309f
commit f9f09d0291
5 changed files with 666 additions and 127 deletions

View File

@@ -1,5 +1,5 @@
// src/corporate/update.rs - UPDATED WITH DATA INTEGRITY FIXES
use super::{scraper::*, storage::*, helpers::*, types::*, openfigi::*, yahoo::*};
use super::{scraper::*, storage::*, helpers::*, types::*, openfigi::*};
use crate::config::Config;
use crate::corporate::update_parallel::build_companies_jsonl_streaming_parallel;
use crate::util::directories::DataPaths;
@@ -11,7 +11,7 @@ use std::collections::HashMap;
use std::sync::Arc;
use std::sync::atomic::{AtomicBool, Ordering};
/// UPDATED: Main corporate update entry point with shutdown awareness
/// Main corporate update entry point with shutdown awareness
pub async fn run_full_update(
_config: &Config,
pool: &Arc<ChromeDriverPool>,
@@ -81,8 +81,16 @@ pub async fn run_full_update(
let count = build_companies_jsonl_streaming_parallel(&paths, pool, shutdown_flag, _config, &None).await?;
logger::log_info(&format!(" ✓ Saved {} companies", count)).await;
if shutdown_flag.load(Ordering::SeqCst) {
logger::log_warn("Shutdown detected after companies.jsonl build").await;
return Ok(());
}
logger::log_info("Step 6: Cleansing up companies with missing essential data...").await;
let cleansed_count = companies_yahoo_jsonl(&paths).await?;
if !shutdown_flag.load(Ordering::SeqCst) {
logger::log_info("Step 6: Processing events (using index)...").await;
logger::log_info("Step 7: Processing events (using index)...").await;
let _event_index = build_event_index(&paths).await?;
logger::log_info(" ✓ Event index built").await;
} else {
@@ -93,6 +101,91 @@ pub async fn run_full_update(
Ok(())
}
/// Filters `companies.jsonl` down to the companies that have a usable Yahoo ticker.
///
/// A company is kept when at least one ticker in its `isin_tickers_map` starts with
/// `YAHOO:` and is not the `YAHOO:NO_RESULTS` placeholder. Kept companies are
/// re-serialized, one JSON object per line; all other companies are dropped.
///
/// Input is read from `<base_dir>/corporate/companies.jsonl` and the result is written
/// next to it as `<base_dir>/corporate/companies_yahoo.jsonl` (created or truncated).
///
/// # Returns
/// The number of companies written to the output file. Returns `0` without creating
/// an output file when `companies.jsonl` does not exist.
///
/// # Errors
/// Fails if the input cannot be opened or read, if the output cannot be created or
/// written, or if a kept company cannot be serialized. Lines that fail to parse are
/// logged as warnings and skipped; they are counted in the total but neither as valid
/// nor as removed.
pub async fn companies_yahoo_jsonl(paths: &DataPaths) -> anyhow::Result<usize> {
    use tokio::fs::File;
    use tokio::io::{AsyncBufReadExt, AsyncWriteExt, BufReader, BufWriter};

    let path = paths.base_dir();
    let input_path = path.join("corporate").join("companies.jsonl");
    let output_path = path.join("corporate").join("companies_yahoo.jsonl");

    // Nothing to cleanse if the companies file has not been built yet.
    if !input_path.exists() {
        logger::log_warn("companies.jsonl not found, skipping cleansing").await;
        return Ok(0);
    }

    logger::log_info(&format!(" Reading from: {:?}", input_path)).await;
    logger::log_info(&format!(" Writing to: {:?}", output_path)).await;

    let mut lines = BufReader::new(File::open(&input_path).await?).lines();

    // Buffer the output: without it every record costs two separate file writes.
    let mut writer = BufWriter::new(File::create(&output_path).await?);

    let mut valid_count: usize = 0;
    let mut removed_count: usize = 0;
    let mut total_count: usize = 0;
    // Physical line number in the input (blank lines included), used for diagnostics.
    let mut line_no: usize = 0;

    while let Some(line) = lines.next_line().await? {
        line_no += 1;
        if line.trim().is_empty() {
            continue;
        }
        total_count += 1;

        let company: CompanyCrossPlatformInfo = match serde_json::from_str(&line) {
            Ok(c) => c,
            Err(e) => {
                // Report the real line number so the offending record can be located.
                logger::log_warn(&format!(" Failed to parse company on line {}: {}", line_no, e)).await;
                continue;
            }
        };

        // Valid means: starts with "YAHOO:" but is NOT the "YAHOO:NO_RESULTS" marker.
        let has_valid_yahoo = company
            .isin_tickers_map
            .values()
            .flatten()
            .any(|ticker| ticker.starts_with("YAHOO:") && ticker != "YAHOO:NO_RESULTS");

        if has_valid_yahoo {
            // Emit record and newline as a single write.
            let mut json_line = serde_json::to_string(&company)?;
            json_line.push('\n');
            writer.write_all(json_line.as_bytes()).await?;
            valid_count += 1;
        } else {
            removed_count += 1;
            if removed_count <= 5 {
                // Log only the first few removals to keep the log readable.
                logger::log_info(&format!(" Removed company '{}' (no valid Yahoo ticker)", company.name)).await;
            }
        }

        // Progress indicator for large files.
        if total_count % 1000 == 0 {
            logger::log_info(&format!(" Processed {} companies...", total_count)).await;
        }
    }

    // Flush explicitly so write errors surface here instead of being lost on drop.
    writer.flush().await?;

    logger::log_info(&format!(
        " ✓ Cleansing complete: {} total → {} valid, {} removed",
        total_count, valid_count, removed_count
    )).await;

    Ok(valid_count)
}
async fn find_most_recent_figi_date_dir(paths: &DataPaths) -> anyhow::Result<Option<std::path::PathBuf>> {
let map_cache_dir = paths.cache_gleif_openfigi_map_dir();