added working hard reset
@@ -1,5 +1,5 @@
 // src/corporate/update.rs - UPDATED WITH DATA INTEGRITY FIXES
-use super::{scraper::*, storage::*, helpers::*, types::*, openfigi::*, yahoo::*};
+use super::{scraper::*, storage::*, helpers::*, types::*, openfigi::*};
 use crate::config::Config;
 use crate::corporate::update_parallel::build_companies_jsonl_streaming_parallel;
 use crate::util::directories::DataPaths;
@@ -11,7 +11,7 @@ use std::collections::HashMap;
 use std::sync::Arc;
 use std::sync::atomic::{AtomicBool, Ordering};
 
-/// UPDATED: Main corporate update entry point with shutdown awareness
+/// Main corporate update entry point with shutdown awareness
 pub async fn run_full_update(
     _config: &Config,
     pool: &Arc<ChromeDriverPool>,
@@ -81,8 +81,16 @@ pub async fn run_full_update(
     let count = build_companies_jsonl_streaming_parallel(&paths, pool, shutdown_flag, _config, &None).await?;
     logger::log_info(&format!(" ✓ Saved {} companies", count)).await;
 
+    if shutdown_flag.load(Ordering::SeqCst) {
+        logger::log_warn("Shutdown detected after companies.jsonl build").await;
+        return Ok(());
+    }
+
+    logger::log_info("Step 6: Cleansing companies with missing essential data...").await;
+    let _cleansed_count = companies_yahoo_jsonl(&paths).await?;
+
     if !shutdown_flag.load(Ordering::SeqCst) {
-        logger::log_info("Step 6: Processing events (using index)...").await;
+        logger::log_info("Step 7: Processing events (using index)...").await;
         let _event_index = build_event_index(&paths).await?;
         logger::log_info(" ✓ Event index built").await;
     } else {
@@ -93,6 +101,91 @@ pub async fn run_full_update(
     Ok(())
 }
 
+/// Cleansing pass that removes companies missing essential Yahoo data, for data integrity.
+/// A company is kept only if it has at least one ticker starting with 'YAHOO:';
+/// entries whose only Yahoo ticker is 'YAHOO:NO_RESULTS' are removed.
+/// Kept entries pass through unchanged.
+///
+/// The filtered '.jsonl' is saved alongside the input as 'companies_yahoo.jsonl'.
+/// Only runs when 'companies.jsonl' is present; otherwise it is skipped and returns 0.
+pub async fn companies_yahoo_jsonl(paths: &DataPaths) -> anyhow::Result<usize> {
+    use tokio::fs::File;
+    use tokio::io::{AsyncBufReadExt, AsyncWriteExt, BufReader};
+
+    let path = paths.base_dir();
+
+    let input_path = path.join("corporate").join("companies.jsonl");
+    let output_path = path.join("corporate").join("companies_yahoo.jsonl");
+
+    // Check if input file exists
+    if !input_path.exists() {
+        logger::log_warn("companies.jsonl not found, skipping cleansing").await;
+        return Ok(0);
+    }
+
+    logger::log_info(&format!(" Reading from: {:?}", input_path)).await;
+    logger::log_info(&format!(" Writing to: {:?}", output_path)).await;
+
+    let file = File::open(&input_path).await?;
+    let reader = BufReader::new(file);
+    let mut lines = reader.lines();
+
+    let mut output_file = File::create(&output_path).await?;
+    let mut valid_count = 0;
+    let mut removed_count = 0;
+    let mut total_count = 0;
+
+    while let Some(line) = lines.next_line().await? {
+        if line.trim().is_empty() {
+            continue;
+        }
+
+        total_count += 1;
+
+        let company: CompanyCrossPlatformInfo = match serde_json::from_str(&line) {
+            Ok(c) => c,
+            Err(e) => {
+                logger::log_warn(&format!(" Failed to parse company on line {}: {}", total_count, e)).await;
+                continue;
+            }
+        };
+
+        // Check if company has at least one valid YAHOO ticker
+        // Valid means: starts with "YAHOO:" but is NOT "YAHOO:NO_RESULTS"
+        let has_valid_yahoo = company.isin_tickers_map
+            .values()
+            .flatten()
+            .any(|ticker| ticker.starts_with("YAHOO:") && ticker != "YAHOO:NO_RESULTS");
+
+        if has_valid_yahoo {
+            // Write the company to the filtered output
+            let json_line = serde_json::to_string(&company)?;
+            output_file.write_all(json_line.as_bytes()).await?;
+            output_file.write_all(b"\n").await?;
+            valid_count += 1;
+        } else {
+            removed_count += 1;
+            if removed_count <= 5 {
+                // Log first few removals for debugging
+                logger::log_info(&format!(" Removed company '{}' (no valid Yahoo ticker)", company.name)).await;
+            }
+        }
+
+        // Progress indicator for large files
+        if total_count % 1000 == 0 {
+            logger::log_info(&format!(" Processed {} companies...", total_count)).await;
+        }
+    }
+
+    output_file.flush().await?;
+
+    logger::log_info(&format!(
+        " ✓ Cleansing complete: {} total → {} valid, {} removed",
+        total_count, valid_count, removed_count
+    )).await;
+
+    Ok(valid_count)
+}
+
 async fn find_most_recent_figi_date_dir(paths: &DataPaths) -> anyhow::Result<Option<std::path::PathBuf>> {
     let map_cache_dir = paths.cache_gleif_openfigi_map_dir();
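
The filter at the heart of the new companies_yahoo_jsonl step can be reviewed in isolation. Below is a minimal, self-contained sketch of the same predicate; the IsinTickersMap alias is an assumption about the shape of CompanyCrossPlatformInfo::isin_tickers_map (ISIN → list of ticker strings), inferred from the .values().flatten() chain in the diff, not the project's actual definition:

use std::collections::HashMap;

// Hypothetical stand-in for the real field's type (assumption): ISIN -> tickers.
type IsinTickersMap = HashMap<String, Vec<String>>;

// Same rule the diff applies per company: keep it only if at least one ticker
// starts with "YAHOO:" and is not the "YAHOO:NO_RESULTS" placeholder.
fn has_valid_yahoo(map: &IsinTickersMap) -> bool {
    map.values()
        .flatten()
        .any(|t| t.starts_with("YAHOO:") && t != "YAHOO:NO_RESULTS")
}

fn main() {
    let mut kept = IsinTickersMap::new();
    kept.insert("US0378331005".into(), vec!["YAHOO:AAPL".into()]);

    let mut dropped = IsinTickersMap::new();
    dropped.insert("XX0000000000".into(), vec!["YAHOO:NO_RESULTS".into()]);

    assert!(has_valid_yahoo(&kept));     // would be written to companies_yahoo.jsonl
    assert!(!has_valid_yahoo(&dropped)); // would be removed by the cleansing pass
}

One design note: because the step streams line by line and writes to a separate companies_yahoo.jsonl rather than rewriting companies.jsonl in place, a shutdown mid-filter leaves the original input intact.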