cleaned yahoo hits

This commit is contained in:
2025-12-24 00:00:21 +01:00
parent f9f09d0291
commit 86944a9c58
4 changed files with 829 additions and 217 deletions

View File

@@ -66,7 +66,7 @@ pub async fn run_full_update(
if let Some(date_dir) = date_dir {
logger::log_info(&format!(" Using FIGI data from: {:?}", date_dir)).await;
build_securities_from_figi_streaming(&date_dir).await?;
load_or_build_all_securities(&date_dir).await?;
logger::log_info(" ✓ Securities map updated").await;
} else {
logger::log_warn(" ✗ No FIGI data directory found").await;
@@ -88,6 +88,7 @@ pub async fn run_full_update(
logger::log_info("Step 6: Cleansing up companies with missing essential data...").await;
let cleansed_count = companies_yahoo_jsonl(&paths).await?;
logger::log_info(&format!("{} companies found on Yahoo ready for further use in companies_yahoo.jsonl", cleansed_count)).await;
if !shutdown_flag.load(Ordering::SeqCst) {
logger::log_info("Step 7: Processing events (using index)...").await;
@@ -101,20 +102,24 @@ pub async fn run_full_update(
Ok(())
}
/// Cleansing function that removes companies lacking essential Yahoo data, to preserve data integrity
/// Has to contain a ticker with 'YAHOO:'; Entries with 'YAHOO:NO_RESULTS' are removed
/// A company is kept only if at least one of its tickers starts with 'YAHOO:' and is neither 'YAHOO:NO_RESULTS' nor 'YAHOO:ERROR'; all other companies are removed
/// Kept companies are otherwise left unchanged
///
/// The '.jsonl' will be saved in the same path but 'companies_filtered.jsonl'
/// Uses state.jsonl to track completion and avoid re-running the cleansing operation
/// The output is written to the same directory as the input, under the name 'companies_yahoo.jsonl'
/// Only runs when 'companies.jsonl' is present
pub async fn companies_yahoo_jsonl(paths: &DataPaths) -> anyhow::Result<usize> {
use tokio::fs::File;
use tokio::io::{AsyncBufReadExt, AsyncWriteExt, BufReader};
use serde_json::json;
let path = paths.base_dir();
let data_path = paths.data_dir();
let input_path = path.join("corporate").join("companies.jsonl");
let output_path = path.join("corporate").join("companies_yahoo.jsonl");
let input_path = data_path.join("companies.jsonl");
let output_path = data_path.join("companies_yahoo.jsonl");
let state_path = data_path.join("state.jsonl");
// Check if input file exists
if !input_path.exists() {
@@ -122,6 +127,37 @@ pub async fn companies_yahoo_jsonl(paths: &DataPaths) -> anyhow::Result<usize> {
return Ok(0);
}
// Check if state file exists and cleansing was already completed
if state_path.exists() {
let state_content = tokio::fs::read_to_string(&state_path).await?;
for line in state_content.lines() {
if line.trim().is_empty() {
continue;
}
if let Ok(state) = serde_json::from_str::<serde_json::Value>(line) {
if state.get("yahoo_companies").and_then(|v| v.as_bool()).unwrap_or(false) {
logger::log_info(" Yahoo companies cleansing already completed, reading existing file...").await;
// Count lines in existing output file
if output_path.exists() {
let output_content = tokio::fs::read_to_string(&output_path).await?;
let count = output_content.lines()
.filter(|line| !line.trim().is_empty())
.count();
logger::log_info(&format!(" ✓ Found {} companies in companies_yahoo.jsonl", count)).await;
return Ok(count);
} else {
logger::log_warn(" State indicates completion but companies_yahoo.jsonl not found, re-running...").await;
break;
}
}
}
}
}
logger::log_info(&format!(" Reading from: {:?}", input_path)).await;
logger::log_info(&format!(" Writing to: {:?}", output_path)).await;
@@ -150,11 +186,15 @@ pub async fn companies_yahoo_jsonl(paths: &DataPaths) -> anyhow::Result<usize> {
};
// Check if company has at least one valid YAHOO ticker
// Valid means: starts with "YAHOO:" but is NOT "YAHOO:NO_RESULTS"
// Valid means: starts with "YAHOO:" and is neither "YAHOO:NO_RESULTS" nor "YAHOO:ERROR"
let has_valid_yahoo = company.isin_tickers_map
.values()
.flatten()
.any(|ticker| ticker.starts_with("YAHOO:") && ticker != "YAHOO:NO_RESULTS");
.any(|ticker| {
ticker.starts_with("YAHOO:")
&& ticker != "YAHOO:NO_RESULTS"
&& ticker != "YAHOO:ERROR"
});
if has_valid_yahoo {
// Write the company to the filtered output
@@ -183,6 +223,20 @@ pub async fn companies_yahoo_jsonl(paths: &DataPaths) -> anyhow::Result<usize> {
total_count, valid_count, removed_count
)).await;
// Write state file to mark completion
let yahoo_companies = json!({
"yahoo_companies": true,
"completed_at": chrono::Utc::now().to_rfc3339(),
});
let mut state_file = File::create(&state_path).await?;
let state_line = serde_json::to_string(&yahoo_companies)?;
state_file.write_all(state_line.as_bytes()).await?;
state_file.write_all(b"\n").await?;
state_file.flush().await?;
logger::log_info(&format!(" ✓ State file created at: {:?}", state_path)).await;
Ok(valid_count)
}