cleaned yahoo hits
This commit is contained in:
@@ -66,7 +66,7 @@ pub async fn run_full_update(
|
||||
|
||||
if let Some(date_dir) = date_dir {
|
||||
logger::log_info(&format!(" Using FIGI data from: {:?}", date_dir)).await;
|
||||
build_securities_from_figi_streaming(&date_dir).await?;
|
||||
load_or_build_all_securities(&date_dir).await?;
|
||||
logger::log_info(" ✓ Securities map updated").await;
|
||||
} else {
|
||||
logger::log_warn(" ✗ No FIGI data directory found").await;
|
||||
@@ -88,6 +88,7 @@ pub async fn run_full_update(
|
||||
|
||||
logger::log_info("Step 6: Cleansing up companies with missing essential data...").await;
|
||||
let cleansed_count = companies_yahoo_jsonl(&paths).await?;
|
||||
logger::log_info(&format!(" ✓ {} companies found on Yahoo ready for further use in companies_yahoo.jsonl", cleansed_count)).await;
|
||||
|
||||
if !shutdown_flag.load(Ordering::SeqCst) {
|
||||
logger::log_info("Step 7: Processing events (using index)...").await;
|
||||
@@ -101,20 +102,24 @@ pub async fn run_full_update(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
/// Cleansing function to remove companies with missing essential yahoo data for integrity
|
||||
/// Has to contain a ticker with 'YAHOO:'; Entries with 'YAHOO:NO_RESULTS' are removed
|
||||
/// Has to contain a ticker with 'YAHOO:'; Entries with 'YAHOO:NO_RESULTS' and 'YAHOO:ERROR' are removed
|
||||
/// The rest stays unchanged
|
||||
///
|
||||
/// The '.jsonl' will be saved in the same path but 'companies_filtered.jsonl'
|
||||
/// Uses state.jsonl to track completion and avoid re-running the cleansing operation
|
||||
/// The '.jsonl' will be saved in the same path but 'companies_yahoo.jsonl'
|
||||
/// Only execute when 'companies.jsonl' is present
|
||||
pub async fn companies_yahoo_jsonl(paths: &DataPaths) -> anyhow::Result<usize> {
|
||||
use tokio::fs::File;
|
||||
use tokio::io::{AsyncBufReadExt, AsyncWriteExt, BufReader};
|
||||
use serde_json::json;
|
||||
|
||||
let path = paths.base_dir();
|
||||
let data_path = paths.data_dir();
|
||||
|
||||
let input_path = path.join("corporate").join("companies.jsonl");
|
||||
let output_path = path.join("corporate").join("companies_yahoo.jsonl");
|
||||
let input_path = data_path.join("companies.jsonl");
|
||||
let output_path = data_path.join("companies_yahoo.jsonl");
|
||||
let state_path = data_path.join("state.jsonl");
|
||||
|
||||
// Check if input file exists
|
||||
if !input_path.exists() {
|
||||
@@ -122,6 +127,37 @@ pub async fn companies_yahoo_jsonl(paths: &DataPaths) -> anyhow::Result<usize> {
|
||||
return Ok(0);
|
||||
}
|
||||
|
||||
// Check if state file exists and cleansing was already completed
|
||||
if state_path.exists() {
|
||||
let state_content = tokio::fs::read_to_string(&state_path).await?;
|
||||
|
||||
for line in state_content.lines() {
|
||||
if line.trim().is_empty() {
|
||||
continue;
|
||||
}
|
||||
|
||||
if let Ok(state) = serde_json::from_str::<serde_json::Value>(line) {
|
||||
if state.get("yahoo_companies").and_then(|v| v.as_bool()).unwrap_or(false) {
|
||||
logger::log_info(" Yahoo companies cleansing already completed, reading existing file...").await;
|
||||
|
||||
// Count lines in existing output file
|
||||
if output_path.exists() {
|
||||
let output_content = tokio::fs::read_to_string(&output_path).await?;
|
||||
let count = output_content.lines()
|
||||
.filter(|line| !line.trim().is_empty())
|
||||
.count();
|
||||
|
||||
logger::log_info(&format!(" ✓ Found {} companies in companies_yahoo.jsonl", count)).await;
|
||||
return Ok(count);
|
||||
} else {
|
||||
logger::log_warn(" State indicates completion but companies_yahoo.jsonl not found, re-running...").await;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
logger::log_info(&format!(" Reading from: {:?}", input_path)).await;
|
||||
logger::log_info(&format!(" Writing to: {:?}", output_path)).await;
|
||||
|
||||
@@ -150,11 +186,15 @@ pub async fn companies_yahoo_jsonl(paths: &DataPaths) -> anyhow::Result<usize> {
|
||||
};
|
||||
|
||||
// Check if company has at least one valid YAHOO ticker
|
||||
// Valid means: starts with "YAHOO:" but is NOT "YAHOO:NO_RESULTS"
|
||||
// Valid means: starts with "YAHOO:" but is NOT "YAHOO:NO_RESULTS" or "YAHOO:ERROR"
|
||||
let has_valid_yahoo = company.isin_tickers_map
|
||||
.values()
|
||||
.flatten()
|
||||
.any(|ticker| ticker.starts_with("YAHOO:") && ticker != "YAHOO:NO_RESULTS");
|
||||
.any(|ticker| {
|
||||
ticker.starts_with("YAHOO:")
|
||||
&& ticker != "YAHOO:NO_RESULTS"
|
||||
&& ticker != "YAHOO:ERROR"
|
||||
});
|
||||
|
||||
if has_valid_yahoo {
|
||||
// Write the company to the filtered output
|
||||
@@ -183,6 +223,20 @@ pub async fn companies_yahoo_jsonl(paths: &DataPaths) -> anyhow::Result<usize> {
|
||||
total_count, valid_count, removed_count
|
||||
)).await;
|
||||
|
||||
// Write state file to mark completion
|
||||
let yahoo_companies = json!({
|
||||
"yahoo_companies": true,
|
||||
"completed_at": chrono::Utc::now().to_rfc3339(),
|
||||
});
|
||||
|
||||
let mut state_file = File::create(&state_path).await?;
|
||||
let state_line = serde_json::to_string(&yahoo_companies)?;
|
||||
state_file.write_all(state_line.as_bytes()).await?;
|
||||
state_file.write_all(b"\n").await?;
|
||||
state_file.flush().await?;
|
||||
|
||||
logger::log_info(&format!(" ✓ State file created at: {:?}", state_path)).await;
|
||||
|
||||
Ok(valid_count)
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user