// src/corporate/storage.rs use super::{types::*, helpers::*}; use crate::util::directories::DataPaths; use crate::util::logger; use tokio::fs; use tokio::io::AsyncWriteExt; use chrono::{Datelike, NaiveDate}; use std::collections::{HashMap}; use std::path::{PathBuf, Path}; const BATCH_SIZE: usize = 500; // Process 500 events at a time /// Load events in streaming fashion to avoid memory buildup pub async fn load_existing_events_streaming( paths: &DataPaths, callback: impl Fn(CompanyEvent) -> anyhow::Result<()> ) -> anyhow::Result { let dir = paths.corporate_events_dir(); if !dir.exists() { logger::log_info("Corporate Storage: No existing events directory found").await; return Ok(0); } let mut total = 0; let mut entries = fs::read_dir(dir).await?; while let Some(entry) = entries.next_entry().await? { let path = entry.path(); if path.extension().and_then(|s| s.to_str()) == Some("json") { let name = path.file_name().and_then(|n| n.to_str()).unwrap_or(""); if name.starts_with("events_") && name.len() == 17 { let content = fs::read_to_string(&path).await?; let events: Vec = serde_json::from_str(&content)?; for event in events { callback(event)?; total += 1; } // Yield to prevent blocking tokio::task::yield_now().await; } } } logger::log_info(&format!("Corporate Storage: Streamed {} events", total)).await; Ok(total) } /// Build lightweight index of events instead of loading everything #[derive(Debug, Clone)] pub struct EventIndex { pub key: String, pub ticker: String, pub date: String, pub file_path: PathBuf, } pub async fn build_event_index(paths: &DataPaths) -> anyhow::Result> { let dir = paths.corporate_events_dir(); if !dir.exists() { return Ok(Vec::new()); } let mut index = Vec::new(); let mut entries = fs::read_dir(dir).await?; while let Some(entry) = entries.next_entry().await? { let path = entry.path(); if path.extension().and_then(|s| s.to_str()) == Some("json") { let name = path.file_name().and_then(|n| n.to_str()).unwrap_or(""); if name.starts_with("events_") && name.len() == 17 { let content = fs::read_to_string(&path).await?; let events: Vec = serde_json::from_str(&content)?; for event in events { index.push(EventIndex { key: event_key(&event), ticker: event.ticker.clone(), date: event.date.clone(), file_path: path.clone(), }); } } } } logger::log_info(&format!("Corporate Storage: Built index with {} entries", index.len())).await; Ok(index) } /// Lookup specific event by loading only its file pub async fn lookup_event_by_key( key: &str, index: &[EventIndex] ) -> anyhow::Result> { let entry = index.iter().find(|e| e.key == key); if let Some(entry) = entry { let content = fs::read_to_string(&entry.file_path).await?; let events: Vec = serde_json::from_str(&content)?; Ok(events.into_iter().find(|e| event_key(e) == key)) } else { Ok(None) } } pub async fn save_optimized_events( paths: &DataPaths, events: Vec // Changed from HashMap to Vec ) -> anyhow::Result<()> { let dir = paths.corporate_events_dir(); fs::create_dir_all(dir).await?; logger::log_info("Corporate Storage: Removing old event files...").await; let mut removed_count = 0; let mut entries = fs::read_dir(dir).await?; while let Some(entry) = entries.next_entry().await? { let path = entry.path(); let name = path.file_name().and_then(|n| n.to_str()).unwrap_or(""); if name.starts_with("events_") && path.extension().map(|e| e == "json").unwrap_or(false) { fs::remove_file(&path).await?; removed_count += 1; } } logger::log_info(&format!("Corporate Storage: Removed {} old event files", removed_count)).await; let total_events = events.len(); let mut sorted = events; sorted.sort_by(|a, b| { a.ticker.cmp(&b.ticker) .then(a.date.cmp(&b.date)) }); // Process in batches to avoid memory buildup let mut by_month: HashMap> = HashMap::new(); for chunk in sorted.chunks(BATCH_SIZE) { for e in chunk { if let Ok(d) = NaiveDate::parse_from_str(&e.date, "%Y-%m-%d") { let key = format!("{}-{:02}", d.year(), d.month()); by_month.entry(key).or_default().push(e.clone()); } } tokio::task::yield_now().await; } let total_months = by_month.len(); for (month, list) in by_month { let path = dir.join(format!("events_{}.json", month)); fs::write(&path, serde_json::to_string_pretty(&list)?).await?; logger::log_info(&format!("Corporate Storage: Saved {} events for month {}", list.len(), month)).await; } logger::log_info(&format!("Corporate Storage: Saved {} total events in {} month files", total_events, total_months)).await; Ok(()) } pub async fn save_changes(paths: &DataPaths, changes: &[CompanyEventChange]) -> anyhow::Result<()> { if changes.is_empty() { logger::log_info("Corporate Storage: No changes to save").await; return Ok(()); } let dir = paths.corporate_changes_dir(); fs::create_dir_all(dir).await?; logger::log_info(&format!("Corporate Storage: Saving {} changes", changes.len())).await; let mut by_month: HashMap> = HashMap::new(); for c in changes { if let Ok(d) = NaiveDate::parse_from_str(&c.date, "%Y-%m-%d") { let key = format!("{}-{:02}", d.year(), d.month()); by_month.entry(key).or_default().push(c.clone()); } } for (month, list) in by_month { let path = dir.join(format!("changes_{}.json", month)); let mut all = if path.exists() { let s = fs::read_to_string(&path).await?; serde_json::from_str(&s).unwrap_or_default() } else { vec![] }; all.extend(list.clone()); fs::write(&path, serde_json::to_string_pretty(&all)?).await?; logger::log_info(&format!("Corporate Storage: Saved {} changes for month {}", list.len(), month)).await; } logger::log_info("Corporate Storage: All changes saved successfully").await; Ok(()) } pub async fn save_prices_for_ticker( paths: &DataPaths, ticker: &str, timeframe: &str, mut prices: Vec ) -> anyhow::Result<()> { let base_dir = paths.corporate_prices_dir(); let company_dir = base_dir.join(ticker.replace(".", "_")); let timeframe_dir = company_dir.join(timeframe); fs::create_dir_all(&timeframe_dir).await?; let path = timeframe_dir.join("prices.json"); prices.sort_by_key(|p| (p.date.clone(), p.time.clone())); let json = serde_json::to_string_pretty(&prices)?; fs::write(&path, json).await?; Ok(()) } pub fn get_company_dir(paths: &DataPaths, lei: &str) -> PathBuf { paths.corporate_prices_dir().join(lei) } pub async fn ensure_company_dirs(paths: &DataPaths, isin: &str) -> anyhow::Result<()> { let base = get_company_dir(paths, isin); let paths_to_create = [ base.clone(), base.join("5min"), base.join("daily"), base.join("aggregated").join("5min"), base.join("aggregated").join("daily"), ]; for p in paths_to_create { fs::create_dir_all(&p).await?; } Ok(()) } pub async fn save_available_exchanges( paths: &DataPaths, isin: &str, exchanges: Vec ) -> anyhow::Result<()> { let dir = get_company_dir(paths, isin); fs::create_dir_all(&dir).await?; let path = dir.join("available_exchanges.json"); fs::write(&path, serde_json::to_string_pretty(&exchanges)?).await?; Ok(()) } pub async fn load_available_exchanges(paths: &DataPaths, lei: &str) -> anyhow::Result> { let path = get_company_dir(paths, lei).join("available_exchanges.json"); if path.exists() { let content = fs::read_to_string(&path).await?; Ok(serde_json::from_str(&content)?) } else { Ok(vec![]) } } pub async fn save_prices_by_source( paths: &DataPaths, lei: &str, source_ticker: &str, timeframe: &str, prices: Vec, ) -> anyhow::Result<()> { let source_safe = source_ticker.replace(".", "_").replace("/", "_"); let dir = get_company_dir(paths, lei).join(timeframe).join(&source_safe); fs::create_dir_all(&dir).await?; let path = dir.join("prices.json"); let mut prices = prices; prices.sort_by_key(|p| (p.date.clone(), p.time.clone())); fs::write(&path, serde_json::to_string_pretty(&prices)?).await?; Ok(()) } /// Saves companies data to a JSONL file in streaming fashion pub async fn save_companies_to_jsonl_streaming( paths: &DataPaths, companies: &HashMap>, ) -> anyhow::Result<()> { let file_path = paths.data_dir().join("companies.jsonl"); logger::log_info(&format!("Corporate Storage: Saving {} companies to JSONL", companies.len())).await; if let Some(parent) = file_path.parent() { tokio::fs::create_dir_all(parent).await?; } let mut file = tokio::fs::File::create(&file_path).await?; let mut count = 0; // Process in batches for (name, securities) in companies.iter() { let line = serde_json::json!({ "name": name, "securities": securities }); file.write_all(line.to_string().as_bytes()).await?; file.write_all(b"\n").await?; count += 1; if count % 100 == 0 { tokio::task::yield_now().await; } } let msg = format!("✓ Saved {} companies to {:?}", companies.len(), file_path); println!("{}", msg); logger::log_info(&msg).await; Ok(()) } /// Load companies from JSONL in streaming fashion pub async fn load_companies_from_jsonl_streaming( path: &Path, callback: impl Fn(String, HashMap) -> anyhow::Result<()> ) -> anyhow::Result { if !path.exists() { return Ok(0); } let content = tokio::fs::read_to_string(path).await?; let mut count = 0; for line in content.lines() { if line.trim().is_empty() { continue; } let entry: serde_json::Value = serde_json::from_str(line)?; let name = entry["name"].as_str().unwrap_or("").to_string(); let securities: HashMap = serde_json::from_value( entry["securities"].clone() )?; callback(name, securities)?; count += 1; if count % 100 == 0 { tokio::task::yield_now().await; } } Ok(count) }