added data streaming instead of loading
@@ -7,18 +7,24 @@ use tokio::fs;
+use tokio::io::AsyncWriteExt;
 use chrono::{Datelike, NaiveDate};
 use std::collections::{HashMap};
-use std::path::{PathBuf};
+use std::path::{PathBuf, Path};

-pub async fn load_existing_events(paths: &DataPaths) -> anyhow::Result<HashMap<String, CompanyEvent>> {
-    let mut map = HashMap::new();
+const BATCH_SIZE: usize = 500; // Process 500 events at a time
+
+/// Load events in streaming fashion to avoid memory buildup
+pub async fn load_existing_events_streaming(
+    paths: &DataPaths,
+    callback: impl Fn(CompanyEvent) -> anyhow::Result<()>
+) -> anyhow::Result<usize> {
     let dir = paths.corporate_events_dir();
     if !dir.exists() {
         logger::log_info("Corporate Storage: No existing events directory found").await;
-        return Ok(map);
+        return Ok(0);
     }

+    let mut total = 0;
     let mut entries = fs::read_dir(dir).await?;
-    let mut loaded_count = 0;

     while let Some(entry) = entries.next_entry().await? {
         let path = entry.path();
         if path.extension().and_then(|s| s.to_str()) == Some("json") {
@@ -26,18 +32,84 @@ pub async fn load_existing_events(paths: &DataPaths) -> anyhow::Result<HashMap<S
             if name.starts_with("events_") && name.len() == 17 {
                 let content = fs::read_to_string(&path).await?;
                 let events: Vec<CompanyEvent> = serde_json::from_str(&content)?;

                 for event in events {
-                    map.insert(event_key(&event), event);
+                    callback(event)?;
+                    total += 1;
                 }
-                loaded_count += 1;
+
+                // Yield to prevent blocking
+                tokio::task::yield_now().await;
             }
         }
     }
-    logger::log_info(&format!("Corporate Storage: Loaded {} events from {} files", map.len(), loaded_count)).await;
-    Ok(map)
+
+    logger::log_info(&format!("Corporate Storage: Streamed {} events", total)).await;
+    Ok(total)
 }
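
Note on driving the new API: the callback parameter is a plain Fn, so any state a
caller accumulates needs interior mutability. A minimal usage sketch (the
event_type field is a hypothetical stand-in for whatever CompanyEvent actually
exposes):

    use std::sync::atomic::{AtomicUsize, Ordering};

    // Count matching events without ever materializing the full set in memory.
    async fn count_dividends(paths: &DataPaths) -> anyhow::Result<usize> {
        let hits = AtomicUsize::new(0);
        load_existing_events_streaming(paths, |event| {
            // event_type is illustrative; substitute the real CompanyEvent field.
            if event.event_type == "dividend" {
                hits.fetch_add(1, Ordering::Relaxed);
            }
            Ok(())
        })
        .await?;
        Ok(hits.load(Ordering::Relaxed))
    }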

-pub async fn save_optimized_events(paths: &DataPaths, events: HashMap<String, CompanyEvent>) -> anyhow::Result<()> {
+/// Build lightweight index of events instead of loading everything
+#[derive(Debug, Clone)]
+pub struct EventIndex {
+    pub key: String,
+    pub ticker: String,
+    pub date: String,
+    pub file_path: PathBuf,
+}
+
+pub async fn build_event_index(paths: &DataPaths) -> anyhow::Result<Vec<EventIndex>> {
+    let dir = paths.corporate_events_dir();
+    if !dir.exists() {
+        return Ok(Vec::new());
+    }
+
+    let mut index = Vec::new();
+    let mut entries = fs::read_dir(dir).await?;
+
+    while let Some(entry) = entries.next_entry().await? {
+        let path = entry.path();
+        if path.extension().and_then(|s| s.to_str()) == Some("json") {
+            let name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
+            if name.starts_with("events_") && name.len() == 17 {
+                let content = fs::read_to_string(&path).await?;
+                let events: Vec<CompanyEvent> = serde_json::from_str(&content)?;
+
+                for event in events {
+                    index.push(EventIndex {
+                        key: event_key(&event),
+                        ticker: event.ticker.clone(),
+                        date: event.date.clone(),
+                        file_path: path.clone(),
+                    });
+                }
+            }
+        }
+    }
+
+    logger::log_info(&format!("Corporate Storage: Built index with {} entries", index.len())).await;
+    Ok(index)
+}
+
+/// Lookup specific event by loading only its file
+pub async fn lookup_event_by_key(
+    key: &str,
+    index: &[EventIndex]
+) -> anyhow::Result<Option<CompanyEvent>> {
+    let entry = index.iter().find(|e| e.key == key);
+
+    if let Some(entry) = entry {
+        let content = fs::read_to_string(&entry.file_path).await?;
+        let events: Vec<CompanyEvent> = serde_json::from_str(&content)?;
+        Ok(events.into_iter().find(|e| event_key(e) == key))
+    } else {
+        Ok(None)
+    }
+}
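
The index and the lookup together form a two-step read path. A sketch of the
intended call pattern, using only the functions defined in this diff:

    // Resolve one event by key: build (or reuse) the lightweight index,
    // then read back only the single month file that contains the key.
    async fn find_event(paths: &DataPaths, key: &str) -> anyhow::Result<Option<CompanyEvent>> {
        let index = build_event_index(paths).await?;
        lookup_event_by_key(key, &index).await
    }

In practice the index would be built once and cached, since build_event_index
still scans every month file to collect keys.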
+
+pub async fn save_optimized_events(
+    paths: &DataPaths,
+    events: Vec<CompanyEvent> // Changed from HashMap to Vec
+) -> anyhow::Result<()> {
     let dir = paths.corporate_events_dir();
     fs::create_dir_all(dir).await?;

@@ -55,15 +127,23 @@ pub async fn save_optimized_events(paths: &DataPaths, events: HashMap<String, Co
     logger::log_info(&format!("Corporate Storage: Removed {} old event files", removed_count)).await;

     let total_events = events.len();
-    let mut sorted: Vec<_> = events.into_values().collect();
-    sorted.sort_by_key(|e| (e.ticker.clone(), e.date.clone()));
+    let mut sorted = events;
+    sorted.sort_by(|a, b| {
+        a.ticker.cmp(&b.ticker)
+            .then(a.date.cmp(&b.date))
+    });

+    // Process in batches to avoid memory buildup
     let mut by_month: HashMap<String, Vec<CompanyEvent>> = HashMap::new();
-    for e in sorted {
-        if let Ok(d) = NaiveDate::parse_from_str(&e.date, "%Y-%m-%d") {
-            let key = format!("{}-{:02}", d.year(), d.month());
-            by_month.entry(key).or_default().push(e);
+
+    for chunk in sorted.chunks(BATCH_SIZE) {
+        for e in chunk {
+            if let Ok(d) = NaiveDate::parse_from_str(&e.date, "%Y-%m-%d") {
+                let key = format!("{}-{:02}", d.year(), d.month());
+                by_month.entry(key).or_default().push(e.clone());
+            }
         }
+        tokio::task::yield_now().await;
     }

     let total_months = by_month.len();
@@ -72,6 +152,7 @@ pub async fn save_optimized_events(paths: &DataPaths, events: HashMap<String, Co
         fs::write(&path, serde_json::to_string_pretty(&list)?).await?;
+        logger::log_info(&format!("Corporate Storage: Saved {} events for month {}", list.len(), month)).await;
     }

     logger::log_info(&format!("Corporate Storage: Saved {} total events in {} month files", total_events, total_months)).await;
     Ok(())
 }
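
For clarity on the bucketing above: each event date is parsed and reduced to a
YYYY-MM key, so all events of a calendar month land in the same output file. A
standalone chrono snippet (the date literal is only illustrative):

    use chrono::{Datelike, NaiveDate};

    let d = NaiveDate::parse_from_str("2024-03-15", "%Y-%m-%d").unwrap();
    assert_eq!(format!("{}-{:02}", d.year(), d.month()), "2024-03");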
@@ -108,7 +189,12 @@ pub async fn save_changes(paths: &DataPaths, changes: &[CompanyEventChange]) ->
     Ok(())
 }

-pub async fn save_prices_for_ticker(paths: &DataPaths, ticker: &str, timeframe: &str, mut prices: Vec<CompanyPrice>) -> anyhow::Result<()> {
+pub async fn save_prices_for_ticker(
+    paths: &DataPaths,
+    ticker: &str,
+    timeframe: &str,
+    mut prices: Vec<CompanyPrice>
+) -> anyhow::Result<()> {
     let base_dir = paths.corporate_prices_dir();
     let company_dir = base_dir.join(ticker.replace(".", "_"));
     let timeframe_dir = company_dir.join(timeframe);
@@ -142,7 +228,11 @@ pub async fn ensure_company_dirs(paths: &DataPaths, isin: &str) -> anyhow::Resul
     Ok(())
 }

-pub async fn save_available_exchanges(paths: &DataPaths, isin: &str, exchanges: Vec<AvailableExchange>) -> anyhow::Result<()> {
+pub async fn save_available_exchanges(
+    paths: &DataPaths,
+    isin: &str,
+    exchanges: Vec<AvailableExchange>
+) -> anyhow::Result<()> {
     let dir = get_company_dir(paths, isin);
     fs::create_dir_all(&dir).await?;
     let path = dir.join("available_exchanges.json");
@@ -177,66 +267,8 @@ pub async fn save_prices_by_source(
     Ok(())
 }

-/// Update available_exchanges.json with fetch results
-/*pub async fn update_available_exchange(
-    paths: &DataPaths,
-    isin: &str,
-    ticker: &str,
-    exchange_mic: &str,
-    has_daily: bool,
-    has_5min: bool,
-) -> anyhow::Result<()> {
-    let mut exchanges = load_available_exchanges(paths, isin).await?;
-
-    if let Some(entry) = exchanges.iter_mut().find(|e| e.ticker == ticker) {
-        // Update existing entry
-        entry.record_success(has_daily, has_5min);
-    } else {
-        // Create new entry - need to get currency from somewhere
-        // Try to infer from the ticker or use a default
-        let currency = infer_currency_from_ticker(ticker);
-        let mut new_entry = AvailableExchange::new(
-            ticker.to_string(),
-            exchange_mic.to_string(),
-            currency,
-        );
-        new_entry.record_success(has_daily, has_5min);
-        exchanges.push(new_entry);
-    }
-
-    save_available_exchanges(paths, isin, exchanges).await
-}*/
-
-/// Infer currency from ticker suffix
-fn infer_currency_from_ticker(ticker: &str) -> String {
-    if ticker.ends_with(".L") { return "GBP".to_string(); }
-    if ticker.ends_with(".PA") { return "EUR".to_string(); }
-    if ticker.ends_with(".DE") { return "EUR".to_string(); }
-    if ticker.ends_with(".AS") { return "EUR".to_string(); }
-    if ticker.ends_with(".MI") { return "EUR".to_string(); }
-    if ticker.ends_with(".SW") { return "CHF".to_string(); }
-    if ticker.ends_with(".T") { return "JPY".to_string(); }
-    if ticker.ends_with(".HK") { return "HKD".to_string(); }
-    if ticker.ends_with(".SS") { return "CNY".to_string(); }
-    if ticker.ends_with(".SZ") { return "CNY".to_string(); }
-    if ticker.ends_with(".TO") { return "CAD".to_string(); }
-    if ticker.ends_with(".AX") { return "AUD".to_string(); }
-    if ticker.ends_with(".SA") { return "BRL".to_string(); }
-    if ticker.ends_with(".MC") { return "EUR".to_string(); }
-    if ticker.ends_with(".BO") || ticker.ends_with(".NS") { return "INR".to_string(); }
-
-    "USD".to_string() // Default
-}
-
-/// Saves companies data to a JSONL file.
-///
-/// # Arguments
-/// * `paths` - Reference to DataPaths for directory management
-/// * `companies` - HashMap of company names to their securities (ISIN, Ticker pairs)
-///
-/// # Errors
-/// Returns an error if file operations or serialization fails.
-pub async fn save_companies_to_jsonl(
+/// Saves companies data to a JSONL file in streaming fashion
+pub async fn save_companies_to_jsonl_streaming(
     paths: &DataPaths,
     companies: &HashMap<String, HashMap<String, String>>,
 ) -> anyhow::Result<()> {
@@ -244,13 +276,14 @@ pub async fn save_companies_to_jsonl(

     logger::log_info(&format!("Corporate Storage: Saving {} companies to JSONL", companies.len())).await;

     // Create parent directory if it doesn't exist
     if let Some(parent) = file_path.parent() {
         tokio::fs::create_dir_all(parent).await?;
     }

     let mut file = tokio::fs::File::create(&file_path).await?;
+    let mut count = 0;

+    // Process in batches
     for (name, securities) in companies.iter() {
         let line = serde_json::json!({
             "name": name,
@@ -258,10 +291,49 @@ pub async fn save_companies_to_jsonl(
         });
         file.write_all(line.to_string().as_bytes()).await?;
         file.write_all(b"\n").await?;
+
+        count += 1;
+        if count % 100 == 0 {
+            tokio::task::yield_now().await;
+        }
     }

     let msg = format!("✓ Saved {} companies to {:?}", companies.len(), file_path);
     println!("{}", msg);
     logger::log_info(&msg).await;
     Ok(())
 }
+
+/// Load companies from JSONL in streaming fashion
+pub async fn load_companies_from_jsonl_streaming(
+    path: &Path,
+    callback: impl Fn(String, HashMap<String, String>) -> anyhow::Result<()>
+) -> anyhow::Result<usize> {
+    if !path.exists() {
+        return Ok(0);
+    }
+
+    let content = tokio::fs::read_to_string(path).await?;
+    let mut count = 0;
+
+    for line in content.lines() {
+        if line.trim().is_empty() {
+            continue;
+        }
+
+        let entry: serde_json::Value = serde_json::from_str(line)?;
+        let name = entry["name"].as_str().unwrap_or("").to_string();
+        let securities: HashMap<String, String> = serde_json::from_value(
+            entry["securities"].clone()
+        )?;
+
+        callback(name, securities)?;
+        count += 1;
+
+        if count % 100 == 0 {
+            tokio::task::yield_now().await;
+        }
+    }
+
+    Ok(count)
+}
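
A usage sketch for the streaming loader above; as with the event callback, state
collected by the caller goes through interior mutability (here a RefCell):

    use std::cell::RefCell;
    use std::collections::HashMap;
    use std::path::Path;

    // Pull one company's securities out of the JSONL file while streaming it
    // line by line, without building the whole map.
    async fn securities_of(
        path: &Path,
        company: &str,
    ) -> anyhow::Result<Option<HashMap<String, String>>> {
        let found = RefCell::new(None);
        load_companies_from_jsonl_streaming(path, |name, securities| {
            if name == company {
                *found.borrow_mut() = Some(securities);
            }
            Ok(())
        })
        .await?;
        Ok(found.into_inner())
    }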