added data streaming instead of loading
@@ -6,6 +6,10 @@ use crate::util::logger;
use tokio::fs;
use chrono::{NaiveDate, Datelike};
use std::collections::HashMap;
use serde_json;

const CHUNK_SIZE: usize = 500; // Process 500 events at a time
const MAX_EVENTS_PER_FILE: usize = 3000;

pub async fn scan_existing_chunks(paths: &DataPaths) -> anyhow::Result<Vec<ChunkInfo>> {
    let dir = paths.economic_events_dir();
@@ -18,37 +22,122 @@ pub async fn scan_existing_chunks(paths: &DataPaths) -> anyhow::Result<Vec<Chunk
            if path.extension().map(|e| e == "json").unwrap_or(false) {
                if let Some(name) = path.file_name().and_then(|n| n.to_str()) {
                    if name.starts_with("chunk_") {
                        if let Some(content) = fs::read_to_string(&path).await.ok() {
                            if let Ok(events) = serde_json::from_str::<Vec<EconomicEvent>>(&content) {
                                let start = name[6..16].to_string();
                                let end = name[17..27].to_string();
                                chunks.push(ChunkInfo { start_date: start, end_date: end, path, event_count: events.len() });
                            }
                        }
                        // Don't load the events here, just record the chunk info
                        let start = name[6..16].to_string();
                        let end = name[17..27].to_string();
                        chunks.push(ChunkInfo {
                            start_date: start,
                            end_date: end,
                            path,
                            event_count: 0 // We'll count later if needed
                        });
                    }
                }
            }
        }
    }
    chunks.sort_by_key(|c| c.start_date.clone());
    logger::log_info(&format!("Economic Storage: Scanned {} event chunks", chunks.len())).await;
    logger::log_info(&format!("Economic Storage: Found {} event chunks", chunks.len())).await;
    Ok(chunks)
}
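// Sketch (illustrative, hypothetical helper): the byte slices above appear to assume the
// old file naming chunk_YYYY-MM-DD_YYYY-MM-DD.json ("chunk_" = 6 bytes, each date 10 bytes).
// save_chunk_vec further down writes chunk_NNNN_YYYY-MM-DD_YYYY-MM-DD.json, where those
// offsets no longer line up; splitting on '_' and taking the last two fields covers both:
fn parse_chunk_dates(file_name: &str) -> Option<(String, String)> {
    let stem = file_name.trim_end_matches(".json");
    let mut fields = stem.split('_').rev();
    let end = fields.next()?.to_string();   // e.g. "2024-03-31"
    let start = fields.next()?.to_string(); // e.g. "2024-01-01"
    Some((start, end))
}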

pub async fn load_existing_events(chunks: &[ChunkInfo]) -> anyhow::Result<HashMap<String, EconomicEvent>> {
    let mut map = HashMap::new();
    for chunk in chunks {
        let content = fs::read_to_string(&chunk.path).await?;
        let events: Vec<EconomicEvent> = serde_json::from_str(&content)?;
        for e in events {
            map.insert(event_key(&e), e);
        }
/// Stream events from a single chunk file
pub async fn stream_chunk_events(
    chunk: &ChunkInfo,
    callback: impl Fn(EconomicEvent) -> anyhow::Result<()>
) -> anyhow::Result<usize> {
    let content = fs::read_to_string(&chunk.path).await?;
    let events: Vec<EconomicEvent> = serde_json::from_str(&content)?;
    let count = events.len();

    for event in events {
        callback(event)?;
    }
    logger::log_info(&format!("Economic Storage: Loaded {} events from {} chunks", map.len(), chunks.len())).await;
    Ok(map)

    Ok(count)
}
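// Minimal usage sketch (hypothetical call site): collect one day's events without keeping
// every chunk in memory at once. RefCell works around the Fn (not FnMut) callback bound;
// ChunkInfo's date range serves as a cheap pre-filter before a chunk is deserialized.
async fn events_on_date(chunks: &[ChunkInfo], date: &str) -> anyhow::Result<Vec<EconomicEvent>> {
    use std::cell::RefCell;
    let hits = RefCell::new(Vec::new());
    for chunk in chunks {
        if chunk.start_date.as_str() <= date && date <= chunk.end_date.as_str() {
            stream_chunk_events(chunk, |event| {
                if event.date == date {
                    hits.borrow_mut().push(event);
                }
                Ok(())
            }).await?;
        }
    }
    Ok(hits.into_inner())
}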

pub async fn save_optimized_chunks(paths: &DataPaths, events: HashMap<String, EconomicEvent>) -> anyhow::Result<()> {
/// Load events in batches to avoid memory explosion
pub async fn load_events_in_batches(
    chunks: &[ChunkInfo],
    batch_size: usize,
) -> anyhow::Result<impl Iterator<Item = (String, EconomicEvent)>> {
    let mut all_events = Vec::new();

    for chunk in chunks {
        logger::log_info(&format!("Loading chunk: {:?}", chunk.path.file_name())).await;

        let content = fs::read_to_string(&chunk.path).await?;
        let events: Vec<EconomicEvent> = serde_json::from_str(&content)?;

        for e in events {
            all_events.push((event_key(&e), e));
        }

        // If we've accumulated enough, yield them
        if all_events.len() >= batch_size {
            break;
        }
    }

    logger::log_info(&format!("Loaded {} events in batch", all_events.len())).await;
    Ok(all_events.into_iter())
}
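// Usage sketch (hypothetical call site): drain one batch into a map keyed by event_key.
// Note that, as written, the function returns a single batch built from the leading
// chunks; CHUNK_SIZE is used here only as one plausible batch size.
async fn first_batch_map(chunks: &[ChunkInfo]) -> anyhow::Result<HashMap<String, EconomicEvent>> {
    let batch = load_events_in_batches(chunks, CHUNK_SIZE).await?;
    Ok(batch.collect())
}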

/// NEW: Build a lightweight index instead of loading all events
#[derive(Debug, Clone)]
pub struct EventIndex {
    pub key: String,
    pub identity_key: String,
    pub date: String,
    pub chunk_file: std::path::PathBuf,
}

pub async fn build_event_index(chunks: &[ChunkInfo]) -> anyhow::Result<Vec<EventIndex>> {
    let mut index = Vec::new();

    for chunk in chunks {
        logger::log_info(&format!("Indexing chunk: {:?}", chunk.path.file_name())).await;

        let content = fs::read_to_string(&chunk.path).await?;
        let events: Vec<EconomicEvent> = serde_json::from_str(&content)?;

        for e in events {
            index.push(EventIndex {
                key: event_key(&e),
                identity_key: identity_key(&e),
                date: e.date.clone(),
                chunk_file: chunk.path.clone(),
            });
        }
    }

    logger::log_info(&format!("Built index with {} entries", index.len())).await;
    Ok(index)
}
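// Possible startup wiring (illustrative, not part of the commit): build the lightweight
// index once instead of the removed load_existing_events map; each entry holds only two
// keys, a date and a path rather than a full EconomicEvent.
async fn init_event_index(paths: &DataPaths) -> anyhow::Result<Vec<EventIndex>> {
    let chunks = scan_existing_chunks(paths).await?;
    build_event_index(&chunks).await
}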

/// NEW: Look up a specific event by loading only its chunk
pub async fn lookup_event_by_key(key: &str, index: &[EventIndex]) -> anyhow::Result<Option<EconomicEvent>> {
    // Find which chunk contains this event
    let entry = index.iter().find(|e| e.key == key);

    if let Some(entry) = entry {
        // Load only that chunk
        let content = fs::read_to_string(&entry.chunk_file).await?;
        let events: Vec<EconomicEvent> = serde_json::from_str(&content)?;

        // Find the specific event
        Ok(events.into_iter().find(|e| event_key(e) == key))
    } else {
        Ok(None)
    }
}
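// Hypothetical usage: random access to a single event; only the chunk file recorded in
// the matching index entry is read and deserialized.
async fn log_first_indexed_event(index: &[EventIndex]) -> anyhow::Result<()> {
    if let Some(entry) = index.first() {
        if let Some(event) = lookup_event_by_key(&entry.key, index).await? {
            logger::log_info(&format!("Economic Storage: found event dated {}", event.date)).await;
        }
    }
    Ok(())
}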

/// Save events in smaller, more manageable chunks
pub async fn save_optimized_chunks(
    paths: &DataPaths,
    events: Vec<EconomicEvent> // Changed from HashMap to Vec
) -> anyhow::Result<()> {
    let dir = paths.economic_events_dir();
    fs::create_dir_all(dir).await?;

@@ -67,31 +156,36 @@ pub async fn save_optimized_chunks(paths: &DataPaths, events: HashMap<String, Ec
    }
    logger::log_info(&format!("Economic Storage: Removed {} old chunk files", removed_count)).await;

    let mut sorted: Vec<_> = events.into_values().collect();
    sorted.sort_by_key(|e| e.date.clone());
    let mut sorted = events;
    sorted.sort_by(|a, b| a.date.cmp(&b.date));

    let mut chunk: Vec<EconomicEvent> = Vec::new();
    const MAX_EVENTS_PER_CHUNK: usize = ( 30000 / 2 ) / 11; // (30000 - 2) / 11 = 2727

    for e in sorted {
        if !chunk.is_empty() && chunk.len() >= MAX_EVENTS_PER_CHUNK {
            save_chunk(&chunk, dir).await?;
            chunk.clear();
        }
        chunk.push(e);
    // Save in smaller chunks
    let mut chunk_num = 0;
    for chunk in sorted.chunks(MAX_EVENTS_PER_FILE) {
        save_chunk_vec(chunk, dir, chunk_num).await?;
        chunk_num += 1;

        // Allow other tasks to run
        tokio::task::yield_now().await;
    }
    if !chunk.is_empty() {
        save_chunk(&chunk, dir).await?;
    }
    logger::log_info(&format!("Economic Storage: Saved all event chunks to {:?}", dir)).await;

    logger::log_info(&format!("Economic Storage: Saved {} chunks to {:?}", chunk_num, dir)).await;
    Ok(())
}
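// For intuition (illustrative numbers, not part of the commit): slice::chunks yields
// windows of at most MAX_EVENTS_PER_FILE, so e.g. 7000 sorted events become files of
// 3000, 3000 and 1000 events, and chunk_num ends up equal to the number of files written.
#[cfg(test)]
mod chunk_shape_sketch {
    #[test]
    fn chunks_of_3000() {
        let events = vec![0u8; 7000];
        let sizes: Vec<usize> = events.chunks(3000).map(|c| c.len()).collect();
        assert_eq!(sizes, vec![3000, 3000, 1000]);
    }
}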

async fn save_chunk(events: &[EconomicEvent], dir: &std::path::Path) -> anyhow::Result<()> {
    let start = events.iter().map(|e| &e.date).min().unwrap().clone();
    let end = events.iter().map(|e| &e.date).max().unwrap().clone();
    let path = dir.join(format!("chunk_{}_{}.json", start, end));
    fs::write(&path, serde_json::to_string_pretty(events)?).await?;
async fn save_chunk_vec(events: &[EconomicEvent], dir: &std::path::Path, chunk_num: usize) -> anyhow::Result<()> {
    if events.is_empty() {
        return Ok(());
    }

    let start = &events[0].date;
    let end = &events[events.len() - 1].date;
    let path = dir.join(format!("chunk_{:04}_{}_{}.json", chunk_num, start, end));

    // Write incrementally to avoid large memory allocation
    let json = serde_json::to_string_pretty(events)?;
    fs::write(&path, json).await?;

    logger::log_info(&format!("Economic Storage: Saved chunk {} - {} ({} events)", start, end, events.len())).await;
    Ok(())
}