// src/economic/storage.rs
use super::types::*;
use super::helpers::*;
use crate::util::directories::DataPaths;
use crate::util::logger;
use tokio::fs;
use chrono::{NaiveDate, Datelike};
use std::collections::HashMap;
use serde_json;
const CHUNK_SIZE: usize = 500; // Process 500 events at a time
const MAX_EVENTS_PER_FILE: usize = 3000;
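/// Scan the events directory for saved chunk files without loading their
/// contents. Filenames are expected in `save_chunk_vec`'s format,
/// `chunk_NNNN_YYYY-MM-DD_YYYY-MM-DD.json`.
///
/// # Example
///
/// A minimal usage sketch (marked `ignore` since it needs a tokio runtime and
/// a real `DataPaths`; `paths` is a hypothetical caller-side value):
///
/// ```ignore
/// let chunks = scan_existing_chunks(&paths).await?;
/// for chunk in &chunks {
///     println!("{} .. {}", chunk.start_date, chunk.end_date);
/// }
/// ```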
pub async fn scan_existing_chunks(paths: &DataPaths) -> anyhow::Result<Vec<ChunkInfo>> {
    let dir = paths.economic_events_dir();
    let mut chunks = Vec::new();
    if dir.exists() {
        let mut entries = fs::read_dir(dir).await?;
        while let Some(entry) = entries.next_entry().await? {
            let path = entry.path();
            if path.extension().map(|e| e == "json").unwrap_or(false) {
                if let Some(name) = path.file_name().and_then(|n| n.to_str()) {
                    if name.starts_with("chunk_") {
                        // Don't load the events here, just record the chunk info.
                        // `save_chunk_vec` writes names as
                        // chunk_NNNN_YYYY-MM-DD_YYYY-MM-DD.json, so the start
                        // date sits at bytes 11..21 and the end date at 22..32.
                        // Use `get` so malformed names are skipped rather than
                        // panicking on an out-of-range slice.
                        let (start, end) = match (name.get(11..21), name.get(22..32)) {
                            (Some(s), Some(e)) => (s.to_string(), e.to_string()),
                            _ => continue,
                        };
                        chunks.push(ChunkInfo {
                            start_date: start,
                            end_date: end,
                            path,
                            event_count: 0, // We'll count later if needed
                        });
                    }
                }
            }
        }
    }
    chunks.sort_by_key(|c| c.start_date.clone());
    logger::log_info(&format!("Economic Storage: Found {} event chunks", chunks.len())).await;
    Ok(chunks)
}
/// Stream events from a single chunk file
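///
/// # Example
///
/// A minimal sketch (`ignore`d: needs a tokio runtime and a real chunk on
/// disk; assumes `EconomicEvent` has the `date: String` field used elsewhere
/// in this module):
///
/// ```ignore
/// let count = stream_chunk_events(&chunks[0], |event| {
///     println!("event on {}", event.date);
///     Ok(())
/// }).await?;
/// println!("streamed {} events", count);
/// ```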
pub async fn stream_chunk_events(
    chunk: &ChunkInfo,
    callback: impl Fn(EconomicEvent) -> anyhow::Result<()>,
) -> anyhow::Result<usize> {
    let content = fs::read_to_string(&chunk.path).await?;
    let events: Vec<EconomicEvent> = serde_json::from_str(&content)?;
    let count = events.len();
    for event in events {
        callback(event)?;
    }
    Ok(count)
}
/// Load the first batch of events rather than the whole dataset: chunk files
/// are read in order until at least `batch_size` events have accumulated,
/// then loading stops.
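///
/// # Example
///
/// Sketch (`ignore`d; `chunks` comes from `scan_existing_chunks`, and
/// `process` is a hypothetical consumer):
///
/// ```ignore
/// let batch = load_events_in_batches(&chunks, 1000).await?;
/// for (key, event) in batch {
///     process(&key, &event);
/// }
/// ```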
pub async fn load_events_in_batches(
    chunks: &[ChunkInfo],
    batch_size: usize,
) -> anyhow::Result<impl Iterator<Item = (String, EconomicEvent)>> {
    let mut all_events = Vec::new();
    for chunk in chunks {
        logger::log_info(&format!("Loading chunk: {:?}", chunk.path.file_name())).await;
        let content = fs::read_to_string(&chunk.path).await?;
        let events: Vec<EconomicEvent> = serde_json::from_str(&content)?;
        for e in events {
            all_events.push((event_key(&e), e));
        }
        // Stop once at least one full batch has accumulated
        if all_events.len() >= batch_size {
            break;
        }
    }
    logger::log_info(&format!("Loaded {} events in batch", all_events.len())).await;
    Ok(all_events.into_iter())
}
/// Build a lightweight index instead of loading all events
#[derive(Debug, Clone)]
pub struct EventIndex {
    pub key: String,
    pub identity_key: String,
    pub date: String,
    pub chunk_file: std::path::PathBuf,
}
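/// Walk every chunk file once and record where each event lives, so later
/// lookups only need to deserialize a single chunk.
///
/// # Example
///
/// Sketch (`ignore`d; `chunks` comes from `scan_existing_chunks`):
///
/// ```ignore
/// let index = build_event_index(&chunks).await?;
/// println!("indexed {} events", index.len());
/// ```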
pub async fn build_event_index(chunks: &[ChunkInfo]) -> anyhow::Result<Vec<EventIndex>> {
    let mut index = Vec::new();
    for chunk in chunks {
        logger::log_info(&format!("Indexing chunk: {:?}", chunk.path.file_name())).await;
        let content = fs::read_to_string(&chunk.path).await?;
        let events: Vec<EconomicEvent> = serde_json::from_str(&content)?;
        for e in events {
            index.push(EventIndex {
                key: event_key(&e),
                identity_key: identity_key(&e),
                date: e.date.clone(),
                chunk_file: chunk.path.clone(),
            });
        }
    }
    logger::log_info(&format!("Built index with {} entries", index.len())).await;
    Ok(index)
}
/// Look up a specific event by loading only its chunk
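///
/// # Example
///
/// Sketch (`ignore`d; `index` comes from `build_event_index`, and `some_key`
/// is a hypothetical key as produced by `event_key`):
///
/// ```ignore
/// if let Some(event) = lookup_event_by_key(&some_key, &index).await? {
///     println!("found event on {}", event.date);
/// }
/// ```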
pub async fn lookup_event_by_key(key: &str, index: &[EventIndex]) -> anyhow::Result<Option<EconomicEvent>> {
    // Find which chunk contains this event
    let entry = index.iter().find(|e| e.key == key);
    if let Some(entry) = entry {
        // Load only that chunk
        let content = fs::read_to_string(&entry.chunk_file).await?;
        let events: Vec<EconomicEvent> = serde_json::from_str(&content)?;
        // Find the specific event
        Ok(events.into_iter().find(|e| event_key(e) == key))
    } else {
        Ok(None)
    }
}
/// Save events in smaller, more manageable chunks
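///
/// Deletes all existing `chunk_*.json` files first, then rewrites the full
/// event set sorted by date, `MAX_EVENTS_PER_FILE` events per file.
///
/// # Example
///
/// Sketch (`ignore`d; `all_events` is a hypothetical deduplicated
/// `Vec<EconomicEvent>` assembled by the caller):
///
/// ```ignore
/// save_optimized_chunks(&paths, all_events).await?;
/// ```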
pub async fn save_optimized_chunks(
    paths: &DataPaths,
    events: Vec<EconomicEvent>, // Changed from HashMap to Vec
) -> anyhow::Result<()> {
    let dir = paths.economic_events_dir();
    fs::create_dir_all(dir).await?;
    logger::log_info("Economic Storage: Removing old chunk files...").await;
    let mut entries = fs::read_dir(dir).await?;
    let mut removed_count = 0;
    while let Some(entry) = entries.next_entry().await? {
        let path = entry.path();
        if let Some(name) = path.file_name().and_then(|n| n.to_str()) {
            if name.starts_with("chunk_") && path.extension().map(|e| e == "json").unwrap_or(false) {
                fs::remove_file(&path).await?;
                removed_count += 1;
            }
        }
    }
    logger::log_info(&format!("Economic Storage: Removed {} old chunk files", removed_count)).await;
    let mut sorted = events;
    sorted.sort_by(|a, b| a.date.cmp(&b.date));
    // Save in smaller chunks
    let mut chunk_num = 0;
    for chunk in sorted.chunks(MAX_EVENTS_PER_FILE) {
        save_chunk_vec(chunk, dir, chunk_num).await?;
        chunk_num += 1;
        // Allow other tasks to run
        tokio::task::yield_now().await;
    }
    logger::log_info(&format!("Economic Storage: Saved {} chunks to {:?}", chunk_num, dir)).await;
    Ok(())
}
async fn save_chunk_vec(events: &[EconomicEvent], dir: &std::path::Path, chunk_num: usize) -> anyhow::Result<()> {
    if events.is_empty() {
        return Ok(());
    }
    let start = &events[0].date;
    let end = &events[events.len() - 1].date;
    let path = dir.join(format!("chunk_{:04}_{}_{}.json", chunk_num, start, end));
    // Serialize one chunk at a time; MAX_EVENTS_PER_FILE caps how large this
    // allocation can get, so no single write balloons memory.
    let json = serde_json::to_string_pretty(events)?;
    fs::write(&path, json).await?;
    logger::log_info(&format!("Economic Storage: Saved chunk {} - {} ({} events)", start, end, events.len())).await;
    Ok(())
}
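/// Append detected changes to per-month files named
/// `event_changes_MM_YYYY.json`, merging with whatever is already on disk.
///
/// # Example
///
/// Sketch (`ignore`d; `detected` is a hypothetical `Vec<EventChange>`
/// produced by diffing old and new events):
///
/// ```ignore
/// save_changes(&paths, &detected).await?;
/// ```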
pub async fn save_changes(paths: &DataPaths, changes: &[EventChange]) -> anyhow::Result<()> {
    if changes.is_empty() {
        logger::log_info("Economic Storage: No changes to save").await;
        return Ok(());
    }
    let dir = paths.economic_changes_dir();
    fs::create_dir_all(dir).await?;
    logger::log_info(&format!("Economic Storage: Saving {} changes to {:?}", changes.len(), dir)).await;
    // Group changes by month (MM_YYYY) so each file stays small
    let mut by_month: HashMap<String, Vec<EventChange>> = HashMap::new();
    for c in changes {
        if let Ok(d) = NaiveDate::parse_from_str(&c.date, "%Y-%m-%d") {
            let key = format!("{:02}_{}", d.month(), d.year());
            by_month.entry(key).or_default().push(c.clone());
        }
    }
    for (month, list) in by_month {
        let path = dir.join(format!("event_changes_{}.json", month));
        // Merge into the existing month file if one is already on disk
        let mut all: Vec<EventChange> = if path.exists() {
            let s = fs::read_to_string(&path).await?;
            serde_json::from_str(&s).unwrap_or_default()
        } else {
            vec![]
        };
        let added = list.len();
        all.extend(list);
        fs::write(&path, serde_json::to_string_pretty(&all)?).await?;
        logger::log_info(&format!("Economic Storage: Saved {} changes for month {}", added, month)).await;
    }
    logger::log_info("Economic Storage: All changes saved successfully").await;
    Ok(())
}