Files
WebScraper/src/economic/storage.rs

114 lines
4.2 KiB
Rust

// src/economic/storage.rs
use super::types::*;
use super::helpers::*;
use tokio::fs;
use chrono::{NaiveDate, Datelike};
use std::collections::HashMap;
/// Scans the `economic_events` directory for `chunk_*.json` files and returns
/// one `ChunkInfo` per parseable file, sorted by start date.
///
/// Filename layout is `chunk_YYYY-MM-DD_YYYY-MM-DD.json`; the two dates are
/// sliced out of the name by byte offset. Files whose name is too short (or
/// whose JSON fails to parse as `Vec<EconomicEvent>`) are skipped instead of
/// panicking — the original unchecked `name[6..16]` indexing would abort the
/// whole scan on a single malformed filename.
///
/// Returns an error only for directory-iteration failures; unreadable
/// individual files are silently ignored (best-effort scan).
pub async fn scan_existing_chunks() -> anyhow::Result<Vec<ChunkInfo>> {
    let dir = std::path::Path::new("economic_events");
    let mut chunks = Vec::new();
    if dir.exists() {
        let mut entries = fs::read_dir(dir).await?;
        while let Some(entry) = entries.next_entry().await? {
            let path = entry.path();
            if !path.extension().map(|e| e == "json").unwrap_or(false) {
                continue;
            }
            let Some(name) = path.file_name().and_then(|n| n.to_str()) else {
                continue;
            };
            if !name.starts_with("chunk_") {
                continue;
            }
            // Checked slicing: `get` returns None for out-of-range or
            // non-char-boundary ranges, so malformed names are skipped.
            let (Some(start), Some(end)) = (name.get(6..16), name.get(17..27)) else {
                continue;
            };
            let Ok(content) = fs::read_to_string(&path).await else {
                continue;
            };
            if let Ok(events) = serde_json::from_str::<Vec<EconomicEvent>>(&content) {
                chunks.push(ChunkInfo {
                    start_date: start.to_string(),
                    end_date: end.to_string(),
                    path,
                    event_count: events.len(),
                });
            }
        }
    }
    chunks.sort_by_key(|c| c.start_date.clone());
    Ok(chunks)
}
/// Loads every event from the given chunk files into a single map keyed by
/// `event_key`. When two events share a key, the one from the later chunk
/// (in slice order) wins — matching `HashMap::insert` overwrite semantics.
///
/// # Errors
/// Fails if any chunk file cannot be read or its JSON cannot be parsed as
/// `Vec<EconomicEvent>`.
pub async fn load_existing_events(chunks: &[ChunkInfo]) -> anyhow::Result<HashMap<String, EconomicEvent>> {
    let mut by_key = HashMap::new();
    for info in chunks {
        let raw = fs::read_to_string(&info.path).await?;
        let parsed: Vec<EconomicEvent> = serde_json::from_str(&raw)?;
        // `extend` over (key, value) pairs is equivalent to inserting each.
        by_key.extend(parsed.into_iter().map(|ev| (event_key(&ev), ev)));
    }
    Ok(by_key)
}
/// Rewrites the `economic_events` directory from scratch: deletes every
/// existing `chunk_*.json` file, then re-partitions the deduplicated event
/// map into date-sorted chunks of at most `MAX_EVENTS_PER_CHUNK` events each.
///
/// # Errors
/// Fails on any directory-creation, read, delete, or write error.
pub async fn save_optimized_chunks(events: HashMap<String, EconomicEvent>) -> anyhow::Result<()> {
    let dir = std::path::Path::new("economic_events");
    fs::create_dir_all(dir).await?;
    // Delete all old chunk files to prevent duplicates and overlaps
    println!("Removing old chunks...");
    let mut entries = fs::read_dir(dir).await?;
    while let Some(entry) = entries.next_entry().await? {
        let path = entry.path();
        if let Some(name) = path.file_name().and_then(|n| n.to_str()) {
            if name.starts_with("chunk_") && path.extension().map(|e| e == "json").unwrap_or(false) {
                fs::remove_file(&path).await?;
            }
        }
    }
    let mut sorted: Vec<_> = events.into_values().collect();
    sorted.sort_by_key(|e| e.date.clone());
    let mut chunk: Vec<EconomicEvent> = Vec::new();
    // Cap sized so a chunk stays around ~30000 pretty-printed JSON lines:
    // 2 lines of array brackets plus roughly 11 lines per serialized event.
    // The previous expression `(30000 / 2) / 11` (= 1363) contradicted its
    // own comment `(30000 - 2) / 11 = 2727`; the subtraction is the intent.
    const MAX_EVENTS_PER_CHUNK: usize = (30000 - 2) / 11; // = 2727
    for e in sorted {
        // Flush the current chunk before adding the event that would overfill it.
        // (len >= MAX implies non-empty, so no separate emptiness guard needed.)
        if chunk.len() >= MAX_EVENTS_PER_CHUNK {
            save_chunk(&chunk, dir).await?;
            chunk.clear();
        }
        chunk.push(e);
    }
    if !chunk.is_empty() {
        save_chunk(&chunk, dir).await?;
    }
    Ok(())
}
/// Writes `events` to `dir/chunk_<min-date>_<max-date>.json` (pretty-printed).
///
/// Both callers currently guard against empty slices, but an empty input is
/// handled as a no-op here too, rather than panicking on `min()/max()` of an
/// empty iterator.
async fn save_chunk(events: &[EconomicEvent], dir: &std::path::Path) -> anyhow::Result<()> {
    if events.is_empty() {
        return Ok(());
    }
    // min/max over dates — the slice is usually already date-sorted by the
    // caller, but don't rely on that here.
    let start = events.iter().map(|e| &e.date).min().expect("slice is non-empty").clone();
    let end = events.iter().map(|e| &e.date).max().expect("slice is non-empty").clone();
    let path = dir.join(format!("chunk_{}_{}.json", start, end));
    fs::write(&path, serde_json::to_string_pretty(events)?).await?;
    Ok(())
}
/// Appends the given changes to per-month history files in
/// `economic_event_changes/event_changes_MM_YYYY.json`.
///
/// Changes are bucketed by the month/year parsed from each change's
/// `YYYY-MM-DD` date string; changes with an unparseable date are dropped.
/// An existing file that fails to parse is treated as empty and overwritten
/// (best-effort history, never a hard failure on corruption).
///
/// # Errors
/// Fails on directory-creation, read, serialization, or write errors.
pub async fn save_changes(changes: &[EventChange]) -> anyhow::Result<()> {
    if changes.is_empty() {
        return Ok(());
    }
    let dir = std::path::Path::new("economic_event_changes");
    fs::create_dir_all(dir).await?;
    // Bucket changes under a "MM_YYYY" key derived from each change's date.
    let mut buckets: HashMap<String, Vec<EventChange>> = HashMap::new();
    for change in changes {
        let Ok(parsed) = NaiveDate::parse_from_str(&change.date, "%Y-%m-%d") else {
            continue;
        };
        buckets
            .entry(format!("{:02}_{}", parsed.month(), parsed.year()))
            .or_default()
            .push(change.clone());
    }
    // Merge each bucket into its monthly file, creating the file if absent.
    for (month_key, mut bucket) in buckets {
        let path = dir.join(format!("event_changes_{}.json", month_key));
        let mut merged: Vec<EventChange> = if path.exists() {
            let body = fs::read_to_string(&path).await?;
            serde_json::from_str(&body).unwrap_or_default()
        } else {
            Vec::new()
        };
        merged.append(&mut bucket);
        fs::write(&path, serde_json::to_string_pretty(&merged)?).await?;
    }
    Ok(())
}