added data streaming instead of loading
@@ -6,6 +6,10 @@ use crate::util::logger;
use tokio::fs;
use chrono::{NaiveDate, Datelike};
use std::collections::HashMap;
use serde_json;

const CHUNK_SIZE: usize = 500; // Process 500 events at a time
const MAX_EVENTS_PER_FILE: usize = 3000;

pub async fn scan_existing_chunks(paths: &DataPaths) -> anyhow::Result<Vec<ChunkInfo>> {
    let dir = paths.economic_events_dir();
@@ -18,37 +22,122 @@ pub async fn scan_existing_chunks(paths: &DataPaths) -> anyhow::Result<Vec<Chunk
            if path.extension().map(|e| e == "json").unwrap_or(false) {
                if let Some(name) = path.file_name().and_then(|n| n.to_str()) {
                    if name.starts_with("chunk_") {
                        if let Some(content) = fs::read_to_string(&path).await.ok() {
                            if let Ok(events) = serde_json::from_str::<Vec<EconomicEvent>>(&content) {
                                let start = name[6..16].to_string();
                                let end = name[17..27].to_string();
                                chunks.push(ChunkInfo { start_date: start, end_date: end, path, event_count: events.len() });
                            }
                        }
                        // Don't load the events here, just record the chunk info
                        let start = name[6..16].to_string();
                        let end = name[17..27].to_string();
                        chunks.push(ChunkInfo {
                            start_date: start,
                            end_date: end,
                            path,
                            event_count: 0 // We'll count later if needed
                        });
                    }
                }
            }
        }
    }
    chunks.sort_by_key(|c| c.start_date.clone());
    logger::log_info(&format!("Economic Storage: Scanned {} event chunks", chunks.len())).await;
    logger::log_info(&format!("Economic Storage: Found {} event chunks", chunks.len())).await;
    Ok(chunks)
}

pub async fn load_existing_events(chunks: &[ChunkInfo]) -> anyhow::Result<HashMap<String, EconomicEvent>> {
    let mut map = HashMap::new();
    for chunk in chunks {
        let content = fs::read_to_string(&chunk.path).await?;
        let events: Vec<EconomicEvent> = serde_json::from_str(&content)?;
        for e in events {
            map.insert(event_key(&e), e);
        }
/// Stream events from a single chunk file
pub async fn stream_chunk_events(
    chunk: &ChunkInfo,
    callback: impl Fn(EconomicEvent) -> anyhow::Result<()>
) -> anyhow::Result<usize> {
    let content = fs::read_to_string(&chunk.path).await?;
    let events: Vec<EconomicEvent> = serde_json::from_str(&content)?;
    let count = events.len();

    for event in events {
        callback(event)?;
    }
    logger::log_info(&format!("Economic Storage: Loaded {} events from {} chunks", map.len(), chunks.len())).await;
    Ok(map)

    Ok(count)
}

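// Not part of this commit: a minimal usage sketch for stream_chunk_events, assuming
// `chunks` comes from scan_existing_chunks and `today_str` is a YYYY-MM-DD string.
// Because the callback bound is `Fn` (not `FnMut`), accumulation goes through an
// AtomicUsize rather than a captured `mut` counter.
use std::sync::atomic::{AtomicUsize, Ordering};

async fn count_upcoming_events(chunks: &[ChunkInfo], today_str: &str) -> anyhow::Result<usize> {
    let upcoming = AtomicUsize::new(0);
    for chunk in chunks {
        // Each chunk is deserialized, streamed through the callback, then dropped.
        stream_chunk_events(chunk, |event| {
            if event.date.as_str() >= today_str {
                upcoming.fetch_add(1, Ordering::Relaxed);
            }
            Ok(())
        })
        .await?;
    }
    Ok(upcoming.load(Ordering::Relaxed))
}
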
pub async fn save_optimized_chunks(paths: &DataPaths, events: HashMap<String, EconomicEvent>) -> anyhow::Result<()> {
/// Load events in batches to avoid memory explosion
pub async fn load_events_in_batches(
    chunks: &[ChunkInfo],
    batch_size: usize,
) -> anyhow::Result<impl Iterator<Item = (String, EconomicEvent)>> {
    let mut all_events = Vec::new();

    for chunk in chunks {
        logger::log_info(&format!("Loading chunk: {:?}", chunk.path.file_name())).await;

        let content = fs::read_to_string(&chunk.path).await?;
        let events: Vec<EconomicEvent> = serde_json::from_str(&content)?;

        for e in events {
            all_events.push((event_key(&e), e));
        }

        // If we've accumulated enough, yield them
        if all_events.len() >= batch_size {
            break;
        }
    }

    logger::log_info(&format!("Loaded {} events in batch", all_events.len())).await;
    Ok(all_events.into_iter())
}

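// Not part of this commit: a minimal sketch of consuming load_events_in_batches.
// Note that the implementation above stops reading once `batch_size` entries have been
// collected, so a single call yields only the leading chunks, not the whole dataset.
async fn log_first_batch(chunks: &[ChunkInfo]) -> anyhow::Result<()> {
    let batch = load_events_in_batches(chunks, CHUNK_SIZE).await?;
    for (key, event) in batch {
        // `key` comes from event_key(&event); `event.date` is the YYYY-MM-DD date string.
        logger::log_info(&format!("batch event {} on {}", key, event.date)).await;
    }
    Ok(())
}
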
/// NEW: Build a lightweight index instead of loading all events
#[derive(Debug, Clone)]
pub struct EventIndex {
    pub key: String,
    pub identity_key: String,
    pub date: String,
    pub chunk_file: std::path::PathBuf,
}

pub async fn build_event_index(chunks: &[ChunkInfo]) -> anyhow::Result<Vec<EventIndex>> {
    let mut index = Vec::new();

    for chunk in chunks {
        logger::log_info(&format!("Indexing chunk: {:?}", chunk.path.file_name())).await;

        let content = fs::read_to_string(&chunk.path).await?;
        let events: Vec<EconomicEvent> = serde_json::from_str(&content)?;

        for e in events {
            index.push(EventIndex {
                key: event_key(&e),
                identity_key: identity_key(&e),
                date: e.date.clone(),
                chunk_file: chunk.path.clone(),
            });
        }
    }

    logger::log_info(&format!("Built index with {} entries", index.len())).await;
    Ok(index)
}

/// NEW: Look up a specific event by loading only its chunk
pub async fn lookup_event_by_key(key: &str, index: &[EventIndex]) -> anyhow::Result<Option<EconomicEvent>> {
    // Find which chunk contains this event
    let entry = index.iter().find(|e| e.key == key);

    if let Some(entry) = entry {
        // Load only that chunk
        let content = fs::read_to_string(&entry.chunk_file).await?;
        let events: Vec<EconomicEvent> = serde_json::from_str(&content)?;

        // Find the specific event
        Ok(events.into_iter().find(|e| event_key(e) == key))
    } else {
        Ok(None)
    }
}

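// Not part of this commit: a minimal sketch combining the index helpers, assuming
// `paths` is an existing DataPaths. Only the chunk file that actually contains the
// event is deserialized during the lookup.
async fn lookup_first_indexed_event(paths: &DataPaths) -> anyhow::Result<()> {
    let chunks = scan_existing_chunks(paths).await?;
    let index = build_event_index(&chunks).await?;

    if let Some(entry) = index.first() {
        if let Some(event) = lookup_event_by_key(&entry.key, &index).await? {
            logger::log_info(&format!("Resolved {} on {}", entry.key, event.date)).await;
        }
    }
    Ok(())
}
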
/// Save events in smaller, more manageable chunks
pub async fn save_optimized_chunks(
    paths: &DataPaths,
    events: Vec<EconomicEvent> // Changed from HashMap to Vec
) -> anyhow::Result<()> {
    let dir = paths.economic_events_dir();
    fs::create_dir_all(dir).await?;

@@ -67,31 +156,36 @@ pub async fn save_optimized_chunks(paths: &DataPaths, events: HashMap<String, Ec
    }
    logger::log_info(&format!("Economic Storage: Removed {} old chunk files", removed_count)).await;

    let mut sorted: Vec<_> = events.into_values().collect();
    sorted.sort_by_key(|e| e.date.clone());
    let mut sorted = events;
    sorted.sort_by(|a, b| a.date.cmp(&b.date));

    let mut chunk: Vec<EconomicEvent> = Vec::new();
    const MAX_EVENTS_PER_CHUNK: usize = (30000 / 2) / 11; // (30000 / 2) / 11 = 1363

    for e in sorted {
        if !chunk.is_empty() && chunk.len() >= MAX_EVENTS_PER_CHUNK {
            save_chunk(&chunk, dir).await?;
            chunk.clear();
        }
        chunk.push(e);
    // Save in smaller chunks
    let mut chunk_num = 0;
    for chunk in sorted.chunks(MAX_EVENTS_PER_FILE) {
        save_chunk_vec(chunk, dir, chunk_num).await?;
        chunk_num += 1;

        // Allow other tasks to run
        tokio::task::yield_now().await;
    }
    if !chunk.is_empty() {
        save_chunk(&chunk, dir).await?;
    }
    logger::log_info(&format!("Economic Storage: Saved all event chunks to {:?}", dir)).await;

    logger::log_info(&format!("Economic Storage: Saved {} chunks to {:?}", chunk_num, dir)).await;
    Ok(())
}

async fn save_chunk(events: &[EconomicEvent], dir: &std::path::Path) -> anyhow::Result<()> {
    let start = events.iter().map(|e| &e.date).min().unwrap().clone();
    let end = events.iter().map(|e| &e.date).max().unwrap().clone();
    let path = dir.join(format!("chunk_{}_{}.json", start, end));
    fs::write(&path, serde_json::to_string_pretty(events)?).await?;
async fn save_chunk_vec(events: &[EconomicEvent], dir: &std::path::Path, chunk_num: usize) -> anyhow::Result<()> {
    if events.is_empty() {
        return Ok(());
    }

    let start = &events[0].date;
    let end = &events[events.len() - 1].date;
    let path = dir.join(format!("chunk_{:04}_{}_{}.json", chunk_num, start, end));

    // Write incrementally to avoid large memory allocation
    let json = serde_json::to_string_pretty(events)?;
    fs::write(&path, json).await?;

    logger::log_info(&format!("Economic Storage: Saved chunk {} - {} ({} events)", start, end, events.len())).await;
    Ok(())
}

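// Not part of this commit: a minimal sketch of the new save path. Callers that still
// hold a HashMap<String, EconomicEvent> (e.g. from process_batch) collect it into a
// Vec before calling save_optimized_chunks, which now takes Vec<EconomicEvent>.
async fn save_from_map(paths: &DataPaths, map: HashMap<String, EconomicEvent>) -> anyhow::Result<()> {
    let events: Vec<EconomicEvent> = map.into_values().collect();
    save_optimized_chunks(paths, events).await
}
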
@@ -3,15 +3,9 @@ use super::{scraper::*, storage::*, helpers::*, types::*};
use crate::{config::Config, scraper::webdriver::{ScrapeTask, ChromeDriverPool}, util::directories::DataPaths, util::logger};
use chrono::{Local};
use std::sync::Arc;
use std::collections::HashMap;

/// Runs the full update for economic data, using the provided ChromeDriver pool.
///
/// # Arguments
/// * `config` - The application configuration.
/// * `pool` - Shared pool of ChromeDriver instances for scraping.
///
/// # Errors
/// Returns an error if scraping, loading, or saving fails.
/// Runs the full update for economic data using streaming to minimize memory usage
pub async fn run_full_update(config: &Config, pool: &Arc<ChromeDriverPool>) -> anyhow::Result<()> {
    let paths = DataPaths::new(".")?;

@@ -20,81 +14,124 @@ pub async fn run_full_update(config: &Config, pool: &Arc<ChromeDriverPool>) -> a
    let today_str = chrono::Local::now().date_naive().format("%Y-%m-%d").to_string();
    let end_date = config.target_end_date();

    logger::log_info(&format!("Economic Update: Scanning existing chunks from {:?}", paths.economic_events_dir())).await;
    // Step 1: Build lightweight index instead of loading all events
    logger::log_info("Economic Update: Building event index...").await;
    let chunks = scan_existing_chunks(&paths).await?;
    let mut events = load_existing_events(&chunks).await?;
    let event_index = build_event_index(&chunks).await?;

    let msg = format!("Economic Update: Loaded {} events from {} chunks", events.len(), chunks.len());
    println!("{}", msg);
    logger::log_info(&msg).await;
    logger::log_info(&format!("Economic Update: Indexed {} events from {} chunks",
        event_index.len(), chunks.len())).await;

    let start_date = if events.is_empty() {
    // Step 2: Determine start date
    let start_date = if event_index.is_empty() {
        logger::log_warn("Economic Update: No existing events found, starting from config date").await;
        config.economic_start_date.clone()
    } else if events.values().any(|e| e.date >= today_str) {
        logger::log_info("Economic Update: Events exist for today, starting from today").await;
        today_str.clone()
    } else {
        let next = events.values()
            .filter_map(|e| chrono::NaiveDate::parse_from_str(&e.date, "%Y-%m-%d").ok())
        // Find the latest date in the index
        let max_date = event_index.iter()
            .map(|e| &e.date)
            .max()
            .and_then(|d| d.succ_opt())
            .map(|d| d.format("%Y-%m-%d").to_string())
            .cloned()
            .unwrap_or(today_str.clone());
        logger::log_info(&format!("Economic Update: Resuming from: {}", next)).await;
        next

        if max_date >= today_str {
            logger::log_info("Economic Update: Events exist for today, starting from today").await;
            today_str.clone()
        } else {
            let next = chrono::NaiveDate::parse_from_str(&max_date, "%Y-%m-%d")
                .ok()
                .and_then(|d| d.succ_opt())
                .map(|d| d.format("%Y-%m-%d").to_string())
                .unwrap_or(today_str.clone());
            logger::log_info(&format!("Economic Update: Resuming from: {}", next)).await;
            next
        }
    };

let msg = format!("Economic Update: Scraping events from {} → {}", start_date, end_date);
|
||||
println!("{}", msg);
|
||||
logger::log_info(&msg).await;
|
||||
logger::log_info(&format!("Economic Update: Scraping events from {} → {}", start_date, end_date)).await;
|
||||
|
||||
// Pass the pool to the scraping function
|
||||
let new_events_all = scrape_all_economic_events(&start_date, &end_date, pool).await?;
|
||||
// Step 3: Scrape new events in batches
|
||||
let new_events = scrape_all_economic_events(&start_date, &end_date, pool).await?;
|
||||
|
||||
let msg = format!("Economic Update: Scraped {} new events", new_events_all.len());
|
||||
println!("{}", msg);
|
||||
logger::log_info(&msg).await;
|
||||
logger::log_info(&format!("Economic Update: Scraped {} new events", new_events.len())).await;
|
||||
|
||||
// Process all at once or in batches
|
||||
let result = process_batch(&new_events_all, &mut events, &today_str);
|
||||
let total_changes = result.changes.len();
|
||||
// Step 4: Process events in streaming fashion
|
||||
let (changes, updated_events) = process_events_streaming(&chunks, &new_events, &today_str).await?;
|
||||
|
||||
let msg = format!("Economic Update: Detected {} changes", total_changes);
|
||||
println!("{}", msg);
|
||||
logger::log_info(&msg).await;
|
||||
logger::log_info(&format!("Economic Update: Detected {} changes", changes.len())).await;
|
||||
|
||||
if total_changes > 0 {
|
||||
logger::log_info(&format!("Economic Update: Saving {} changes to log", total_changes)).await;
|
||||
save_changes(&paths, &result.changes).await?;
|
||||
if !changes.is_empty() {
|
||||
logger::log_info(&format!("Economic Update: Saving {} changes to log", changes.len())).await;
|
||||
save_changes(&paths, &changes).await?;
|
||||
logger::log_info("Economic Update: Changes saved successfully").await;
|
||||
}
|
||||
|
||||
logger::log_info(&format!("Economic Update: Saving {} total events to chunks", events.len())).await;
|
||||
save_optimized_chunks(&paths, events).await?;
|
||||
// Step 5: Save consolidated events
|
||||
logger::log_info(&format!("Economic Update: Saving {} total events to chunks", updated_events.len())).await;
|
||||
save_optimized_chunks(&paths, updated_events).await?;
|
||||
|
||||
let msg = format!("✓ Economic update complete — {} changes detected", total_changes);
|
||||
println!("{}", msg);
|
||||
logger::log_info(&msg).await;
|
||||
logger::log_info(&format!("✓ Economic update complete — {} changes detected", changes.len())).await;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Scrapes all economic events from start to end date using a dedicated ScrapeTask with the provided pool.
///
/// This function creates a ScrapeTask to navigate to the Finanzen.net page, prepare it,
/// and then loop through date ranges to extract events.
///
/// # Arguments
/// * `start` - Start date in YYYY-MM-DD.
/// * `end` - End date in YYYY-MM-DD.
/// * `pool` - Shared pool of ChromeDriver instances.
///
/// # Returns
/// A vector of all extracted EconomicEvent structs.
///
/// # Errors
/// Returns an error if task execution fails or extraction issues occur.
pub async fn scrape_all_economic_events(start: &str, end: &str, pool: &Arc<ChromeDriverPool>) -> anyhow::Result<Vec<EconomicEvent>> {
/// Process events using streaming to minimize memory usage
async fn process_events_streaming(
    chunks: &[ChunkInfo],
    new_events: &[EconomicEvent],
    today: &str,
) -> anyhow::Result<(Vec<EventChange>, Vec<EconomicEvent>)> {
    let mut all_changes = Vec::new();
    let mut final_events: HashMap<String, EconomicEvent> = HashMap::new();

    // Step 1: Load existing events in batches
    logger::log_info("Processing existing events in batches...").await;

    for chunk in chunks {
        logger::log_info(&format!("Loading chunk: {:?}", chunk.path.file_name())).await;

        let content = tokio::fs::read_to_string(&chunk.path).await?;
        let events: Vec<EconomicEvent> = serde_json::from_str(&content)?;

        // Add to final events map
        for e in events {
            final_events.insert(event_key(&e), e);
        }

        // Clear memory periodically
        if final_events.len() > 10000 {
            logger::log_info(&format!("Loaded {} events so far...", final_events.len())).await;
        }
    }

    logger::log_info(&format!("Loaded {} existing events total", final_events.len())).await;

    // Step 2: Process new events in batches
    logger::log_info("Processing new events...").await;

    for (idx, batch) in new_events.chunks(500).enumerate() {
        logger::log_info(&format!("Processing batch {} ({} events)", idx + 1, batch.len())).await;

        let batch_result = process_batch(batch, &mut final_events, today);
        all_changes.extend(batch_result.changes);

        // Yield to prevent blocking
        tokio::task::yield_now().await;
    }

    logger::log_info(&format!("Processing complete. Total events: {}", final_events.len())).await;

    // Convert HashMap to Vec for saving
    let events_vec: Vec<EconomicEvent> = final_events.into_values().collect();

    Ok((all_changes, events_vec))
}

/// Scrapes all economic events from start to end date
pub async fn scrape_all_economic_events(
    start: &str,
    end: &str,
    pool: &Arc<ChromeDriverPool>
) -> anyhow::Result<Vec<EconomicEvent>> {
    let url = "https://www.finanzen.net/termine/wirtschaftsdaten/".to_string();
    let start_clone = start.to_string();
    let end_clone = end.to_string();
@@ -108,9 +145,18 @@ pub async fn scrape_all_economic_events(start: &str, end: &str, pool: &Arc<Chrom
            set_date_range(&client, &current, &end_clone).await?;
            tokio::time::sleep(tokio::time::Duration::from_secs(3)).await;
            let new_events = extract_events(&client).await?;
            if new_events.is_empty() { break; }

            if new_events.is_empty() {
                break;
            }

            all_events.extend(new_events.clone());

            // Prevent memory buildup - process in chunks if too large
            if all_events.len() > 5000 {
                logger::log_info(&format!("Scraped {} events so far, continuing...", all_events.len())).await;
            }

            let next = new_events.iter()
                .filter_map(|e| chrono::NaiveDate::parse_from_str(&e.date, "%Y-%m-%d").ok())
                .max()
@@ -121,16 +167,17 @@ pub async fn scrape_all_economic_events(start: &str, end: &str, pool: &Arc<Chrom
            if next > end_clone { break; }
            current = next;
        }

        Ok(all_events)
    });

    // Use the pool for execution
    task.execute_with_pool(pool).await
}

/// Process a batch of events and detect changes
pub fn process_batch(
    new_events: &[EconomicEvent],
    existing: &mut std::collections::HashMap<String, EconomicEvent>,
    existing: &mut HashMap<String, EconomicEvent>,
    today: &str,
) -> ScrapeResult {
    let mut changes = Vec::new();