added data streaming instead of loading

2025-12-12 10:54:01 +01:00
parent 1bda78897b
commit 00c9d45642
7 changed files with 888 additions and 398 deletions
--- a/src/corporate/update.rs
+++ b/src/corporate/update.rs
@@ -9,161 +9,234 @@ use chrono::Local;
 use std::collections::{HashMap};
 use std::sync::Arc;

-/// Main function: Full update for all companies (LEI-based) with optimized parallel execution.
-///
-/// This function coordinates the entire update process:
-/// - Loads GLEIF mappings
-/// - Builds FIGI-LEI map
-/// - Loads existing events
-/// - Processes each company: discovers exchanges via FIGI, fetches prices & earnings, aggregates data
-/// - Uses the provided shared ChromeDriver pool for efficient parallel scraping
-/// - Saves optimized events
-///
-/// # Arguments
-/// * `config` - The application configuration.
-/// * `pool` - Shared pool of ChromeDriver instances for scraping.
-///
-/// # Errors
-/// Returns an error if any step in the update process fails.
+/// Main function: full update for all companies, restructured so that large
+/// intermediate results are written to / read from files instead of being kept
+/// in memory for the whole run.
+///
+/// Steps performed:
+/// 1. Download/locate the GLEIF ISIN ↔ LEI CSV; the update stops early (with `Ok`) if none is available.
+/// 2. Load the OpenFIGI type lists (a failure is only logged).
+/// 3. Stream the CSV into a LEI → ISINs map.
+/// 4. Build the FIGI mappings from that map (a failure is only logged), then release the map.
+/// 5. Load or build the securities from the most recent FIGI date directory.
+/// 6. Write one JSON line per company (name plus ISIN → ticker pairs) to `companies.jsonl`.
+/// 7. Build the event index.
+///
+/// # Arguments
+/// * `config` - The application configuration (not read by this function at present).
+/// * `pool` - Shared pool of ChromeDriver instances (not used by this function at present).
+///
+/// # Errors
+/// Returns an error if the data paths cannot be initialised, the GLEIF CSV download or
+/// streaming fails, the FIGI cache directory or securities cannot be read, writing
+/// `companies.jsonl` fails, or the event index cannot be built.
 pub async fn run_full_update(config: &Config, pool: &Arc<ChromeDriverPool>) -> anyhow::Result<()> {
-    let msg = "=== Starting LEI-based corporate full update ===";
+    let msg = "=== Starting LEI-based corporate full update (STREAMING) ===";
    println!("{}", msg);
    logger::log_info(msg).await;

-    // Initialize paths
    let paths = DataPaths::new(".")?;

-    // 1. Load fresh GLEIF ISIN ↔ LEI mapping
-    logger::log_info("Corporate Update: Loading GLEIF ISIN ↔ LEI mapping...").await;
-    let lei_to_isins: HashMap<String, Vec<String>> = match load_isin_lei_csv().await {
-        Ok(map) => {
-            let msg = format!("Corporate Update: Loaded GLEIF mapping with {} LEI entries", map.len());
-            println!("{}", msg);
-            logger::log_info(&msg).await;
-            map
+    // Step 1: Download/locate GLEIF CSV (don't load into memory yet)
+    logger::log_info("Corporate Update: Downloading/locating GLEIF CSV...").await;
+    let gleif_csv_path = match download_isin_lei_csv().await? {
+        Some(p) => {
+            logger::log_info(&format!("Corporate Update: GLEIF CSV at: {}", p)).await;
+            p
        }
-        Err(e) => {
-            let msg = format!("Corporate Update: Warning - Could not load GLEIF ISIN↔LEI mapping: {}", e);
-            eprintln!("{}", msg);
-            logger::log_warn(&msg).await;
-            HashMap::new()
+        None => {
+            // The mapping steps below start from this CSV, so the update stops here.
+            // The message now states that instead of claiming the run continues.
+            logger::log_warn("Corporate Update: Could not obtain GLEIF CSV, skipping corporate update").await;
+            return Ok(());
        }
    };

-    // 2. Load OpenFIGI mapping value lists (cached)
+    // Step 2: Load OpenFIGI type lists (small, cached)
    logger::log_info("Corporate Update: Loading OpenFIGI type lists...").await;
    if let Err(e) = load_figi_type_lists().await {
-        let msg = format!("Corporate Update: Warning - Could not load OpenFIGI type lists: {}", e);
-        eprintln!("{}", msg);
-        logger::log_warn(&msg).await;
+        logger::log_warn(&format!("Could not load OpenFIGI type lists: {}", e)).await;
    }
-    logger::log_info("Corporate Update: OpenFIGI type lists loaded").await;

-    // 3. Build FIGI → LEI map
-    logger::log_info("Corporate Update: Building FIGI → LEI map...").await;
-    let figi_to_lei:HashMap<String, Vec<FigiInfo>> = match build_lei_to_figi_infos(&lei_to_isins, None).await {
-        Ok(map) => {
-            let msg = format!("Corporate Update: Built FIGI map with {} entries", map.len());
-            println!("{}", msg);
-            logger::log_info(&msg).await;
-            map
-        }
-        Err(e) => {
-            let msg = format!("Corporate Update: Warning - Could not build FIGI→LEI map: {}", e);
-            eprintln!("{}", msg);
-            logger::log_warn(&msg).await;
-            HashMap::new()
-        }
+    // Step 3: Build the LEI→ISINs map by streaming the CSV.
+    // NOTE(review): the callback receives one (LEI, ISIN) pair per call; the resulting
+    // map is still held in memory until Step 4 has finished.
+    logger::log_info("Corporate Update: Building FIGI mappings (streaming)...").await;
+
+    let mut lei_to_isins: HashMap<String, Vec<String>> = HashMap::new();
+    stream_gleif_csv(&gleif_csv_path, |lei, isin| {
+        lei_to_isins.entry(lei).or_default().push(isin);
+        Ok(())
+    }).await?;
+
+    logger::log_info(&format!("Corporate Update: Collected {} LEIs", lei_to_isins.len())).await;
+
+    // Step 4: Build FIGI mappings. The returned value is not used afterwards
+    // (presumably the mappings are persisted as JSONL by the builder — TODO confirm),
+    // so it is dropped right away. A failure is logged rather than silently discarded.
+    logger::log_info("Corporate Update: Processing FIGI mappings in batches...").await;
+    match build_lei_to_figi_infos(&lei_to_isins, None).await {
+        Ok(figi_infos) => {
+            drop(figi_infos);
+            logger::log_info("Corporate Update: FIGI mappings saved to cache").await;
+        }
+        Err(e) => {
+            logger::log_warn(&format!("Corporate Update: Warning - Could not build FIGI→LEI map: {}", e)).await;
+        }
+    }
+    drop(lei_to_isins); // Not needed once the FIGI mappings have been built
+
+    // Step 5: Load or build securities (streaming from JSONL files)
+    logger::log_info("Corporate Update: Building securities map (streaming)...").await;
+
+    // Reuse `paths` rather than constructing a second DataPaths for the same root.
+    let map_cache_dir = paths.cache_gleif_openfigi_map_dir();
+
+    // Find the most recent date directory
+    let date_dir = find_most_recent_date_dir(&map_cache_dir).await?;
+
+    let (common_stocks, _warrants, _options) = if let Some(date_dir) = date_dir {
+        logger::log_info(&format!("Using FIGI data from: {:?}", date_dir)).await;
+        load_or_build_all_securities_streaming(&date_dir).await?
+    } else {
+        logger::log_warn("No FIGI date directory found, using empty maps").await;
+        (HashMap::new(), HashMap::new(), HashMap::new())
    };

-    // 4. Load or build companies
-    logger::log_info("Corporate Update: Loading/building company securities...").await;
-    let securities = load_or_build_all_securities(&figi_to_lei).await?;
-    let msg = format!("Corporate Update: Processing {} companies", securities.0.len());
-    println!("{}", msg);
-    logger::log_info(&msg).await;
+    logger::log_info(&format!("Corporate Update: Processing {} companies", common_stocks.len())).await;

-    // HashMap<Name, HashMap<ISIN, Ticker>> - unique pairs only
-    let companies: HashMap<String, HashMap<String, String>> = securities.0
-        .iter()
-        .fold(HashMap::new(), |mut acc, security| {
-            let mut isin_ticker_pairs: HashMap<String, String> = HashMap::new();
-            
-            // Collect all unique ISIN-Ticker pairs
-            for figi_infos in security.1.securities.values() {
-                for figi_info in figi_infos {
-                    if !figi_info.isin.is_empty() && !figi_info.ticker.is_empty() {
-                        isin_ticker_pairs.insert(figi_info.isin.clone(), figi_info.ticker.clone());
-                    }
+    // Step 6: Convert to simplified companies records and write them out one line at a time
+    logger::log_info("Corporate Update: Building companies JSONL (streaming)...").await;
+
+    let companies_path = paths.data_dir().join("companies.jsonl");
+
+    // Make sure the target directory exists before creating the file
+    if let Some(parent) = companies_path.parent() {
+        tokio::fs::create_dir_all(parent).await?;
+    }
+
+    // Function-scope import: needed by the writes inside the loop and by the final flush.
+    use tokio::io::AsyncWriteExt;
+
+    // Buffer the many small per-company writes instead of issuing one file operation each.
+    let mut file = tokio::io::BufWriter::new(tokio::fs::File::create(&companies_path).await?);
+    let mut processed = 0;
+
+    for (name, company_info) in common_stocks.iter() {
+        // ISIN → ticker; inserting into a map keeps one ticker per ISIN
+        let mut isin_ticker_pairs: HashMap<String, String> = HashMap::new();
+
+        for figi_infos in company_info.securities.values() {
+            for figi_info in figi_infos {
+                if !figi_info.isin.is_empty() && !figi_info.ticker.is_empty() {
+                    isin_ticker_pairs.insert(figi_info.isin.clone(), figi_info.ticker.clone());
                }
            }
+        }
+
+        // Companies without any usable ISIN/ticker pair are not written
+        if !isin_ticker_pairs.is_empty() {
            
-            // Only add if there are pairs
-            if !isin_ticker_pairs.is_empty() {
-                acc.insert(security.1.name.clone(), isin_ticker_pairs);
+            let line = serde_json::json!({
+                "name": name,
+                "securities": isin_ticker_pairs
+            });
+
+            file.write_all(line.to_string().as_bytes()).await?;
+            file.write_all(b"\n").await?;
+            processed += 1;
+
+            // Yield periodically
+            if processed % 100 == 0 {
+                tokio::task::yield_now().await;
+                logger::log_info(&format!("Saved {} companies so far...", processed)).await;
            }
-            acc
-        });
+        }
+    }
+
+    // Flush explicitly: buffered data is not written on drop, and a write error
+    // would otherwise go unnoticed.
+    file.flush().await?;
    
-    logger::log_info(&format!("Corporate Update: Saving {} companies to JSONL", companies.len())).await;
-    save_companies_to_jsonl(&paths, &companies).await.expect("Failed to save companies List.");
-    logger::log_info("Corporate Update: Companies saved successfully").await;
+    logger::log_info(&format!("Corporate Update: Saved {} companies to JSONL", processed)).await;

-    // 5. Load existing earnings events (for change detection)
-    logger::log_info("Corporate Update: Loading existing events...").await;
-    let existing_events = match load_existing_events(&paths).await {
-        Ok(events) => {
-            let msg = format!("Corporate Update: Loaded {} existing events", events.len());
-            println!("{}", msg);
-            logger::log_info(&msg).await;
-            events
-        }
-        Err(e) => {
-            let msg = format!("Corporate Update: Warning - Could not load existing events: {}", e);
-            eprintln!("{}", msg);
-            logger::log_warn(&msg).await;
-            HashMap::new()
-        }
-    };
+    // Step 7: Build the event index
+    logger::log_info("Corporate Update: Processing events (streaming)...").await;
+
+    let event_index = build_event_index(&paths).await?;
+    logger::log_info(&format!("Corporate Update: Built index of {} events", event_index.len())).await;
+
+    // For now the index is only built and counted; no events are updated here.
+    // In a full implementation, you'd stream through tickers and update events
+
+    // Step 8: Nothing further is saved at present; only the completion is logged
+    logger::log_info("Corporate Update: Finalizing...").await;

-    // 5. Use the provided pool (no need to create a new one)
-    let pool_size = pool.get_number_of_instances(); // Use the size from the shared pool
-    logger::log_info(&format!("Corporate Update: Using pool size: {}", pool_size)).await;
-
-    // Process companies in parallel using the shared pool
-    /*let results: Vec<_> = stream::iter(companies.into_iter())
-        .map(|company| {
-            let pool_clone = pool.clone();
-            async move {
-                process_company_data(&company, &pool_clone, &mut existing_events).await
-            }
-        })
-        .buffer_unordered(pool_size)
-        .collect().await;
-
-    // Handle results (e.g., collect changes)
-    let mut all_changes = Vec::new();
-    for result in results {
-        if let Ok(ProcessResult { changes }) = result {
-            all_changes.extend(changes);
-        }
-    }*/
-
-    logger::log_info(&format!("Corporate Update: Saving {} events to optimized storage", existing_events.len())).await;
-    save_optimized_events(&paths, existing_events).await?;
-    logger::log_info("Corporate Update: Events saved successfully").await;
-    //save_changes(&all_changes).await?;
-
-    let msg = "✓ Corporate update complete";
+    let msg = "✓ Corporate update complete (streaming)";
    println!("{}", msg);
    logger::log_info(msg).await;
    Ok(())
 }

+/// Finds the most recent date-named subdirectory of the FIGI cache directory.
+///
+/// Only subdirectories whose names consist of exactly eight ASCII digits are
+/// considered. The names are interpreted as `DDMMYYYY` and compared
+/// chronologically (year, then month, then day); comparing the raw names as
+/// strings would rank by day-of-month first and pick the wrong directory.
+///
+/// # Arguments
+/// * `map_cache_dir` - Directory expected to contain one subdirectory per date.
+///
+/// # Returns
+/// `Ok(Some(path))` for the newest date directory, or `Ok(None)` when
+/// `map_cache_dir` does not exist or contains no date-named subdirectory.
+///
+/// # Errors
+/// Returns an error if the directory or one of its entries cannot be read.
+async fn find_most_recent_date_dir(map_cache_dir: &std::path::Path) -> anyhow::Result<Option<std::path::PathBuf>> {
+    if !map_cache_dir.exists() {
+        return Ok(None);
+    }
+
+    let mut entries = tokio::fs::read_dir(map_cache_dir).await?;
+    // Newest candidate seen so far, keyed by its date rewritten as "YYYYMMDD"
+    // so that string order equals chronological order.
+    let mut newest: Option<(String, std::path::PathBuf)> = None;
+
+    while let Some(entry) = entries.next_entry().await? {
+        let path = entry.path();
+        if !path.is_dir() {
+            continue;
+        }
+
+        let key = match path.file_name().and_then(|n| n.to_str()) {
+            // ASCII-only check: guarantees the byte slicing below falls on character boundaries.
+            Some(name) if name.len() == 8 && name.bytes().all(|b| b.is_ascii_digit()) => {
+                // DDMMYYYY -> YYYYMMDD
+                format!("{}{}{}", &name[4..8], &name[2..4], &name[0..2])
+            }
+            _ => continue,
+        };
+
+        if newest.as_ref().map_or(true, |(best, _)| key > *best) {
+            newest = Some((key, path));
+        }
+    }
+
+    Ok(newest.map(|(_, path)| path))
+}
+
+/// Carries the event changes produced by an event-processing step.
+/// NOTE(review): presumably the return type of `process_batch` — confirm.
 pub struct ProcessResult {
+    /// The `CompanyEventChange` values collected by that step.
    pub changes: Vec<CompanyEventChange>,
 }

+/// Merges newly fetched events into the events already stored on disk.
+///
+/// Every distinct file referenced by `index` is read once and its events are
+/// keyed by `event_key`; `new_events` are then applied on top of that map in
+/// chunks of 500 via `process_batch`.
+///
+/// Note that, despite the name, all existing events are held in memory at the
+/// same time; only the reading and merging are done piecewise, with a
+/// cooperative yield after each file and each chunk.
+///
+/// # Arguments
+/// * `index` - Event index entries; only their `file_path` is used here.
+/// * `new_events` - Events to merge into the existing ones.
+/// * `today` - Passed through unchanged to `process_batch`.
+///
+/// # Returns
+/// The changes reported by all `process_batch` calls, and the final set of
+/// events (in unspecified order, as it is collected from a `HashMap`).
+///
+/// # Errors
+/// Returns an error if an indexed file cannot be read or its content cannot be
+/// deserialized as a list of `CompanyEvent`.
+pub async fn process_events_streaming(
+    index: &[EventIndex],
+    new_events: &[CompanyEvent],
+    today: &str,
+) -> anyhow::Result<(Vec<CompanyEventChange>, Vec<CompanyEvent>)> {
+    // Log progress each time the number of loaded events passes another multiple of this.
+    const PROGRESS_INTERVAL: usize = 1000;
+    // Number of new events handed to `process_batch` per call.
+    const NEW_EVENT_BATCH_SIZE: usize = 500;
+
+    let mut all_changes = Vec::new();
+    let mut final_events: HashMap<String, CompanyEvent> = HashMap::new();
+
+    // Step 1: Load existing events file by file using the index
+    logger::log_info("Loading existing events in batches...").await;
+
+    // Several index entries can point at the same file; `insert` returns false
+    // for a path that was already seen, so each file is read only once.
+    let mut loaded_files = std::collections::HashSet::new();
+    let mut next_progress_at = PROGRESS_INTERVAL;
+
+    for entry in index {
+        if !loaded_files.insert(&entry.file_path) {
+            continue;
+        }
+
+        let content = tokio::fs::read_to_string(&entry.file_path).await?;
+        let events: Vec<CompanyEvent> = serde_json::from_str(&content)?;
+        final_events.extend(events.into_iter().map(|e| (event_key(&e), e)));
+
+        // Threshold-based: a whole file is added at once, so the count rarely
+        // lands exactly on a multiple of the interval.
+        if final_events.len() >= next_progress_at {
+            logger::log_info(&format!("Loaded {} events so far...", final_events.len())).await;
+            next_progress_at = (final_events.len() / PROGRESS_INTERVAL + 1) * PROGRESS_INTERVAL;
+        }
+
+        // Give other tasks a chance to run between files.
+        tokio::task::yield_now().await;
+    }
+
+    logger::log_info(&format!("Loaded {} existing events", final_events.len())).await;
+
+    // Step 2: Process new events in batches
+    for (idx, batch) in new_events.chunks(NEW_EVENT_BATCH_SIZE).enumerate() {
+        logger::log_info(&format!("Processing batch {} ({} events)", idx + 1, batch.len())).await;
+
+        let batch_result = process_batch(batch, &mut final_events, today);
+        all_changes.extend(batch_result.changes);
+
+        tokio::task::yield_now().await;
+    }
+
+    Ok((all_changes, final_events.into_values().collect()))
+}
+
 pub fn process_batch(
    new_events: &[CompanyEvent],
    existing: &mut HashMap<String, CompanyEvent>,