// src/corporate/openfigi.rs - STREAMING VERSION // Key changes: Never load entire GLEIF CSV or FIGI maps into memory use crate::util::directories::DataPaths; use crate::util::logger; use super::types::*; use reqwest::Client as HttpClient; use reqwest::header::{HeaderMap, HeaderValue}; use serde_json::{json, Value}; use std::collections::{HashMap, HashSet}; use std::path::Path; use std::io::{BufRead, BufReader}; use tokio::time::{sleep, Duration}; use tokio::fs as tokio_fs; use tokio::io::AsyncWriteExt; use anyhow::{Context, anyhow}; const LEI_BATCH_SIZE: usize = 100; // Process 100 LEIs at a time #[derive(Clone)] pub struct OpenFigiClient { client: HttpClient, has_key: bool, } impl OpenFigiClient { pub async fn new() -> anyhow::Result { let api_key = dotenvy::var("OPENFIGI_API_KEY").ok(); let has_key = api_key.is_some(); let mut builder = HttpClient::builder() .user_agent("Mozilla/5.0 (compatible; OpenFIGI-Rust/1.0)") .timeout(Duration::from_secs(30)); if let Some(key) = &api_key { let mut headers = HeaderMap::new(); headers.insert("X-OPENFIGI-APIKEY", HeaderValue::from_str(key)?); builder = builder.default_headers(headers); } let client = builder.build().context("Failed to build HTTP client")?; logger::log_info(&format!("OpenFIGI client: {}", if has_key { "with API key" } else { "no key" })).await; Ok(Self { client, has_key }) } pub async fn map_isins_to_figi_infos(&self, isins: &[String]) -> anyhow::Result> { if isins.is_empty() { return Ok(vec![]); } let mut all_figi_infos = Vec::new(); let chunk_size = if self.has_key { 100 } else { 5 }; let inter_sleep = if self.has_key { Duration::from_millis(240) } else { Duration::from_millis(2400) }; for chunk in isins.chunks(chunk_size) { let jobs: Vec = chunk.iter() .map(|isin| json!({ "idType": "ID_ISIN", "idValue": isin, })) .collect(); let mut retry_count = 0; let max_retries = 5; let mut backoff_ms = 1000u64; loop { let resp_result = self.client .post("https://api.openfigi.com/v3/mapping") .header("Content-Type", "application/json") .json(&jobs) .send() .await; let resp = match resp_result { Ok(r) => r, Err(e) => { retry_count += 1; if retry_count >= max_retries { let err_msg = format!("Failed to send mapping request after {} retries: {}", max_retries, e); logger::log_error(&err_msg).await; return Err(anyhow!(err_msg)); } let warn_msg = format!("Transient error sending mapping request (attempt {}/{}): {}", retry_count, max_retries, e); logger::log_warn(&warn_msg).await; let retry_msg = format!(" Retrying in {}ms...", backoff_ms); logger::log_info(&retry_msg).await; sleep(Duration::from_millis(backoff_ms)).await; backoff_ms = (backoff_ms * 2).min(60000); // Cap at 60s continue; } }; let status = resp.status(); let headers = resp.headers().clone(); let body = resp.text().await?; if status == 429 { let reset_sec = headers .get("ratelimit-reset") .and_then(|v| v.to_str().ok()) .and_then(|s| s.parse::().ok()) .unwrap_or(10); sleep(Duration::from_secs(reset_sec.max(10))).await; continue; } else if !status.is_success() { if status.is_server_error() && retry_count < max_retries { retry_count += 1; sleep(Duration::from_millis(backoff_ms)).await; backoff_ms = (backoff_ms * 2).min(60000); continue; } return Err(anyhow!("OpenFIGI error {}: {}", status, body)); } let results: Vec = serde_json::from_str(&body)?; for (isin, result) in chunk.iter().zip(results) { if let Some(data) = result["data"].as_array() { for item in data { if let Some(figi) = item["figi"].as_str() { all_figi_infos.push(FigiInfo { isin: isin.clone(), figi: figi.to_string(), name: item["name"].as_str().unwrap_or("").to_string(), ticker: item["ticker"].as_str().unwrap_or("").to_string(), exch_code: item["exchCode"].as_str().unwrap_or("").to_string(), composite_figi: item["compositeFIGI"].as_str().unwrap_or("").to_string(), security_type: item["securityType"].as_str().unwrap_or("").to_string(), market_sector: item["marketSector"].as_str().unwrap_or("").to_string(), share_class_figi: item["shareClassFIGI"].as_str().unwrap_or("").to_string(), security_type2: item["securityType2"].as_str().unwrap_or("").to_string(), security_description: item["securityDescription"].as_str().unwrap_or("").to_string(), }); } } } } break; } sleep(inter_sleep).await; } Ok(all_figi_infos) } } async fn process_and_save_figi_batch( client: &OpenFigiClient, lei_batch: &HashMap>, date_dir: &Path, ) -> anyhow::Result<()> { for (lei, isins) in lei_batch { let unique_isins: Vec<_> = isins.iter() .cloned() .collect::>() .into_iter() .collect(); let figi_infos = client.map_isins_to_figi_infos(&unique_isins).await?; if figi_infos.is_empty() { // No FIGIs found - save to no_results.jsonl to avoid re-querying append_no_result_lei(date_dir, lei, &unique_isins).await?; continue; } // Save FIGIs by sector as before save_figi_infos_by_sector(lei, &figi_infos, date_dir).await?; } Ok(()) } async fn save_figi_infos_by_sector( lei: &str, figi_infos: &[FigiInfo], date_dir: &Path, ) -> anyhow::Result<()> { let mut by_sector: HashMap> = HashMap::new(); for figi_info in figi_infos { let sector = if figi_info.market_sector.is_empty() { "uncategorized".to_string() } else { figi_info.market_sector.clone() }; by_sector.entry(sector).or_default().push(figi_info.clone()); } for (sector, figis) in by_sector { let sector_dir = date_dir.join(§or); let path = sector_dir.join("lei_to_figi.jsonl"); append_lei_to_figi_jsonl(&path, lei, &figis).await?; } Ok(()) } async fn append_lei_to_figi_jsonl(path: &Path, lei: &str, figis: &[FigiInfo]) -> anyhow::Result<()> { let entry = json!({ "lei": lei, "figis": figis, }); let line = serde_json::to_string(&entry)? + "\n"; let mut file = tokio_fs::OpenOptions::new() .create(true) .append(true) .open(path) .await?; file.write_all(line.as_bytes()).await?; Ok(()) } /// STREAMING: Build securities without loading everything into memory pub async fn build_securities_from_figi_streaming( date_dir: &Path, ) -> anyhow::Result<()> { logger::log_info("Building securities (streaming mode)...").await; // Load existing incrementally let mut commons = load_from_cache_if_exists::>( "data/corporate/by_name/common_stocks.json" ).await?; let equity_file = date_dir.join("Equity").join("lei_to_figi.jsonl"); if !equity_file.exists() { logger::log_warn("No Equity FIGI file found").await; return Ok(()); } let content = tokio_fs::read_to_string(&equity_file).await?; let mut processed = 0; let mut stats = ProcessingStats::new(commons.len(), 0, 0); for line in content.lines() { if line.trim().is_empty() { continue; } let entry: Value = serde_json::from_str(line)?; let figi_infos: Vec = serde_json::from_value(entry["figis"].clone())?; // Process only common stocks let common_stocks: Vec<_> = figi_infos.iter() .filter(|f| f.security_type == "Common Stock") .cloned() .collect(); if !common_stocks.is_empty() { process_common_stocks(&mut commons, &common_stocks, &mut stats); } processed += 1; if processed % 100 == 0 { tokio::task::yield_now().await; } } logger::log_info(&format!("Processed {} FIGI entries", processed)).await; save_to_cache("data/corporate/by_name/common_stocks.json", &commons).await?; Ok(()) } /// Handles rate limit responses from the OpenFIGI API. /// /// If a 429 status is received, this function sleeps for the duration specified /// in the `ratelimit-reset` header (or 10 seconds by default). /// /// # Arguments /// * `resp` - The HTTP response to check. /// /// # Returns /// Ok(()) if no rate limit, or after waiting for the reset period. /// /// # Errors /// Returns an error if the response status indicates a non-rate-limit error. async fn handle_rate_limit(resp: &reqwest::Response) -> anyhow::Result<()> { let status = resp.status(); if status == 429 { let headers = resp.headers(); let reset_sec = headers .get("ratelimit-reset") .and_then(|v| v.to_str().ok()) .and_then(|s| s.parse::().ok()) .unwrap_or(10); logger::log_info(&format!(" Rate limited—waiting {}s", reset_sec)).await; sleep(std::time::Duration::from_secs(reset_sec.max(10))).await; return Err(anyhow!("Rate limited, please retry")); } else if status.is_client_error() || status.is_server_error() { return Err(anyhow!("OpenFIGI API error: {}", status)); } Ok(()) } fn process_common_stocks( companies: &mut HashMap, figi_infos: &[FigiInfo], stats: &mut ProcessingStats, ) { let name = figi_infos[0].name.clone(); if name.is_empty() { return; } let grouped_by_isin = group_by_isin(figi_infos); if let Some(existing) = companies.get_mut(&name) { let mut updated = false; for (isin, new_figis) in grouped_by_isin { if let Some(existing_figis) = existing.securities.get_mut(&isin) { let merged = merge_figi_list(existing_figis, &new_figis); if merged.len() > existing_figis.len() { *existing_figis = merged; updated = true; } } else { existing.securities.insert(isin.clone(), new_figis); updated = true; } } if existing.primary_isin.is_empty() { if let Some(first_isin) = existing.securities.keys().next() { existing.primary_isin = first_isin.clone(); } } if updated { stats.companies_updated += 1; } } else { let primary_isin = grouped_by_isin.keys().next().cloned().unwrap_or_default(); companies.insert(name.clone(), CompanyInfo { name, primary_isin, securities: grouped_by_isin, }); stats.companies_added += 1; } } fn group_by_isin(figi_infos: &[FigiInfo]) -> HashMap> { let mut grouped: HashMap> = HashMap::new(); for figi_info in figi_infos { grouped.entry(figi_info.isin.clone()) .or_insert_with(Vec::new) .push(figi_info.clone()); } for figis in grouped.values_mut() { figis.sort_by(|a, b| a.figi.cmp(&b.figi)); } grouped } fn merge_figi_list(existing: &[FigiInfo], new_figis: &[FigiInfo]) -> Vec { let mut merged = existing.to_vec(); let existing_figis: HashSet = existing.iter() .map(|f| f.figi.clone()) .collect(); for new_figi in new_figis { if !existing_figis.contains(&new_figi.figi) { merged.push(new_figi.clone()); } } merged.sort_by(|a, b| a.figi.cmp(&b.figi)); merged } #[derive(Debug)] struct ProcessingStats { initial_companies: usize, companies_added: usize, companies_updated: usize, } impl ProcessingStats { fn new(companies: usize, _warrants: usize, _options: usize) -> Self { Self { initial_companies: companies, companies_added: 0, companies_updated: 0, } } } async fn load_from_cache_if_exists(path: &str) -> anyhow::Result where T: serde::de::DeserializeOwned + Default, { let cache_file = Path::new(path); if !cache_file.exists() { return Ok(T::default()); } let content = tokio_fs::read_to_string(cache_file).await?; Ok(serde_json::from_str(&content)?) } async fn save_to_cache(path: &str, data: &T) -> anyhow::Result<()> where T: serde::Serialize, { let cache_path = Path::new(path); let cache_dir = cache_path.parent().context("Invalid path")?; tokio_fs::create_dir_all(cache_dir).await?; let json_str = serde_json::to_string_pretty(data)?; tokio_fs::write(cache_path, json_str).await?; Ok(()) } async fn load_market_sectors() -> anyhow::Result> { let dir = DataPaths::new(".")?; let cache_file = dir.cache_openfigi_dir().join("marketSecDes.json"); if !cache_file.exists() { return Ok(vec![ "Comdty".to_string(), "Corp".to_string(), "Equity".to_string(), "Govt".to_string(), ]); } let content = tokio_fs::read_to_string(&cache_file).await?; let json: Value = serde_json::from_str(&content)?; let sectors: Vec = json["values"] .as_array() .ok_or_else(|| anyhow!("No values"))? .iter() .filter_map(|v| v.as_str().map(|s| s.to_string())) .collect(); Ok(sectors) } async fn determine_gleif_date( gleif_date: Option<&str>, paths: &DataPaths, ) -> anyhow::Result { if let Some(d) = gleif_date { return Ok(d.to_string()); } let gleif_dir = paths.cache_gleif_dir(); let mut entries = tokio_fs::read_dir(gleif_dir).await?; let mut dates = Vec::new(); while let Some(entry) = entries.next_entry().await? { let path = entry.path(); if path.is_dir() { if let Some(name) = path.file_name().and_then(|n| n.to_str()) { if name.len() == 8 && name.chars().all(|c| c.is_numeric()) { dates.push(name.to_string()); } } } } dates.sort(); dates.last().cloned().ok_or_else(|| anyhow!("No GLEIF date found")) } async fn setup_sector_directories( date_dir: &Path, sector_dirs: &[String], ) -> anyhow::Result<()> { let uncategorized_dir = date_dir.join("uncategorized"); tokio_fs::create_dir_all(&uncategorized_dir).await?; for sector in sector_dirs { let sector_dir = date_dir.join(sector); tokio_fs::create_dir_all(§or_dir).await?; } Ok(()) } /// Loads all OpenFIGI mapping value lists (marketSecDes, micCode, securityType). /// /// This function fetches the available values for each mapping parameter from the OpenFIGI API /// and caches them as JSON files in `data/openfigi/`. If the files already exist and are recent /// (less than 30 days old), they are reused instead of re-fetching. /// /// # Returns /// Ok(()) on success. /// /// # Errors /// Returns an error if API requests fail, JSON parsing fails, or file I/O fails. pub async fn load_figi_type_lists() -> anyhow::Result<()> { logger::log_info("Loading OpenFIGI mapping value lists...").await; let client = OpenFigiClient::new().await?; // Create cache directory let dir = DataPaths::new(".")?; let cache_dir = dir.cache_openfigi_dir(); tokio_fs::create_dir_all(cache_dir).await .context("Failed to create data/openfigi directory")?; // Fetch each type list get_figi_market_sec_des(&client, cache_dir).await?; get_figi_mic_code(&client, cache_dir).await?; get_figi_security_type(&client, cache_dir).await?; logger::log_info("OpenFIGI mapping value lists loaded successfully").await; Ok(()) } /// Fetches and caches the list of valid marketSecDes values. /// /// # Arguments /// * `client` - The OpenFIGI client instance. /// * `cache_dir` - Directory to save the cached JSON file. /// /// # Returns /// Ok(()) on success. /// /// # Errors /// Returns an error if the API request fails or file I/O fails. async fn get_figi_market_sec_des(client: &OpenFigiClient, cache_dir: &Path) -> anyhow::Result<()> { let cache_file = cache_dir.join("marketSecDes.json"); // Check if cache exists and is recent (< 30 days old) if should_use_cache(&cache_file).await? { logger::log_info(" Using cached marketSecDes values").await; return Ok(()); } logger::log_info(" Fetching marketSecDes values from OpenFIGI API...").await; let resp = client.client .get("https://api.openfigi.com/v3/mapping/values/marketSecDes") .send() .await .context("Failed to fetch marketSecDes values")?; handle_rate_limit(&resp).await?; let values: Value = resp.json().await .context("Failed to parse marketSecDes response")?; // Save to cache let json_str = serde_json::to_string_pretty(&values)?; tokio_fs::write(&cache_file, json_str).await .context("Failed to write marketSecDes cache")?; logger::log_info(" ✓ Cached marketSecDes values").await; // Respect rate limits sleep(Duration::from_millis(if client.has_key { 240 } else { 2400 })).await; Ok(()) } /// Fetches and caches the list of valid micCode values. /// /// # Arguments /// * `client` - The OpenFIGI client instance. /// * `cache_dir` - Directory to save the cached JSON file. /// /// # Returns /// Ok(()) on success. /// /// # Errors /// Returns an error if the API request fails or file I/O fails. async fn get_figi_mic_code(client: &OpenFigiClient, cache_dir: &Path) -> anyhow::Result<()> { let cache_file = cache_dir.join("micCode.json"); if should_use_cache(&cache_file).await? { logger::log_info(" Using cached micCode values").await; return Ok(()); } logger::log_info(" Fetching micCode values from OpenFIGI API...").await; let resp = client.client .get("https://api.openfigi.com/v3/mapping/values/micCode") .send() .await .context("Failed to fetch micCode values")?; handle_rate_limit(&resp).await?; let values: Value = resp.json().await .context("Failed to parse micCode response")?; let json_str = serde_json::to_string_pretty(&values)?; tokio_fs::write(&cache_file, json_str).await .context("Failed to write micCode cache")?; logger::log_info(" ✓ Cached micCode values").await; sleep(Duration::from_millis(if client.has_key { 240 } else { 2400 })).await; Ok(()) } /// Checks if a cache file exists and is less than 30 days old. /// /// # Arguments /// * `path` - Path to the cache file. /// /// # Returns /// True if the cache should be used, false if it needs refreshing. async fn should_use_cache(path: &Path) -> anyhow::Result { if !path.exists() { return Ok(false); } let metadata = tokio_fs::metadata(path).await?; let modified = metadata.modified()?; let age = modified.elapsed().unwrap_or(std::time::Duration::from_secs(u64::MAX)); // Cache is valid for 30 days Ok(age < std::time::Duration::from_secs(30 * 24 * 60 * 60)) } /// Fetches and caches the list of valid securityType values. /// /// # Arguments /// * `client` - The OpenFIGI client instance. /// * `cache_dir` - Directory to save the cached JSON file. /// /// # Returns /// Ok(()) on success. /// /// # Errors /// Returns an error if the API request fails or file I/O fails. async fn get_figi_security_type(client: &OpenFigiClient, cache_dir: &Path) -> anyhow::Result<()> { let cache_file = cache_dir.join("securityType.json"); if should_use_cache(&cache_file).await? { logger::log_info(" Using cached securityType values").await; return Ok(()); } logger::log_info(" Fetching securityType values from OpenFIGI API...").await; let resp = client.client .get("https://api.openfigi.com/v3/mapping/values/securityType") .send() .await .context("Failed to fetch securityType values")?; handle_rate_limit(&resp).await?; let values: Value = resp.json().await .context("Failed to parse securityType response")?; let json_str = serde_json::to_string_pretty(&values)?; tokio_fs::write(&cache_file, json_str).await .context("Failed to write securityType cache")?; logger::log_info(" ✓ Cached securityType values").await; sleep(Duration::from_millis(if client.has_key { 240 } else { 2400 })).await; Ok(()) } #[derive(Debug)] pub struct MappingStats { pub total_leis: usize, pub mapped_leis: usize, pub no_result_leis: usize, pub unqueried_leis: usize, pub mapping_percentage: f64, pub queried_percentage: f64, pub by_sector: HashMap, } /// Get detailed statistics about LEI-FIGI mapping status pub async fn get_mapping_stats( csv_path: &str, gleif_date: Option<&str>, ) -> anyhow::Result { let dir = DataPaths::new(".")?; let map_cache_dir = dir.cache_gleif_openfigi_map_dir(); let date = determine_gleif_date(gleif_date, &dir).await?; let date_dir = map_cache_dir.join(&date); let all_leis = get_all_leis_from_gleif(csv_path).await?; let mapped_leis = load_existing_mapped_leis(&date_dir).await?; let no_result_leis = load_no_result_leis(&date_dir).await?; let total = all_leis.len(); let mapped = mapped_leis.len(); let no_results = no_result_leis.len(); let queried = mapped + no_results; let unqueried = total.saturating_sub(queried); let mapping_percentage = if total > 0 { (mapped as f64 / total as f64) * 100.0 } else { 0.0 }; let queried_percentage = if total > 0 { (queried as f64 / total as f64) * 100.0 } else { 0.0 }; // Count by sector let mut by_sector = HashMap::new(); if date_dir.exists() { let mut entries = tokio_fs::read_dir(&date_dir).await?; while let Some(entry) = entries.next_entry().await? { let sector_path = entry.path(); if !sector_path.is_dir() { continue; } let sector_name = sector_path .file_name() .and_then(|n| n.to_str()) .unwrap_or("unknown") .to_string(); let jsonl_path = sector_path.join("lei_to_figi.jsonl"); if !jsonl_path.exists() { continue; } let content = tokio_fs::read_to_string(&jsonl_path).await?; let count = content.lines().filter(|l| !l.trim().is_empty()).count(); by_sector.insert(sector_name, count); } } Ok(MappingStats { total_leis: total, mapped_leis: mapped, no_result_leis: no_results, unqueried_leis: unqueried, mapping_percentage, queried_percentage, by_sector, }) } /// Print mapping statistics to console and logs pub async fn print_mapping_stats(csv_path: &str) -> anyhow::Result<()> { logger::log_info("=== LEI-FIGI Mapping Status ===").await; let stats = get_mapping_stats(csv_path, None).await?; logger::log_info(&format!( "Total LEIs: {}", stats.total_leis )).await; logger::log_info(&format!( "├─ Mapped (with FIGI): {} ({:.2}%)", stats.mapped_leis, stats.mapping_percentage )).await; logger::log_info(&format!( "├─ No Results (queried, no FIGI): {} ({:.2}%)", stats.no_result_leis, (stats.no_result_leis as f64 / stats.total_leis as f64) * 100.0 )).await; logger::log_info(&format!( "└─ Not Queried Yet: {} ({:.2}%)", stats.unqueried_leis, (stats.unqueried_leis as f64 / stats.total_leis as f64) * 100.0 )).await; logger::log_info(&format!( "\nQuery Coverage: {:.2}% ({} / {})", stats.queried_percentage, stats.mapped_leis + stats.no_result_leis, stats.total_leis )).await; if !stats.by_sector.is_empty() { logger::log_info("\nMapped LEIs by sector:").await; let mut sectors: Vec<_> = stats.by_sector.iter().collect(); sectors.sort_by(|a, b| b.1.cmp(a.1)); // Sort by count descending for (sector, count) in sectors { logger::log_info(&format!(" {}: {}", sector, count)).await; } } logger::log_info("==============================").await; Ok(()) } /// Quick check if mapping is complete (returns true if all mapped) pub async fn is_mapping_complete(csv_path: &str) -> anyhow::Result { let dir = DataPaths::new(".")?; let map_cache_dir = dir.cache_gleif_openfigi_map_dir(); let date = determine_gleif_date(None, &dir).await?; let date_dir = map_cache_dir.join(&date); let unmapped = get_unmapped_leis(csv_path, &date_dir).await?; Ok(unmapped.is_empty()) } /// Load all LEIs that have already been mapped from existing JSONL files async fn load_existing_mapped_leis(date_dir: &Path) -> anyhow::Result> { let mut mapped_leis = HashSet::new(); if !date_dir.exists() { return Ok(mapped_leis); } // Read all sector directories let mut entries = tokio_fs::read_dir(date_dir).await?; while let Some(entry) = entries.next_entry().await? { let sector_path = entry.path(); if !sector_path.is_dir() { continue; } let jsonl_path = sector_path.join("lei_to_figi.jsonl"); if !jsonl_path.exists() { continue; } // Read JSONL file line by line let content = tokio_fs::read_to_string(&jsonl_path).await?; for line in content.lines() { if line.trim().is_empty() { continue; } if let Ok(entry) = serde_json::from_str::(line) { if let Some(lei) = entry["lei"].as_str() { mapped_leis.insert(lei.to_string()); } } } } if !mapped_leis.is_empty() { logger::log_info(&format!("Found {} already mapped LEIs", mapped_leis.len())).await; } Ok(mapped_leis) } /// Read GLEIF CSV and return all LEIs (without loading entire file into memory) async fn get_all_leis_from_gleif(csv_path: &str) -> anyhow::Result> { let file = std::fs::File::open(csv_path)?; let reader = BufReader::new(file); let mut all_leis = HashSet::new(); for (idx, line) in reader.lines().enumerate() { if idx == 0 { continue; // Skip header } let line = line?; let parts: Vec<&str> = line.split(',').collect(); if parts.len() < 2 { continue; } let lei = parts[0].trim().trim_matches('"').to_string(); if !lei.is_empty() { all_leis.insert(lei); } } logger::log_info(&format!("Found {} total LEIs in GLEIF CSV", all_leis.len())).await; Ok(all_leis) } /// Get unmapped LEIs by comparing GLEIF CSV with existing mappings async fn get_unmapped_leis( csv_path: &str, date_dir: &Path, ) -> anyhow::Result> { let all_leis = get_all_leis_from_gleif(csv_path).await?; let mapped_leis = load_existing_mapped_leis(date_dir).await?; let no_result_leis = load_no_result_leis(date_dir).await?; // Calculate truly unmapped: all - (mapped + no_results) let queried_leis: HashSet = mapped_leis .union(&no_result_leis) .cloned() .collect(); let unmapped: HashSet = all_leis .difference(&queried_leis) .cloned() .collect(); let total = all_leis.len(); let mapped = mapped_leis.len(); let no_results = no_result_leis.len(); let unqueried = unmapped.len(); logger::log_info(&format!( "LEI Status: Total={}, Mapped={}, No Results={}, Unqueried={}", total, mapped, no_results, unqueried )).await; Ok(unmapped) } /// Modified version that only processes specified LEIs pub async fn stream_gleif_csv_and_build_figi_filtered( csv_path: &str, gleif_date: Option<&str>, filter_leis: Option<&HashSet>, ) -> anyhow::Result<()> { logger::log_info(&format!("Streaming GLEIF CSV: {}", csv_path)).await; let file = std::fs::File::open(csv_path)?; let reader = BufReader::new(file); let client = OpenFigiClient::new().await?; if !client.has_key { logger::log_warn("No API key - skipping FIGI mapping").await; return Ok(()); } let dir = DataPaths::new(".")?; let map_cache_dir = dir.cache_gleif_openfigi_map_dir(); let date = determine_gleif_date(gleif_date, &dir).await?; let date_dir = map_cache_dir.join(&date); tokio_fs::create_dir_all(&date_dir).await?; let sector_dirs = load_market_sectors().await?; setup_sector_directories(&date_dir, §or_dirs).await?; let mut lei_batch: HashMap> = HashMap::new(); let mut line_count = 0; let mut processed_leis = 0; let mut skipped_leis = 0; for (idx, line) in reader.lines().enumerate() { let line = line?; if idx == 0 { continue; } let parts: Vec<&str> = line.split(',').collect(); if parts.len() < 2 { continue; } let lei = parts[0].trim().trim_matches('"').to_string(); let isin = parts[1].trim().trim_matches('"').to_string(); if lei.is_empty() || isin.is_empty() { continue; } // Apply filter if provided if let Some(filter) = filter_leis { if !filter.contains(&lei) { skipped_leis += 1; continue; } } lei_batch.entry(lei).or_default().push(isin); line_count += 1; // Process batch when full if lei_batch.len() >= LEI_BATCH_SIZE { process_and_save_figi_batch(&client, &lei_batch, &date_dir).await?; processed_leis += lei_batch.len(); if processed_leis % 1000 == 0 { logger::log_info(&format!("Queried {} LEIs...", processed_leis)).await; } lei_batch.clear(); tokio::task::yield_now().await; } } // Process remaining if !lei_batch.is_empty() { process_and_save_figi_batch(&client, &lei_batch, &date_dir).await?; processed_leis += lei_batch.len(); } logger::log_info(&format!( "✓ Queried {} LEIs, skipped {} already processed", processed_leis, skipped_leis )).await; Ok(()) } /// Check mapping completion and process only unmapped LEIs pub async fn ensure_all_leis_mapped( csv_path: &str, gleif_date: Option<&str>, ) -> anyhow::Result { let dir = DataPaths::new(".")?; let map_cache_dir = dir.cache_gleif_openfigi_map_dir(); let date = determine_gleif_date(gleif_date, &dir).await?; let date_dir = map_cache_dir.join(&date); // Get unmapped LEIs (excludes both mapped and no-result LEIs) let unmapped = get_unmapped_leis(csv_path, &date_dir).await?; if unmapped.is_empty() { logger::log_info("✓ All LEIs have been queried (mapped or confirmed no results)").await; return Ok(true); } logger::log_info(&format!("Found {} LEIs that need querying - starting mapping...", unmapped.len())).await; // Process only unmapped LEIs stream_gleif_csv_and_build_figi_filtered(csv_path, gleif_date, Some(&unmapped)).await?; // Verify completion let still_unmapped = get_unmapped_leis(csv_path, &date_dir).await?; if still_unmapped.is_empty() { logger::log_info("✓ All LEIs successfully queried").await; Ok(true) } else { logger::log_warn(&format!( "⚠ {} LEIs still unqueried (API errors or rate limits)", still_unmapped.len() )).await; Ok(false) } } /// Load LEIs that were queried but returned no results async fn load_no_result_leis(date_dir: &Path) -> anyhow::Result> { let mut no_result_leis = HashSet::new(); let no_results_path = date_dir.join("no_results.jsonl"); if !no_results_path.exists() { return Ok(no_result_leis); } let content = tokio_fs::read_to_string(&no_results_path).await?; for line in content.lines() { if line.trim().is_empty() { continue; } if let Ok(entry) = serde_json::from_str::(line) { if let Some(lei) = entry["lei"].as_str() { no_result_leis.insert(lei.to_string()); } } } if !no_result_leis.is_empty() { logger::log_info(&format!( "Found {} LEIs previously queried with no FIGI results", no_result_leis.len() )).await; } Ok(no_result_leis) } /// Save LEI that was queried but returned no results async fn append_no_result_lei(date_dir: &Path, lei: &str, isins: &[String]) -> anyhow::Result<()> { let no_results_path = date_dir.join("no_results.jsonl"); let entry = json!({ "lei": lei, "isins": isins, "queried_at": chrono::Local::now().format("%Y-%m-%d %H:%M:%S").to_string(), }); let line = serde_json::to_string(&entry)? + "\n"; let mut file = tokio_fs::OpenOptions::new() .create(true) .append(true) .open(&no_results_path) .await?; file.write_all(line.as_bytes()).await?; Ok(()) }