Remove non-mappable LEIs

2025-12-07 17:38:32 +01:00
parent 0f89c8c0ce
commit c00bfd8687
3 changed files with 308 additions and 48 deletions


@@ -1,4 +1,5 @@
use crate::util::directories::DataPaths;
use crate::util::logger;
// src/corporate/openfigi.rs
use super::{types::*};
@@ -27,7 +28,7 @@ impl OpenFigiClient {
/// # Errors
///
/// Returns an error if the HTTP client cannot be built or if the API key header is invalid.
-pub fn new() -> anyhow::Result<Self> {
+pub async fn new() -> anyhow::Result<Self> {
let api_key = dotenvy::var("OPENFIGI_API_KEY").ok();
let has_key = api_key.is_some();
@@ -43,10 +44,11 @@ impl OpenFigiClient {
let client = builder.build().context("Failed to build HTTP client")?;
-println!(
+let msg = format!(
"OpenFIGI client initialized: {}",
if has_key { "with API key" } else { "no key (limited mode)" }
);
+logger::log_info(&msg).await;
Ok(Self { client, has_key })
}
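Since new is now async (it awaits logger::log_info), every call site has to be awaited; the two converted call sites further down in this diff show the change. A minimal call-site sketch, assuming a Tokio runtime and this crate's anyhow setup:

// Minimal call-site sketch; runtime and error handling are assumptions.
#[tokio::main]
async fn main() -> anyhow::Result<()> {
    let client = OpenFigiClient::new().await?;
    if !client.has_key {
        // Limited mode: OpenFIGI applies stricter rate limits without an API key.
    }
    Ok(())
}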
@@ -126,10 +128,16 @@ impl OpenFigiClient {
Err(e) => {
retry_count += 1;
if retry_count >= max_retries {
return Err(anyhow!("Failed to send mapping request after {} retries: {}", max_retries, e));
let err_msg = format!("Failed to send mapping request after {} retries: {}", max_retries, e);
logger::log_error(&err_msg).await;
return Err(anyhow!(err_msg));
}
eprintln!("Transient error sending mapping request (attempt {}/{}): {}", retry_count, max_retries, e);
println!(" Retrying in {}ms...", backoff_ms);
let warn_msg = format!("Transient error sending mapping request (attempt {}/{}): {}", retry_count, max_retries, e);
eprintln!("{}", warn_msg);
logger::log_warn(&warn_msg).await;
let retry_msg = format!(" Retrying in {}ms...", backoff_ms);
println!("{}", retry_msg);
logger::log_info(&retry_msg).await;
sleep(Duration::from_millis(backoff_ms)).await;
backoff_ms = (backoff_ms * 2).min(60000); // Cap at 60s
continue;
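The arm above retries with capped exponential backoff: backoff_ms doubles on each attempt and is capped at 60s. A self-contained sketch of that pattern with illustrative names (the initial value of backoff_ms is not visible in this hunk and is assumed here):

use std::time::Duration;
use tokio::time::sleep;

// Capped exponential backoff as used above: 1s, 2s, 4s, ..., capped at 60s.
async fn backoff_sketch(max_retries: u32) {
    let mut backoff_ms: u64 = 1_000; // assumed starting value
    for attempt in 1..=max_retries {
        eprintln!("attempt {}/{} failed, retrying in {}ms", attempt, max_retries, backoff_ms);
        sleep(Duration::from_millis(backoff_ms)).await;
        backoff_ms = (backoff_ms * 2).min(60_000);
    }
}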
@@ -147,7 +155,9 @@ impl OpenFigiClient {
.and_then(|v| v.to_str().ok())
.and_then(|s| s.parse::<u64>().ok())
.unwrap_or(10);
println!("Rate limited—backing off {}s", reset_sec);
let rate_msg = format!("Rate limited—backing off {}s", reset_sec);
println!("{}", rate_msg);
logger::log_warn(&rate_msg).await;
sleep(Duration::from_secs(reset_sec.max(10))).await;
continue; // Retry the same chunk
} else if status == 401 {
@@ -158,9 +168,13 @@ impl OpenFigiClient {
// Transient server error, retry with backoff
retry_count += 1;
if retry_count >= max_retries {
return Err(anyhow!("OpenFIGI server error {} after {} retries: {}", status, max_retries, body));
let err_msg = format!("OpenFIGI server error {} after {} retries: {}", status, max_retries, body);
logger::log_error(&err_msg).await;
return Err(anyhow!(err_msg));
}
eprintln!("Server error {} (attempt {}/{}), retrying in {}ms...", status, retry_count, max_retries, backoff_ms);
let warn_msg = format!("Server error {} (attempt {}/{}), retrying in {}ms...", status, retry_count, max_retries, backoff_ms);
eprintln!("{}", warn_msg);
logger::log_warn(&warn_msg).await;
sleep(Duration::from_millis(backoff_ms)).await;
backoff_ms = (backoff_ms * 2).min(60000);
continue;
@@ -260,7 +274,9 @@ async fn load_market_sectors() -> anyhow::Result<Vec<String>> {
if !cache_file.exists() {
// Return default if file doesn't exist
eprintln!("Warning: {} not found, using default sectors", cache_file.display());
let warn_msg = format!("Warning: {} not found, using default sectors", cache_file.display());
eprintln!("{}", warn_msg);
logger::log_warn(&warn_msg).await;
return Ok(vec![
"Comdty".to_string(),
"Corp".to_string(),
@@ -292,7 +308,8 @@ async fn load_market_sectors() -> anyhow::Result<Vec<String>> {
return Err(anyhow!("No sectors found in marketSecDes.json"));
}
println!("Loaded {} market sectors from cache", sectors.len());
let msg = format!("Loaded {} market sectors from cache", sectors.len());
logger::log_info(&msg).await;
Ok(sectors)
}
@@ -328,7 +345,9 @@ async fn find_most_recent_gleif_date(gleif_cache_dir: &Path) -> anyhow::Result<O
let most_recent = &csv_files[0];
let date = extract_gleif_date_from_filename(most_recent);
println!(" Found GLEIF data dated: {}", date);
let msg = format!(" Found GLEIF data dated: {}", date);
logger::log_info(&msg).await;
Ok(Some(date))
}
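extract_gleif_date_from_filename is not shown in this diff; a plausible sketch, assuming filenames of the form isin-lei-<date>.csv (the same prefix the removal functions below match on):

use std::path::Path;

// Hypothetical reconstruction; the real helper is not part of this commit.
fn extract_gleif_date_sketch(path: &Path) -> String {
    path.file_stem()
        .and_then(|s| s.to_str())
        .and_then(|s| s.strip_prefix("isin-lei-"))
        .unwrap_or_default()
        .to_string()
}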
@@ -359,7 +378,9 @@ pub async fn build_lei_to_figi_infos(lei_to_isins: &HashMap<String, Vec<String>>
match build_lei_to_figi_infos_internal(lei_to_isins, gleif_date).await {
Ok(map) => {
if !map.is_empty() {
println!("✓ LEI→FIGI mapping completed successfully with {} entries", map.len());
let msg = format!("✓ LEI→FIGI mapping completed successfully with {} entries", map.len());
logger::log_info(&msg).await;
}
return Ok(map);
}
@@ -372,19 +393,27 @@ pub async fn build_lei_to_figi_infos(lei_to_isins: &HashMap<String, Vec<String>>
|| error_msg.contains("Failed to create");
if is_fatal {
eprintln!("Fatal error in LEI→FIGI mapping: {}", e);
let err = format!("Fatal error in LEI→FIGI mapping: {}", e);
eprintln!("{}", err);
logger::log_error(&err).await;
return Err(e);
}
retry_count += 1;
if retry_count >= max_retries {
eprintln!("LEI→FIGI mapping failed after {} retries: {}", max_retries, e);
let err = format!("LEI→FIGI mapping failed after {} retries: {}", max_retries, e);
eprintln!("{}", err);
logger::log_error(&err).await;
return Err(e);
}
let wait_secs = 60 * retry_count;
eprintln!("Transient error in LEI→FIGI mapping (attempt {}/{}): {}", retry_count, max_retries, e);
println!("Retrying mapping in {}s...", wait_secs);
let warn_msg = format!("Transient error in LEI→FIGI mapping (attempt {}/{}): {}", retry_count, max_retries, e);
eprintln!("{}", warn_msg);
logger::log_warn(&warn_msg).await;
let retry_msg = format!("Retrying mapping in {}s...", wait_secs);
println!("{}", retry_msg);
logger::log_info(&retry_msg).await;
sleep(Duration::from_secs(wait_secs as u64)).await;
}
}
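Condensed, the wrapper above does: classify the error by message substring, bail immediately on fatal errors, otherwise sleep 60s * retry_count (linear backoff) and retry up to max_retries. A generic sketch of that shape, with the classification reduced to a caller-supplied predicate and all names illustrative:

use std::time::Duration;
use tokio::time::sleep;

// Linear-backoff retry wrapper: fatal errors return immediately, transient ones retry.
async fn retry_linear<F, Fut, T>(
    mut do_work: F,
    is_fatal: impl Fn(&anyhow::Error) -> bool,
    max_retries: u32,
) -> anyhow::Result<T>
where
    F: FnMut() -> Fut,
    Fut: std::future::Future<Output = anyhow::Result<T>>,
{
    let mut retry_count = 0;
    loop {
        match do_work().await {
            Ok(v) => return Ok(v),
            Err(e) if is_fatal(&e) => return Err(e),
            Err(e) => {
                retry_count += 1;
                if retry_count >= max_retries {
                    return Err(e);
                }
                sleep(Duration::from_secs(60 * retry_count as u64)).await; // 60s, 120s, ...
            }
        }
    }
}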
@@ -396,6 +425,11 @@ pub async fn build_lei_to_figi_infos(lei_to_isins: &HashMap<String, Vec<String>>
/// This is the actual worker function that performs the mapping. It handles already-processed
/// LEIs gracefully but will fail on transient errors, which are caught and retried by the
/// wrapper function build_lei_to_figi_infos.
///
/// Tracks three outcomes:
/// 1. Hit with marketSector: saved to sector-specific folder
/// 2. Hit without marketSector: saved to "uncategorized" folder
/// 3. no_hit (empty results): LEI removed immediately from the GLEIF CSV
async fn build_lei_to_figi_infos_internal(lei_to_isins: &HashMap<String, Vec<String>>, gleif_date: Option<&str>) -> anyhow::Result<HashMap<String, Vec<FigiInfo>>> {
let dir = DataPaths::new(".")?;
let gleif_cache_dir = dir.cache_gleif_dir();
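The three outcomes from the doc comment, restated as an illustrative enum. This type does not exist in the source (the real code routes inline); note that no_hit is decided per LEI, while sector routing happens per returned FIGI:

// Illustrative only; not a type from the source.
enum FigiOutcome {
    Hit(String),   // marketSector present: route to that sector's folder
    Uncategorized, // hit, but marketSector empty: "uncategorized" folder
    NoHit,         // empty result set: remove the LEI from the GLEIF CSV
}

fn classify(figi_sectors: &[&str]) -> Vec<FigiOutcome> {
    if figi_sectors.is_empty() {
        return vec![FigiOutcome::NoHit];
    }
    figi_sectors
        .iter()
        .map(|s| {
            if s.is_empty() {
                FigiOutcome::Uncategorized
            } else {
                FigiOutcome::Hit(s.to_string())
            }
        })
        .collect()
}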
@@ -403,23 +437,42 @@ async fn build_lei_to_figi_infos_internal(lei_to_isins: &HashMap<String, Vec<Str
// Determine the GLEIF date to use
let date = if let Some(d) = gleif_date {
let msg = format!("Using provided GLEIF date: {}", d);
logger::log_info(&msg).await;
d.to_string()
} else {
// Find the most recent GLEIF file
logger::log_info("Searching for most recent GLEIF file...").await;
match find_most_recent_gleif_date(gleif_cache_dir).await? {
Some(d) => d,
-None => return Err(anyhow!("No GLEIF CSV file found in cache/gleif directory")),
+None => {
+let err = "No GLEIF CSV file found in cache/gleif directory";
+logger::log_error(err).await;
+return Err(anyhow!(err));
+},
}
};
// Create date-based subdirectory in the mapping cache
let msg = format!("Creating date directory for: {}", date);
logger::log_info(&msg).await;
let date_dir = map_cache_dir.join(&date);
tokio_fs::create_dir_all(&date_dir).await.context("Failed to create date directory")?;
// Load market sectors dynamically from cache
logger::log_info("Loading market sectors...").await;
let sector_dirs = load_market_sectors().await?;
let mut sector_maps: HashMap<String, HashMap<String, Vec<FigiInfo>>> = HashMap::new();
// Create uncategorized folder
let msg = format!("Creating {} sector directories...", sector_dirs.len());
logger::log_info(&msg).await;
let uncategorized_dir = date_dir.join("uncategorized");
tokio_fs::create_dir_all(&uncategorized_dir).await.context("Failed to create uncategorized directory")?;
let uncategorized_path = uncategorized_dir.join("lei_to_figi.jsonl");
let uncategorized_map = load_lei_to_figi_jsonl(&uncategorized_path).await?;
sector_maps.insert("uncategorized".to_string(), uncategorized_map);
for sector in &sector_dirs {
let sector_dir = date_dir.join(sector);
tokio_fs::create_dir_all(&sector_dir).await.context("Failed to create sector directory")?;
@@ -430,22 +483,30 @@ async fn build_lei_to_figi_infos_internal(lei_to_isins: &HashMap<String, Vec<Str
sector_maps.insert(sector.clone(), lei_map);
}
-let client = OpenFigiClient::new()?;
+let client = OpenFigiClient::new().await?;
if !client.has_key {
let total_entries: usize = sector_maps.values().map(|m| m.len()).sum();
println!("No API key—using partial LEI→FIGI maps with {} total entries", total_entries);
let msg = format!("No API key—using partial LEI→FIGI maps with {} total entries", total_entries);
logger::log_warn(&msg).await;
return Ok(sector_maps.get("Equity").cloned().unwrap_or_default());
}
// Sort LEIs for deterministic processing order
logger::log_info("Starting LEI→FIGI mapping process...").await;
let mut leis: Vec<_> = lei_to_isins.keys().cloned().collect();
leis.sort();
let mut processed = sector_maps.values().map(|m| m.len()).sum::<usize>();
let total = leis.len();
let mut no_hit_leis = Vec::new(); // Track LEIs with no data found (no_hit)
let msg = format!("Total LEIs to process: {}, already processed: {}", total, processed);
logger::log_info(&msg).await;
for lei in leis {
-// Check if LEI is already processed in any sector
+// Check if LEI is already processed in any sector (including uncategorized)
let mut already_processed = false;
for sector_map in sector_maps.values() {
if sector_map.contains_key(&lei) {
@@ -464,18 +525,57 @@ async fn build_lei_to_figi_infos_internal(lei_to_isins: &HashMap<String, Vec<Str
};
let unique_isins: Vec<_> = isins.iter().cloned().collect::<HashSet<_>>().into_iter().collect();
let debug_msg = format!("Processing LEI {} with {} ISINs...", lei, unique_isins.len());
logger::log_info(&debug_msg).await;
let all_figi_infos = client.map_isins_to_figi_infos(&unique_isins).await?;
// Case 1: no_hit - API succeeded but returned no data
if all_figi_infos.is_empty() {
let no_hit_msg = format!(" no_hit: LEI {} returned no FIGIs", lei);
logger::log_warn(&no_hit_msg).await;
no_hit_leis.push(lei.clone());
// Remove immediately from GLEIF CSV to prevent progress loss on interrupt
if let Err(e) = remove_lei_from_gleif_csv_single(gleif_cache_dir, &lei).await {
let warn_msg = format!("Warning: Failed to remove LEI {} from GLEIF CSV: {}", lei, e);
eprintln!("{}", warn_msg);
logger::log_warn(&warn_msg).await;
}
continue;
}
let hit_msg = format!(" hit: LEI {} found {} FIGIs", lei, all_figi_infos.len());
logger::log_info(&hit_msg).await;
// Organize results by marketSector
let mut figis_by_sector: HashMap<String, Vec<FigiInfo>> = HashMap::new();
let mut uncategorized_figis = Vec::new();
for figi_info in all_figi_infos {
let sector = figi_info.marketSector.clone();
-if sector.is_empty() {
-continue; // Skip if no sector
-}
-figis_by_sector.entry(sector).or_insert_with(Vec::new).push(figi_info);
+if sector.is_empty() {
+// Case 2: Hit but no marketSecDes - save to uncategorized
+uncategorized_figis.push(figi_info);
+} else {
+// Case 1: Hit with marketSector - organize by sector
+figis_by_sector.entry(sector).or_insert_with(Vec::new).push(figi_info);
+}
}
// Save uncategorized FIGIs if any
if !uncategorized_figis.is_empty() {
uncategorized_figis.sort_by_key(|f| f.figi.clone());
uncategorized_figis.dedup_by_key(|f| f.figi.clone());
append_lei_to_figi_jsonl(&uncategorized_path, &lei, &uncategorized_figis).await
.context("Failed to append to uncategorized JSONL")?;
if let Some(uncategorized_map) = sector_maps.get_mut("uncategorized") {
uncategorized_map.insert(lei.clone(), uncategorized_figis);
}
}
// Save to appropriate sector files
@@ -502,21 +602,19 @@ async fn build_lei_to_figi_infos_internal(lei_to_isins: &HashMap<String, Vec<Str
let count = sector_maps.get(s).map(|m| m.len()).unwrap_or(0);
format!("{}:{}", s, count)
}).collect();
println!("Processed {}/{} LEIs → [{}]", processed, total, totals.join(", "));
let progress_msg = format!("Processed {}/{} LEIs → [{}] no_hit: {}", processed, total, totals.join(", "), no_hit_leis.len());
println!("{}", progress_msg);
logger::log_info(&progress_msg).await;
}
tokio::time::sleep(Duration::from_millis(100)).await;
}
-// Print final summary
-println!("\n=== LEI→FIGI Mapping Complete ===");
-for sector in &sector_dirs {
-if let Some(sector_map) = sector_maps.get(sector) {
-let total_figis: usize = sector_map.values().map(|v| v.len()).sum();
-if total_figis > 0 {
-println!("{}: {} LEIs, {} FIGIs", sector, sector_map.len(), total_figis);
-}
-}
+// Log final summary for no_hit LEIs (they've already been removed incrementally)
+if !no_hit_leis.is_empty() {
+let no_hit_summary = format!("no_hit (removed incrementally from GLEIF): {} LEIs", no_hit_leis.len());
+println!("{}", no_hit_summary);
+logger::log_info(&no_hit_summary).await;
}
// Return Equity sector as the main result
@@ -559,7 +657,9 @@ async fn load_lei_to_figi_jsonl(path: &Path) -> anyhow::Result<HashMap<String, V
map.insert(lei, figis);
}
println!("Loaded LEI→FIGI map with {} entries from {}", map.len(), path.display());
let msg = format!("Loaded LEI→FIGI map with {} entries from {}", map.len(), path.display());
logger::log_info(&msg).await;
Ok(map)
}
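The exact line format of lei_to_figi.jsonl is not visible in this diff; a plausible per-line record, sketched with serde (the field names are assumptions):

use serde::{Deserialize, Serialize};

// Assumed record shape; the real schema lives in load_/append_lei_to_figi_jsonl.
#[derive(Serialize, Deserialize)]
struct LeiFigiLine {
    lei: String,
    figis: Vec<serde_json::Value>, // FigiInfo in the source; left opaque here
}
// One such object per line; the append path would write
// serde_json::to_string(&line) followed by a newline.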
@@ -596,6 +696,160 @@ async fn append_lei_to_figi_jsonl(path: &Path, lei: &str, figis: &[FigiInfo]) ->
Ok(())
}
/// Removes a single invalid LEI from the GLEIF CSV file immediately.
///
/// This function is called after each no_hit detection to prevent progress loss on interrupt.
/// It reads the GLEIF CSV, filters out the specific LEI, and overwrites the file.
///
/// # Arguments
///
/// * `gleif_cache_dir` - Path to the cache/gleif directory
/// * `lei` - The LEI string to remove
///
/// # Returns
/// Ok(()) if successful, Err if file operations fail.
async fn remove_lei_from_gleif_csv_single(gleif_cache_dir: &Path, lei: &str) -> anyhow::Result<()> {
// Find the most recent GLEIF CSV file
let mut entries = tokio_fs::read_dir(gleif_cache_dir)
.await
.context("Failed to read gleif cache directory")?;
let mut csv_files = Vec::new();
while let Some(entry) = entries.next_entry().await? {
let path = entry.path();
if let Some(filename) = path.file_name() {
let filename_str = filename.to_string_lossy();
if filename_str.ends_with(".csv") && filename_str.contains("isin-lei-") {
csv_files.push(path);
}
}
}
if csv_files.is_empty() {
return Ok(());
}
// Sort and get the most recent
csv_files.sort();
csv_files.reverse();
let gleif_file = &csv_files[0];
// Read the CSV file
let content = tokio_fs::read_to_string(gleif_file)
.await
.context("Failed to read GLEIF CSV")?;
// Filter out line with this LEI
let filtered_lines: Vec<&str> = content
.lines()
.filter(|line| {
// GLEIF CSV format: ISIN,LEI
let parts: Vec<&str> = line.split(',').collect();
if parts.len() >= 2 {
parts[1] != lei
} else {
true // Keep lines that don't match the format (e.g., header)
}
})
.collect();
// Only write if something was actually removed
if filtered_lines.len() < content.lines().count() {
let new_content = filtered_lines.join("\n") + "\n";
tokio_fs::write(gleif_file, new_content)
.await
.context("Failed to write filtered GLEIF CSV")?;
}
Ok(())
}
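The filter keeps the header and every row whose second column differs from the target LEI. A tiny self-contained check of that predicate:

// Stand-alone illustration of the ISIN,LEI filter predicate used above.
fn main() {
    let content = "ISIN,LEI\nUS0000000001,LEI_KEEP\nUS0000000002,LEI_DROP\n";
    let kept: Vec<&str> = content
        .lines()
        .filter(|line| {
            let parts: Vec<&str> = line.split(',').collect();
            parts.len() < 2 || parts[1] != "LEI_DROP"
        })
        .collect();
    assert_eq!(kept, vec!["ISIN,LEI", "US0000000001,LEI_KEEP"]);
}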
/// Removes invalid LEIs from the GLEIF CSV file.
///
/// When an API call succeeds but returns no data (no_hit), the LEI is considered invalid
/// and should be removed from the GLEIF CSV to prevent re-scraping on future runs.
///
/// This function reads the GLEIF CSV, filters out the specified LEIs, and overwrites the file.
///
/// # Arguments
///
/// * `gleif_cache_dir` - Path to the cache/gleif directory
/// * `leis_to_remove` - Vec of LEI strings to remove
///
/// # Returns
/// Ok(()) if successful, Err if file operations fail.
async fn remove_leis_from_gleif_csv(gleif_cache_dir: &Path, leis_to_remove: &[String]) -> anyhow::Result<()> {
logger::log_info(&format!("Removing {} invalid LEIs from GLEIF CSV...", leis_to_remove.len())).await;
// Find the most recent GLEIF CSV file
let mut entries = tokio_fs::read_dir(gleif_cache_dir)
.await
.context("Failed to read gleif cache directory")?;
let mut csv_files = Vec::new();
while let Some(entry) = entries.next_entry().await? {
let path = entry.path();
if let Some(filename) = path.file_name() {
let filename_str = filename.to_string_lossy();
if filename_str.ends_with(".csv") && filename_str.contains("isin-lei-") {
csv_files.push(path);
}
}
}
if csv_files.is_empty() {
logger::log_warn("No GLEIF CSV files found for removal operation").await;
return Ok(());
}
// Sort and get the most recent
csv_files.sort();
csv_files.reverse();
let gleif_file = &csv_files[0];
let debug_msg = format!("Reading GLEIF file: {}", gleif_file.display());
logger::log_info(&debug_msg).await;
// Read the CSV file
let content = tokio_fs::read_to_string(gleif_file)
.await
.context("Failed to read GLEIF CSV")?;
let original_lines = content.lines().count();
// Convert LEIs to remove into a HashSet for faster lookup
let remove_set: std::collections::HashSet<_> = leis_to_remove.iter().cloned().collect();
// Filter out lines with LEIs to remove
let filtered_lines: Vec<&str> = content
.lines()
.filter(|line| {
// GLEIF CSV format: ISIN,LEI
let parts: Vec<&str> = line.split(',').collect();
if parts.len() >= 2 {
!remove_set.contains(parts[1])
} else {
true // Keep lines that don't match the format (e.g., header)
}
})
.collect();
let removed_count = original_lines - filtered_lines.len();
// Write back the filtered content
let new_content = filtered_lines.join("\n") + "\n";
tokio_fs::write(gleif_file, new_content)
.await
.context("Failed to write filtered GLEIF CSV")?;
let success_msg = format!("✓ Removed {} invalid LEIs from GLEIF CSV (was {} lines, now {} lines)", leis_to_remove.len(), original_lines, filtered_lines.len());
println!("{}", success_msg);
logger::log_info(&success_msg).await;
Ok(())
}
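The mapping loop above calls the single-LEI variant after each no_hit; this batch variant's caller is not shown in this diff. A hedged call-site sketch (the directory and LEIs are placeholders):

// Hypothetical call site; the path and LEIs are placeholders.
async fn remove_no_hits_sketch() -> anyhow::Result<()> {
    let no_hits = vec!["LEI_A".to_string(), "LEI_B".to_string()];
    remove_leis_from_gleif_csv(std::path::Path::new("cache/gleif"), &no_hits).await
}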
/// Loads or builds HashMaps for companies, warrants, and options.
///
/// This function:
@@ -1045,7 +1299,7 @@ where
pub async fn load_figi_type_lists() -> anyhow::Result<()> {
println!("Loading OpenFIGI mapping value lists...");
-let client = OpenFigiClient::new()?;
+let client = OpenFigiClient::new().await?;
// Create cache directory
let dir = DataPaths::new(".")?;