removing non-mappable LEIs
@@ -1,4 +1,5 @@
 use crate::util::directories::DataPaths;
+use crate::util::logger;
 
 // src/corporate/openfigi.rs
 use super::{types::*};
@@ -27,7 +28,7 @@ impl OpenFigiClient {
     /// # Errors
     ///
    /// Returns an error if the HTTP client cannot be built or if the API key header is invalid.
-    pub fn new() -> anyhow::Result<Self> {
+    pub async fn new() -> anyhow::Result<Self> {
         let api_key = dotenvy::var("OPENFIGI_API_KEY").ok();
         let has_key = api_key.is_some();
 
@@ -43,10 +44,11 @@ impl OpenFigiClient {
 
         let client = builder.build().context("Failed to build HTTP client")?;
 
-        println!(
+        let msg = format!(
             "OpenFIGI client initialized: {}",
             if has_key { "with API key" } else { "no key (limited mode)" }
         );
+        logger::log_info(&msg).await;
 
         Ok(Self { client, has_key })
     }
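Note: this commit routes every console message through `crate::util::logger`, which is also why `new()` becomes `async`. The logger itself is not part of this diff. Below is a minimal sketch of the interface the call sites imply (async free functions taking `&str`); the log file name and line format are assumptions, not code from the repository:

```rust
// Hypothetical sketch of the logger interface this diff relies on.
// Only the signatures (async fns taking &str) are implied by the call
// sites; the file target and line format here are assumptions.
use tokio::fs::OpenOptions;
use tokio::io::AsyncWriteExt;

pub async fn log_info(msg: &str) {
    write_line("INFO", msg).await;
}

pub async fn log_warn(msg: &str) {
    write_line("WARN", msg).await;
}

pub async fn log_error(msg: &str) {
    write_line("ERROR", msg).await;
}

async fn write_line(level: &str, msg: &str) {
    // Swallow I/O errors: logging must never take down the pipeline.
    if let Ok(mut f) = OpenOptions::new()
        .create(true)
        .append(true)
        .open("update.log")
        .await
    {
        let _ = f.write_all(format!("[{}] {}\n", level, msg).as_bytes()).await;
    }
}
```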
@@ -126,10 +128,16 @@ impl OpenFigiClient {
             Err(e) => {
                 retry_count += 1;
                 if retry_count >= max_retries {
-                    return Err(anyhow!("Failed to send mapping request after {} retries: {}", max_retries, e));
+                    let err_msg = format!("Failed to send mapping request after {} retries: {}", max_retries, e);
+                    logger::log_error(&err_msg).await;
+                    return Err(anyhow!(err_msg));
                 }
-                eprintln!("Transient error sending mapping request (attempt {}/{}): {}", retry_count, max_retries, e);
-                println!(" Retrying in {}ms...", backoff_ms);
+                let warn_msg = format!("Transient error sending mapping request (attempt {}/{}): {}", retry_count, max_retries, e);
+                eprintln!("{}", warn_msg);
+                logger::log_warn(&warn_msg).await;
+                let retry_msg = format!(" Retrying in {}ms...", backoff_ms);
+                println!("{}", retry_msg);
+                logger::log_info(&retry_msg).await;
                 sleep(Duration::from_millis(backoff_ms)).await;
                 backoff_ms = (backoff_ms * 2).min(60000); // Cap at 60s
                 continue;
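The retry arm above uses capped exponential backoff: the delay doubles after each transient failure and is clamped to 60 s. Factored out as a standalone schedule, the same policy can be tested in isolation. In this sketch the 500 ms starting delay is an assumption; the diff only shows the doubling and the cap:

```rust
use std::time::Duration;

/// Capped exponential backoff: 500ms, 1s, 2s, ... clamped at 60s.
/// Mirrors the `backoff_ms = (backoff_ms * 2).min(60000)` update above;
/// the 500ms starting value is an assumption.
fn backoff_schedule(max_retries: u32) -> impl Iterator<Item = Duration> {
    let mut ms: u64 = 500;
    (0..max_retries).map(move |_| {
        let d = Duration::from_millis(ms);
        ms = (ms * 2).min(60_000);
        d
    })
}

fn main() {
    for (attempt, delay) in backoff_schedule(8).enumerate() {
        println!("attempt {}: wait {:?}", attempt + 1, delay);
    }
}
```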
@@ -147,7 +155,9 @@ impl OpenFigiClient {
                     .and_then(|v| v.to_str().ok())
                     .and_then(|s| s.parse::<u64>().ok())
                     .unwrap_or(10);
-                println!("Rate limited—backing off {}s", reset_sec);
+                let rate_msg = format!("Rate limited—backing off {}s", reset_sec);
+                println!("{}", rate_msg);
+                logger::log_warn(&rate_msg).await;
                 sleep(Duration::from_secs(reset_sec.max(10))).await;
                 continue; // Retry the same chunk
             } else if status == 401 {
@@ -158,9 +168,13 @@ impl OpenFigiClient {
                 // Transient server error, retry with backoff
                 retry_count += 1;
                 if retry_count >= max_retries {
-                    return Err(anyhow!("OpenFIGI server error {} after {} retries: {}", status, max_retries, body));
+                    let err_msg = format!("OpenFIGI server error {} after {} retries: {}", status, max_retries, body);
+                    logger::log_error(&err_msg).await;
+                    return Err(anyhow!(err_msg));
                 }
-                eprintln!("Server error {} (attempt {}/{}), retrying in {}ms...", status, retry_count, max_retries, backoff_ms);
+                let warn_msg = format!("Server error {} (attempt {}/{}), retrying in {}ms...", status, retry_count, max_retries, backoff_ms);
+                eprintln!("{}", warn_msg);
+                logger::log_warn(&warn_msg).await;
                 sleep(Duration::from_millis(backoff_ms)).await;
                 backoff_ms = (backoff_ms * 2).min(60000);
                 continue;
@@ -260,7 +274,9 @@ async fn load_market_sectors() -> anyhow::Result<Vec<String>> {
 
     if !cache_file.exists() {
         // Return default if file doesn't exist
-        eprintln!("Warning: {} not found, using default sectors", cache_file.display());
+        let warn_msg = format!("Warning: {} not found, using default sectors", cache_file.display());
+        eprintln!("{}", warn_msg);
+        logger::log_warn(&warn_msg).await;
         return Ok(vec![
             "Comdty".to_string(),
             "Corp".to_string(),
@@ -292,7 +308,8 @@ async fn load_market_sectors() -> anyhow::Result<Vec<String>> {
         return Err(anyhow!("No sectors found in marketSecDes.json"));
     }
 
-    println!("Loaded {} market sectors from cache", sectors.len());
+    let msg = format!("Loaded {} market sectors from cache", sectors.len());
+    logger::log_info(&msg).await;
     Ok(sectors)
 }
 
@@ -328,7 +345,9 @@ async fn find_most_recent_gleif_date(gleif_cache_dir: &Path) -> anyhow::Result<O
     let most_recent = &csv_files[0];
     let date = extract_gleif_date_from_filename(most_recent);
 
-    println!(" Found GLEIF data dated: {}", date);
+    let msg = format!(" Found GLEIF data dated: {}", date);
+    logger::log_info(&msg).await;
     Ok(Some(date))
 }
 
@@ -359,7 +378,9 @@ pub async fn build_lei_to_figi_infos(lei_to_isins: &HashMap<String, Vec<String>>
     match build_lei_to_figi_infos_internal(lei_to_isins, gleif_date).await {
         Ok(map) => {
             if !map.is_empty() {
-                println!("✓ LEI→FIGI mapping completed successfully with {} entries", map.len());
+                let msg = format!("✓ LEI→FIGI mapping completed successfully with {} entries", map.len());
+                logger::log_info(&msg).await;
             }
             return Ok(map);
         }
@@ -372,19 +393,27 @@ pub async fn build_lei_to_figi_infos(lei_to_isins: &HashMap<String, Vec<String>>
                 || error_msg.contains("Failed to create");
 
             if is_fatal {
-                eprintln!("Fatal error in LEI→FIGI mapping: {}", e);
+                let err = format!("Fatal error in LEI→FIGI mapping: {}", e);
+                eprintln!("{}", err);
+                logger::log_error(&err).await;
                 return Err(e);
             }
 
             retry_count += 1;
             if retry_count >= max_retries {
-                eprintln!("LEI→FIGI mapping failed after {} retries: {}", max_retries, e);
+                let err = format!("LEI→FIGI mapping failed after {} retries: {}", max_retries, e);
+                eprintln!("{}", err);
+                logger::log_error(&err).await;
                 return Err(e);
             }
 
             let wait_secs = 60 * retry_count;
-            eprintln!("Transient error in LEI→FIGI mapping (attempt {}/{}): {}", retry_count, max_retries, e);
-            println!("Retrying mapping in {}s...", wait_secs);
+            let warn_msg = format!("Transient error in LEI→FIGI mapping (attempt {}/{}): {}", retry_count, max_retries, e);
+            eprintln!("{}", warn_msg);
+            logger::log_warn(&warn_msg).await;
+            let retry_msg = format!("Retrying mapping in {}s...", wait_secs);
+            println!("{}", retry_msg);
+            logger::log_info(&retry_msg).await;
             sleep(Duration::from_secs(wait_secs as u64)).await;
         }
     }
@@ -396,6 +425,11 @@ pub async fn build_lei_to_figi_infos(lei_to_isins: &HashMap<String, Vec<String>>
 /// This is the actual worker function that performs the mapping. It handles already-processed
 /// LEIs gracefully but will fail on transient errors, which are caught and retried by the
 /// wrapper function build_lei_to_figi_infos.
+///
+/// Tracks three outcomes:
+/// 1. Hit with marketSector: saved to sector-specific folder
+/// 2. Hit without marketSector: saved to "uncategorized" folder
+/// 3. No_hit (empty results): LEI marked for removal from GLEIF CSV
 async fn build_lei_to_figi_infos_internal(lei_to_isins: &HashMap<String, Vec<String>>, gleif_date: Option<&str>) -> anyhow::Result<HashMap<String, Vec<FigiInfo>>> {
     let dir = DataPaths::new(".")?;
     let gleif_cache_dir = dir.cache_gleif_dir();
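The three outcomes in the new doc comment correspond to two collections plus a removal list in the processing loop further down. A compact sketch of the same classification as a pure function, using an illustrative stand-in for the crate's `FigiInfo`:

```rust
use std::collections::HashMap;

// Illustrative stand-in for the crate's FigiInfo type.
struct FigiInfo {
    market_sector: String,
}

/// Per-LEI result split, mirroring the three outcomes above:
/// `None` means no_hit; otherwise the hits are split into
/// sectored (outcome 1) and uncategorized (outcome 2) buckets.
fn classify(figis: Vec<FigiInfo>) -> Option<(HashMap<String, Vec<FigiInfo>>, Vec<FigiInfo>)> {
    if figis.is_empty() {
        return None; // no_hit: the caller removes the LEI from the GLEIF CSV
    }
    let mut by_sector: HashMap<String, Vec<FigiInfo>> = HashMap::new();
    let mut uncategorized = Vec::new();
    for f in figis {
        if f.market_sector.is_empty() {
            uncategorized.push(f); // hit without marketSector
        } else {
            by_sector.entry(f.market_sector.clone()).or_default().push(f);
        }
    }
    Some((by_sector, uncategorized))
}
```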
@@ -403,23 +437,42 @@ async fn build_lei_to_figi_infos_internal(lei_to_isins: &HashMap<String, Vec<Str
 
     // Determine the GLEIF date to use
     let date = if let Some(d) = gleif_date {
+        let msg = format!("Using provided GLEIF date: {}", d);
+        logger::log_info(&msg).await;
         d.to_string()
     } else {
         // Find the most recent GLEIF file
+        logger::log_info("Searching for most recent GLEIF file...").await;
         match find_most_recent_gleif_date(gleif_cache_dir).await? {
             Some(d) => d,
-            None => return Err(anyhow!("No GLEIF CSV file found in cache/gleif directory")),
+            None => {
+                let err = "No GLEIF CSV file found in cache/gleif directory";
+                logger::log_error(err).await;
+                return Err(anyhow!(err));
+            },
         }
     };
 
     // Create date-based subdirectory in the mapping cache
+    let msg = format!("Creating date directory for: {}", date);
+    logger::log_info(&msg).await;
     let date_dir = map_cache_dir.join(&date);
     tokio_fs::create_dir_all(&date_dir).await.context("Failed to create date directory")?;
 
     // Load market sectors dynamically from cache
+    logger::log_info("Loading market sectors...").await;
     let sector_dirs = load_market_sectors().await?;
     let mut sector_maps: HashMap<String, HashMap<String, Vec<FigiInfo>>> = HashMap::new();
 
+    // Create uncategorized folder
+    let msg = format!("Creating {} sector directories...", sector_dirs.len());
+    logger::log_info(&msg).await;
+    let uncategorized_dir = date_dir.join("uncategorized");
+    tokio_fs::create_dir_all(&uncategorized_dir).await.context("Failed to create uncategorized directory")?;
+    let uncategorized_path = uncategorized_dir.join("lei_to_figi.jsonl");
+    let uncategorized_map = load_lei_to_figi_jsonl(&uncategorized_path).await?;
+    sector_maps.insert("uncategorized".to_string(), uncategorized_map);
 
     for sector in &sector_dirs {
         let sector_dir = date_dir.join(sector);
         tokio_fs::create_dir_all(&sector_dir).await.context("Failed to create sector directory")?;
@@ -430,22 +483,30 @@ async fn build_lei_to_figi_infos_internal(lei_to_isins: &HashMap<String, Vec<Str
         sector_maps.insert(sector.clone(), lei_map);
     }
 
-    let client = OpenFigiClient::new()?;
+    let client = OpenFigiClient::new().await?;
     if !client.has_key {
         let total_entries: usize = sector_maps.values().map(|m| m.len()).sum();
-        println!("No API key—using partial LEI→FIGI maps with {} total entries", total_entries);
+        let msg = format!("No API key—using partial LEI→FIGI maps with {} total entries", total_entries);
+        logger::log_warn(&msg).await;
         return Ok(sector_maps.get("Equity").cloned().unwrap_or_default());
     }
 
     // Sort LEIs for deterministic processing order
+    logger::log_info("Starting LEI→FIGI mapping process...").await;
     let mut leis: Vec<_> = lei_to_isins.keys().cloned().collect();
     leis.sort();
 
     let mut processed = sector_maps.values().map(|m| m.len()).sum::<usize>();
     let total = leis.len();
+    let mut no_hit_leis = Vec::new(); // Track LEIs with no data found (no_hit)
 
+    let msg = format!("Total LEIs to process: {}, already processed: {}", total, processed);
+    logger::log_info(&msg).await;
 
     for lei in leis {
-        // Check if LEI is already processed in any sector
+        // Check if LEI is already processed in any sector (including uncategorized)
         let mut already_processed = false;
         for sector_map in sector_maps.values() {
             if sector_map.contains_key(&lei) {
@@ -464,18 +525,57 @@ async fn build_lei_to_figi_infos_internal(lei_to_isins: &HashMap<String, Vec<Str
         };
 
         let unique_isins: Vec<_> = isins.iter().cloned().collect::<HashSet<_>>().into_iter().collect();
+        let debug_msg = format!("Processing LEI {} with {} ISINs...", lei, unique_isins.len());
+        logger::log_info(&debug_msg).await;
 
         let all_figi_infos = client.map_isins_to_figi_infos(&unique_isins).await?;
 
+        // Case 3: no_hit - API succeeded but returned no data
+        if all_figi_infos.is_empty() {
+            let no_hit_msg = format!(" no_hit: LEI {} returned no FIGIs", lei);
+            logger::log_warn(&no_hit_msg).await;
+            no_hit_leis.push(lei.clone());
+
+            // Remove immediately from GLEIF CSV to prevent progress loss on interrupt
+            if let Err(e) = remove_lei_from_gleif_csv_single(gleif_cache_dir, &lei).await {
+                let warn_msg = format!("Warning: Failed to remove LEI {} from GLEIF CSV: {}", lei, e);
+                eprintln!("{}", warn_msg);
+                logger::log_warn(&warn_msg).await;
+            }
+
+            continue;
+        }
+
+        let hit_msg = format!(" hit: LEI {} found {} FIGIs", lei, all_figi_infos.len());
+        logger::log_info(&hit_msg).await;
 
         // Organize results by marketSector
         let mut figis_by_sector: HashMap<String, Vec<FigiInfo>> = HashMap::new();
+        let mut uncategorized_figis = Vec::new();
 
         for figi_info in all_figi_infos {
             let sector = figi_info.marketSector.clone();
-            if sector.is_empty() {
-                continue; // Skip if no sector
-            }
-
-            figis_by_sector.entry(sector).or_insert_with(Vec::new).push(figi_info);
+            if sector.is_empty() {
+                // Case 2: Hit but no marketSecDes - save to uncategorized
+                uncategorized_figis.push(figi_info);
+            } else {
+                // Case 1: Hit with marketSector - organize by sector
+                figis_by_sector.entry(sector).or_insert_with(Vec::new).push(figi_info);
+            }
+        }
+
+        // Save uncategorized FIGIs if any
+        if !uncategorized_figis.is_empty() {
+            uncategorized_figis.sort_by_key(|f| f.figi.clone());
+            uncategorized_figis.dedup_by_key(|f| f.figi.clone());
+
+            append_lei_to_figi_jsonl(&uncategorized_path, &lei, &uncategorized_figis).await
+                .context("Failed to append to uncategorized JSONL")?;
+
+            if let Some(uncategorized_map) = sector_maps.get_mut("uncategorized") {
+                uncategorized_map.insert(lei.clone(), uncategorized_figis);
+            }
         }
 
         // Save to appropriate sector files
@@ -502,21 +602,19 @@ async fn build_lei_to_figi_infos_internal(lei_to_isins: &HashMap<String, Vec<Str
                 let count = sector_maps.get(s).map(|m| m.len()).unwrap_or(0);
                 format!("{}:{}", s, count)
             }).collect();
-            println!("Processed {}/{} LEIs → [{}]", processed, total, totals.join(", "));
+            let progress_msg = format!("Processed {}/{} LEIs → [{}] no_hit: {}", processed, total, totals.join(", "), no_hit_leis.len());
+            println!("{}", progress_msg);
+            logger::log_info(&progress_msg).await;
         }
 
         tokio::time::sleep(Duration::from_millis(100)).await;
     }
 
-    // Print final summary
-    println!("\n=== LEI→FIGI Mapping Complete ===");
-    for sector in &sector_dirs {
-        if let Some(sector_map) = sector_maps.get(sector) {
-            let total_figis: usize = sector_map.values().map(|v| v.len()).sum();
-            if total_figis > 0 {
-                println!("{}: {} LEIs, {} FIGIs", sector, sector_map.len(), total_figis);
-            }
-        }
-    }
+    // Log final summary for no_hit LEIs (they've already been removed incrementally)
+    if !no_hit_leis.is_empty() {
+        let no_hit_summary = format!("no_hit (removed incrementally from GLEIF): {} LEIs", no_hit_leis.len());
+        println!("{}", no_hit_summary);
+        logger::log_info(&no_hit_summary).await;
+    }
 
     // Return Equity sector as the main result
@@ -559,7 +657,9 @@ async fn load_lei_to_figi_jsonl(path: &Path) -> anyhow::Result<HashMap<String, V
         map.insert(lei, figis);
     }
 
-    println!("Loaded LEI→FIGI map with {} entries from {}", map.len(), path.display());
+    let msg = format!("Loaded LEI→FIGI map with {} entries from {}", map.len(), path.display());
+    logger::log_info(&msg).await;
     Ok(map)
 }
 
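`load_lei_to_figi_jsonl` and its companion `append_lei_to_figi_jsonl` are what make the run resumable: each LEI is appended as one JSON line, and a restart reloads the file and skips anything already present. The concrete record layout is not shown in this diff; the sketch below assumes each line carries the LEI and its FIGI entries:

```rust
use serde::{Deserialize, Serialize};
use std::collections::HashMap;

// Assumed record shape; the diff does not show the actual serde layout.
#[derive(Serialize, Deserialize)]
struct LeiRecord {
    lei: String,
    figis: Vec<serde_json::Value>, // stand-in for Vec<FigiInfo>
}

fn load(content: &str) -> HashMap<String, Vec<serde_json::Value>> {
    // One JSON object per line; a later line for the same LEI wins,
    // which is what makes append-only resume safe.
    content
        .lines()
        .filter_map(|l| serde_json::from_str::<LeiRecord>(l).ok())
        .map(|r| (r.lei, r.figis))
        .collect()
}
```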
@@ -596,6 +696,160 @@ async fn append_lei_to_figi_jsonl(path: &Path, lei: &str, figis: &[FigiInfo]) ->
     Ok(())
 }
 
+/// Removes a single invalid LEI from the GLEIF CSV file immediately.
+///
+/// This function is called after each no_hit detection to prevent progress loss on interrupt.
+/// It reads the GLEIF CSV, filters out the specific LEI, and overwrites the file.
+///
+/// # Arguments
+///
+/// * `gleif_cache_dir` - Path to the cache/gleif directory
+/// * `lei` - The LEI string to remove
+///
+/// # Returns
+/// Ok(()) if successful, Err if file operations fail.
+async fn remove_lei_from_gleif_csv_single(gleif_cache_dir: &Path, lei: &str) -> anyhow::Result<()> {
+    // Find the most recent GLEIF CSV file
+    let mut entries = tokio_fs::read_dir(gleif_cache_dir)
+        .await
+        .context("Failed to read gleif cache directory")?;
+
+    let mut csv_files = Vec::new();
+
+    while let Some(entry) = entries.next_entry().await? {
+        let path = entry.path();
+        if let Some(filename) = path.file_name() {
+            let filename_str = filename.to_string_lossy();
+            if filename_str.ends_with(".csv") && filename_str.contains("isin-lei-") {
+                csv_files.push(path);
+            }
+        }
+    }
+
+    if csv_files.is_empty() {
+        return Ok(());
+    }
+
+    // Sort and get the most recent
+    csv_files.sort();
+    csv_files.reverse();
+    let gleif_file = &csv_files[0];
+
+    // Read the CSV file
+    let content = tokio_fs::read_to_string(gleif_file)
+        .await
+        .context("Failed to read GLEIF CSV")?;
+
+    // Filter out the line with this LEI
+    let filtered_lines: Vec<&str> = content
+        .lines()
+        .filter(|line| {
+            // GLEIF CSV format: ISIN,LEI
+            let parts: Vec<&str> = line.split(',').collect();
+            if parts.len() >= 2 {
+                parts[1] != lei
+            } else {
+                true // Keep lines that don't match the format (e.g., header)
+            }
+        })
+        .collect();
+
+    // Only write if something was actually removed
+    if filtered_lines.len() < content.lines().count() {
+        let new_content = filtered_lines.join("\n") + "\n";
+        tokio_fs::write(gleif_file, new_content)
+            .await
+            .context("Failed to write filtered GLEIF CSV")?;
+    }
+
+    Ok(())
+}
+
+/// Removes invalid LEIs from the GLEIF CSV file.
+///
+/// When an API call succeeds but returns no data (no_hit), the LEI is considered invalid
+/// and should be removed from the GLEIF CSV to prevent re-scraping on future runs.
+///
+/// This function reads the GLEIF CSV, filters out the specified LEIs, and overwrites the file.
+///
+/// # Arguments
+///
+/// * `gleif_cache_dir` - Path to the cache/gleif directory
+/// * `leis_to_remove` - Vec of LEI strings to remove
+///
+/// # Returns
+/// Ok(()) if successful, Err if file operations fail.
+async fn remove_leis_from_gleif_csv(gleif_cache_dir: &Path, leis_to_remove: &[String]) -> anyhow::Result<()> {
+    logger::log_info(&format!("Removing {} invalid LEIs from GLEIF CSV...", leis_to_remove.len())).await;
+
+    // Find the most recent GLEIF CSV file
+    let mut entries = tokio_fs::read_dir(gleif_cache_dir)
+        .await
+        .context("Failed to read gleif cache directory")?;
+
+    let mut csv_files = Vec::new();
+
+    while let Some(entry) = entries.next_entry().await? {
+        let path = entry.path();
+        if let Some(filename) = path.file_name() {
+            let filename_str = filename.to_string_lossy();
+            if filename_str.ends_with(".csv") && filename_str.contains("isin-lei-") {
+                csv_files.push(path);
+            }
+        }
+    }
+
+    if csv_files.is_empty() {
+        logger::log_warn("No GLEIF CSV files found for removal operation").await;
+        return Ok(());
+    }
+
+    // Sort and get the most recent
+    csv_files.sort();
+    csv_files.reverse();
+    let gleif_file = &csv_files[0];
+    let debug_msg = format!("Reading GLEIF file: {}", gleif_file.display());
+    logger::log_info(&debug_msg).await;
+
+    // Read the CSV file
+    let content = tokio_fs::read_to_string(gleif_file)
+        .await
+        .context("Failed to read GLEIF CSV")?;
+
+    let original_lines = content.lines().count();
+
+    // Convert LEIs to remove into a HashSet for faster lookup
+    let remove_set: std::collections::HashSet<_> = leis_to_remove.iter().cloned().collect();
+
+    // Filter out lines with LEIs to remove
+    let filtered_lines: Vec<&str> = content
+        .lines()
+        .filter(|line| {
+            // GLEIF CSV format: ISIN,LEI
+            let parts: Vec<&str> = line.split(',').collect();
+            if parts.len() >= 2 {
+                !remove_set.contains(parts[1])
+            } else {
+                true // Keep lines that don't match the format (e.g., header)
+            }
+        })
+        .collect();
+
+    let removed_count = original_lines - filtered_lines.len();
+
+    // Write back the filtered content
+    let new_content = filtered_lines.join("\n") + "\n";
+    tokio_fs::write(gleif_file, new_content)
+        .await
+        .context("Failed to write filtered GLEIF CSV")?;
+
+    let success_msg = format!("✓ Removed {} invalid LEIs from GLEIF CSV (was {} lines, now {} lines)", removed_count, original_lines, filtered_lines.len());
+    println!("{}", success_msg);
+    logger::log_info(&success_msg).await;
+
+    Ok(())
+}
+
 /// Loads or builds HashMaps for companies, warrants, and options.
 ///
 /// This function:
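Both removal functions share one core predicate: keep a CSV line unless its second column (the LEI, per the `ISIN,LEI` comment) is marked for removal. Extracted as a pure function it is easy to unit-test. A sketch; the sample rows are illustrative:

```rust
use std::collections::HashSet;

/// Keep a GLEIF CSV line unless its LEI column is in `remove`.
/// Lines that don't parse as `ISIN,LEI` (e.g. a header) are kept,
/// matching the filter logic in the diff above.
fn keep_line(line: &str, remove: &HashSet<&str>) -> bool {
    match line.split(',').nth(1) {
        Some(lei) => !remove.contains(lei),
        None => true,
    }
}

fn main() {
    // Example LEIs are illustrative.
    let remove: HashSet<&str> = ["5493001KJTIIGC8Y1R12"].into_iter().collect();
    let csv = "ISIN,LEI\nUS0378331005,HWUPKR0MPOU8FGXBT394\nXS1234567890,5493001KJTIIGC8Y1R12";
    let kept: Vec<&str> = csv.lines().filter(|l| keep_line(l, &remove)).collect();
    assert_eq!(kept.len(), 2); // the header and the unflagged row survive
    println!("{:#?}", kept);
}
```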
@@ -1045,7 +1299,7 @@ where
 pub async fn load_figi_type_lists() -> anyhow::Result<()> {
     println!("Loading OpenFIGI mapping value lists...");
 
-    let client = OpenFigiClient::new()?;
+    let client = OpenFigiClient::new().await?;
 
     // Create cache directory
     let dir = DataPaths::new(".")?;
@@ -238,7 +238,7 @@ fn infer_currency_from_ticker(ticker: &str) -> String {
 /// Returns an error if file operations or serialization fails.
 pub async fn save_companies_to_jsonl(
     paths: &DataPaths,
-    companies: &HashMap<String, Vec<(String, String)>>,
+    companies: &HashMap<String, HashMap<String, String>>,
 ) -> anyhow::Result<()> {
     let file_path = paths.data_dir().join("companies.jsonl");
 
@@ -83,19 +83,25 @@ pub async fn run_full_update(config: &Config, pool: &Arc<ChromeDriverPool>) -> a
     println!("{}", msg);
     logger::log_info(&msg).await;
 
-    // HashMap<Name, Vec<(ISIN, Ticker)>>
-    let companies: HashMap<String, Vec<(String, String)>> = securities.0
+    // HashMap<Name, HashMap<ISIN, Ticker>> - unique pairs only
+    let companies: HashMap<String, HashMap<String, String>> = securities.0
         .iter()
         .fold(HashMap::new(), |mut acc, security| {
-            let isin: Vec<String> = security.1.securities.values()
-                .flat_map(|figi_info| figi_info.iter().map(|x| x.isin.clone()))
-                .collect();
-            let ticker: Vec<String> = security.1.securities.values()
-                .flat_map(|figi_info| figi_info.iter().map(|x| x.ticker.clone()))
-                .collect();
-            acc.entry(security.1.name.clone())
-                .or_insert_with(Vec::new)
-                .push((isin.join(", "), ticker.join(", ")));
+            let mut isin_ticker_pairs: HashMap<String, String> = HashMap::new();
+
+            // Collect all unique ISIN-Ticker pairs
+            for figi_infos in security.1.securities.values() {
+                for figi_info in figi_infos {
+                    if !figi_info.isin.is_empty() && !figi_info.ticker.is_empty() {
+                        isin_ticker_pairs.insert(figi_info.isin.clone(), figi_info.ticker.clone());
+                    }
+                }
+            }
+
+            // Only add if there are pairs
+            if !isin_ticker_pairs.is_empty() {
+                acc.insert(security.1.name.clone(), isin_ticker_pairs);
+            }
             acc
         });
 
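Switching the accumulator from `Vec<(String, String)>` to `HashMap<String, String>` makes deduplication structural: re-inserting an ISIN overwrites its ticker, so duplicate FIGI entries collapse to one pair per ISIN, which is exactly the "unique pairs only" comment. A minimal illustration:

```rust
use std::collections::HashMap;

fn main() {
    let hits = [
        ("US0378331005", "AAPL"),
        ("US0378331005", "AAPL"), // duplicate FIGI entry for the same ISIN
        ("US5949181045", "MSFT"),
    ];

    let mut pairs: HashMap<String, String> = HashMap::new();
    for (isin, ticker) in hits {
        // Inserting an existing key overwrites it, so duplicates vanish.
        pairs.insert(isin.to_string(), ticker.to_string());
    }

    // Two unique ISIN→ticker pairs remain.
    assert_eq!(pairs.len(), 2);
}
```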