// src/corporate/openfigi.rs - STREAMING VERSION
// Key changes: Never load entire GLEIF CSV or FIGI maps into memory
use crate::util::directories::DataPaths;
use crate::util::logger;
use super::types::*;
use reqwest::Client as HttpClient;
use reqwest::header::{HeaderMap, HeaderValue};
use serde_json::{json, Value};
use std::collections::{HashMap, HashSet};
use std::path::Path;
use std::io::{BufRead, BufReader};
use tokio::time::{sleep, Duration};
use tokio::fs as tokio_fs;
use tokio::io::{AsyncBufReadExt, AsyncWriteExt};
use anyhow::{Context, anyhow};
const LEI_BATCH_SIZE: usize = 100; // Process 100 LEIs at a time
#[derive(Clone)]
pub struct OpenFigiClient {
    client: HttpClient,
    has_key: bool,
}
impl OpenFigiClient {
    pub async fn new() -> anyhow::Result<Self> {
        let api_key = dotenvy::var("OPENFIGI_API_KEY").ok();
        let has_key = api_key.is_some();
        let mut builder = HttpClient::builder()
            .user_agent("Mozilla/5.0 (compatible; OpenFIGI-Rust/1.0)")
            .timeout(Duration::from_secs(30));
        if let Some(key) = &api_key {
            let mut headers = HeaderMap::new();
            headers.insert("X-OPENFIGI-APIKEY", HeaderValue::from_str(key)?);
            builder = builder.default_headers(headers);
        }
        let client = builder.build().context("Failed to build HTTP client")?;
        logger::log_info(&format!(
            "OpenFIGI client: {}",
            if has_key { "with API key" } else { "no key" }
        ))
        .await;
        Ok(Self { client, has_key })
    }

    pub async fn map_isins_to_figi_infos(&self, isins: &[String]) -> anyhow::Result<Vec<FigiInfo>> {
        if isins.is_empty() {
            return Ok(vec![]);
        }
        let mut all_figi_infos = Vec::new();
        // Keyed clients may send larger batches and pause less between requests.
        let chunk_size = if self.has_key { 100 } else { 5 };
        let inter_sleep = if self.has_key {
            Duration::from_millis(240)
        } else {
            Duration::from_millis(2400)
        };
        for chunk in isins.chunks(chunk_size) {
            let jobs: Vec<Value> = chunk
                .iter()
                .map(|isin| {
                    json!({
                        "idType": "ID_ISIN",
                        "idValue": isin,
                    })
                })
                .collect();
            let mut retry_count = 0;
            let max_retries = 5;
            let mut backoff_ms = 1000u64;
            loop {
                let resp_result = self
                    .client
                    .post("https://api.openfigi.com/v3/mapping")
                    .header("Content-Type", "application/json")
                    .json(&jobs)
                    .send()
                    .await;
                let resp = match resp_result {
                    Ok(r) => r,
                    Err(e) => {
                        retry_count += 1;
                        if retry_count >= max_retries {
                            let err_msg = format!(
                                "Failed to send mapping request after {} retries: {}",
                                max_retries, e
                            );
                            logger::log_error(&err_msg).await;
                            return Err(anyhow!(err_msg));
                        }
                        logger::log_warn(&format!(
                            "Transient error sending mapping request (attempt {}/{}): {}",
                            retry_count, max_retries, e
                        ))
                        .await;
                        logger::log_info(&format!(" Retrying in {}ms...", backoff_ms)).await;
                        sleep(Duration::from_millis(backoff_ms)).await;
                        backoff_ms = (backoff_ms * 2).min(60000); // Cap at 60s
                        continue;
                    }
                };
                let status = resp.status();
                let headers = resp.headers().clone();
                let body = resp.text().await?;
                if status == 429 {
                    // Honor the server's reset hint, waiting at least 10 seconds.
                    let reset_sec = headers
                        .get("ratelimit-reset")
                        .and_then(|v| v.to_str().ok())
                        .and_then(|s| s.parse::<u64>().ok())
                        .unwrap_or(10);
                    sleep(Duration::from_secs(reset_sec.max(10))).await;
                    continue;
                } else if !status.is_success() {
                    if status.is_server_error() && retry_count < max_retries {
                        retry_count += 1;
                        sleep(Duration::from_millis(backoff_ms)).await;
                        backoff_ms = (backoff_ms * 2).min(60000);
                        continue;
                    }
                    return Err(anyhow!("OpenFIGI error {}: {}", status, body));
                }
                // The API returns one result object per job, in request order.
                let results: Vec<Value> = serde_json::from_str(&body)?;
                for (isin, result) in chunk.iter().zip(results) {
                    if let Some(data) = result["data"].as_array() {
                        for item in data {
                            if let Some(figi) = item["figi"].as_str() {
                                all_figi_infos.push(FigiInfo {
                                    isin: isin.clone(),
                                    figi: figi.to_string(),
                                    name: item["name"].as_str().unwrap_or("").to_string(),
                                    ticker: item["ticker"].as_str().unwrap_or("").to_string(),
                                    exch_code: item["exchCode"].as_str().unwrap_or("").to_string(),
                                    composite_figi: item["compositeFIGI"].as_str().unwrap_or("").to_string(),
                                    security_type: item["securityType"].as_str().unwrap_or("").to_string(),
                                    market_sector: item["marketSector"].as_str().unwrap_or("").to_string(),
                                    share_class_figi: item["shareClassFIGI"].as_str().unwrap_or("").to_string(),
                                    security_type2: item["securityType2"].as_str().unwrap_or("").to_string(),
                                    security_description: item["securityDescription"].as_str().unwrap_or("").to_string(),
                                });
                            }
                        }
                    }
                }
                break;
            }
            sleep(inter_sleep).await;
        }
        Ok(all_figi_infos)
    }
}
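
// Illustrative usage sketch (hypothetical function name, not called anywhere):
// maps two well-known ISINs and prints the resulting FIGIs. Assumes a Tokio
// runtime and that OPENFIGI_API_KEY may be unset, in which case the client
// falls back to the keyless limits (5 jobs per request, longer pauses).
#[allow(dead_code)]
async fn _example_map_isins() -> anyhow::Result<()> {
    let client = OpenFigiClient::new().await?;
    let isins = vec!["US0378331005".to_string(), "US5949181045".to_string()];
    for info in client.map_isins_to_figi_infos(&isins).await? {
        println!("{} -> {} ({})", info.isin, info.figi, info.ticker);
    }
    Ok(())
}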
async fn process_and_save_figi_batch(
    client: &OpenFigiClient,
    lei_batch: &HashMap<String, Vec<String>>,
    date_dir: &Path,
) -> anyhow::Result<()> {
    for (lei, isins) in lei_batch {
        let unique_isins: Vec<_> = isins
            .iter()
            .cloned()
            .collect::<HashSet<_>>()
            .into_iter()
            .collect();
        let figi_infos = client.map_isins_to_figi_infos(&unique_isins).await?;
        if figi_infos.is_empty() {
            // No FIGIs found - save to no_results.jsonl to avoid re-querying
            append_no_result_lei(date_dir, lei, &unique_isins).await?;
            continue;
        }
        // Save FIGIs grouped by market sector
        save_figi_infos_by_sector(lei, &figi_infos, date_dir).await?;
    }
    Ok(())
}
async fn save_figi_infos_by_sector(
    lei: &str,
    figi_infos: &[FigiInfo],
    date_dir: &Path,
) -> anyhow::Result<()> {
    let mut by_sector: HashMap<String, Vec<FigiInfo>> = HashMap::new();
    for figi_info in figi_infos {
        let sector = if figi_info.market_sector.is_empty() {
            "uncategorized".to_string()
        } else {
            figi_info.market_sector.clone()
        };
        by_sector.entry(sector).or_default().push(figi_info.clone());
    }
    for (sector, figis) in by_sector {
        let sector_dir = date_dir.join(&sector);
        let path = sector_dir.join("lei_to_figi.jsonl");
        append_lei_to_figi_jsonl(&path, lei, &figis).await?;
    }
    Ok(())
}
async fn append_lei_to_figi_jsonl(path: &Path, lei: &str, figis: &[FigiInfo]) -> anyhow::Result<()> {
    let entry = json!({
        "lei": lei,
        "figis": figis,
    });
    let line = serde_json::to_string(&entry)? + "\n";
    let mut file = tokio_fs::OpenOptions::new()
        .create(true)
        .append(true)
        .open(path)
        .await?;
    file.write_all(line.as_bytes()).await?;
    Ok(())
}
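
// Shape of each appended record (illustrative values; the `figis` array holds
// serialized FigiInfo objects exactly as defined in super::types):
//   {"lei":"<20-char LEI>","figis":[{"isin":"US0378331005","figi":"BBG000B9XRY4",...}]}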
/// STREAMING: Build securities without loading everything into memory
pub async fn build_securities_from_figi_streaming(
    date_dir: &Path,
) -> anyhow::Result<()> {
    logger::log_info("Building securities (streaming mode)...").await;
    // Load the existing cache incrementally so reruns merge rather than overwrite
    let mut commons = load_from_cache_if_exists::<HashMap<String, CompanyInfo>>(
        "data/corporate/by_name/common_stocks.json",
    )
    .await?;
    let equity_file = date_dir.join("Equity").join("lei_to_figi.jsonl");
    if !equity_file.exists() {
        logger::log_warn("No Equity FIGI file found").await;
        return Ok(());
    }
    // Stream the JSONL file line by line instead of reading it all into memory
    let file = tokio_fs::File::open(&equity_file).await?;
    let mut lines = tokio::io::BufReader::new(file).lines();
    let mut processed = 0;
    let mut stats = ProcessingStats::new(commons.len(), 0, 0);
    while let Some(line) = lines.next_line().await? {
        if line.trim().is_empty() {
            continue;
        }
        let entry: Value = serde_json::from_str(&line)?;
        let figi_infos: Vec<FigiInfo> = serde_json::from_value(entry["figis"].clone())?;
        // Process only common stocks
        let common_stocks: Vec<_> = figi_infos
            .iter()
            .filter(|f| f.security_type == "Common Stock")
            .cloned()
            .collect();
        if !common_stocks.is_empty() {
            process_common_stocks(&mut commons, &common_stocks, &mut stats);
        }
        processed += 1;
        if processed % 100 == 0 {
            tokio::task::yield_now().await;
        }
    }
    logger::log_info(&format!(
        "Processed {} FIGI entries (started with {} companies: {} added, {} updated)",
        processed, stats.initial_companies, stats.companies_added, stats.companies_updated
    ))
    .await;
    save_to_cache("data/corporate/by_name/common_stocks.json", &commons).await?;
    Ok(())
}
/// Handles rate limit responses from the OpenFIGI API.
///
/// If a 429 status is received, this function sleeps for the duration specified
/// in the `ratelimit-reset` header (at least 10 seconds) and then returns an
/// error so the caller knows to retry the request.
///
/// # Arguments
/// * `resp` - The HTTP response to check.
///
/// # Returns
/// Ok(()) if the response is not an error status.
///
/// # Errors
/// Returns an error after waiting out a 429, or immediately for any other
/// client or server error status.
async fn handle_rate_limit(resp: &reqwest::Response) -> anyhow::Result<()> {
    let status = resp.status();
    if status == 429 {
        let reset_sec = resp
            .headers()
            .get("ratelimit-reset")
            .and_then(|v| v.to_str().ok())
            .and_then(|s| s.parse::<u64>().ok())
            .unwrap_or(10);
        logger::log_info(&format!(" Rate limited - waiting {}s", reset_sec)).await;
        sleep(std::time::Duration::from_secs(reset_sec.max(10))).await;
        return Err(anyhow!("Rate limited, please retry"));
    } else if status.is_client_error() || status.is_server_error() {
        return Err(anyhow!("OpenFIGI API error: {}", status));
    }
    Ok(())
}
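
// Intended caller pattern (a sketch only; the fetchers below currently just
// propagate the Err with `?`): because handle_rate_limit has already slept out
// the reset window before returning Err on a 429, a caller that wants
// automatic retry could loop and resend, e.g.
//
//     let values = loop {
//         let resp = client.client.get(url).send().await?;
//         match handle_rate_limit(&resp).await {
//             Ok(()) => break resp.json::<Value>().await?,
//             Err(_) => continue, // simplified: also retries non-429 errors
//         }
//     };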
fn process_common_stocks(
    companies: &mut HashMap<String, CompanyInfo>,
    figi_infos: &[FigiInfo],
    stats: &mut ProcessingStats,
) {
    // Callers ensure `figi_infos` is non-empty; the first entry's name is used
    // as the company key.
    let name = figi_infos[0].name.clone();
    if name.is_empty() {
        return;
    }
    let grouped_by_isin = group_by_isin(figi_infos);
    if let Some(existing) = companies.get_mut(&name) {
        let mut updated = false;
        for (isin, new_figis) in grouped_by_isin {
            if let Some(existing_figis) = existing.securities.get_mut(&isin) {
                let merged = merge_figi_list(existing_figis, &new_figis);
                if merged.len() > existing_figis.len() {
                    *existing_figis = merged;
                    updated = true;
                }
            } else {
                existing.securities.insert(isin.clone(), new_figis);
                updated = true;
            }
        }
        if existing.primary_isin.is_empty() {
            if let Some(first_isin) = existing.securities.keys().next() {
                existing.primary_isin = first_isin.clone();
            }
        }
        if updated {
            stats.companies_updated += 1;
        }
    } else {
        let primary_isin = grouped_by_isin.keys().next().cloned().unwrap_or_default();
        companies.insert(
            name.clone(),
            CompanyInfo {
                name,
                primary_isin,
                securities: grouped_by_isin,
            },
        );
        stats.companies_added += 1;
    }
}
fn group_by_isin(figi_infos: &[FigiInfo]) -> HashMap<String, Vec<FigiInfo>> {
    let mut grouped: HashMap<String, Vec<FigiInfo>> = HashMap::new();
    for figi_info in figi_infos {
        grouped
            .entry(figi_info.isin.clone())
            .or_insert_with(Vec::new)
            .push(figi_info.clone());
    }
    for figis in grouped.values_mut() {
        figis.sort_by(|a, b| a.figi.cmp(&b.figi));
    }
    grouped
}
fn merge_figi_list(existing: &[FigiInfo], new_figis: &[FigiInfo]) -> Vec<FigiInfo> {
    let mut merged = existing.to_vec();
    let existing_figis: HashSet<String> = existing.iter().map(|f| f.figi.clone()).collect();
    for new_figi in new_figis {
        if !existing_figis.contains(&new_figi.figi) {
            merged.push(new_figi.clone());
        }
    }
    merged.sort_by(|a, b| a.figi.cmp(&b.figi));
    merged
}
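
// A minimal unit-test sketch for the merge semantics above. It assumes
// FigiInfo has exactly the String fields populated in map_isins_to_figi_infos;
// the `stub` helper is hypothetical and exists only for this test.
#[cfg(test)]
mod merge_figi_tests {
    use super::*;

    fn stub(figi: &str) -> FigiInfo {
        FigiInfo {
            isin: String::new(),
            figi: figi.to_string(),
            name: String::new(),
            ticker: String::new(),
            exch_code: String::new(),
            composite_figi: String::new(),
            security_type: String::new(),
            market_sector: String::new(),
            share_class_figi: String::new(),
            security_type2: String::new(),
            security_description: String::new(),
        }
    }

    #[test]
    fn merge_dedups_by_figi_and_sorts() {
        let existing = vec![stub("BBG000B9XRY4")];
        let incoming = vec![stub("BBG000B9XRY4"), stub("BBG000B9Y5X2")];
        let merged = merge_figi_list(&existing, &incoming);
        // The duplicate FIGI is dropped and the result stays sorted by FIGI.
        assert_eq!(merged.len(), 2);
        assert!(merged.windows(2).all(|w| w[0].figi <= w[1].figi));
    }
}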
#[derive(Debug)]
struct ProcessingStats {
    initial_companies: usize,
    companies_added: usize,
    companies_updated: usize,
}
impl ProcessingStats {
    fn new(companies: usize, _warrants: usize, _options: usize) -> Self {
        Self {
            initial_companies: companies,
            companies_added: 0,
            companies_updated: 0,
        }
    }
}
async fn load_from_cache_if_exists<T>(path: &str) -> anyhow::Result<T>
where
    T: serde::de::DeserializeOwned + Default,
{
    let cache_file = Path::new(path);
    if !cache_file.exists() {
        return Ok(T::default());
    }
    let content = tokio_fs::read_to_string(cache_file).await?;
    Ok(serde_json::from_str(&content)?)
}
async fn save_to_cache<T>(path: &str, data: &T) -> anyhow::Result<()>
where
    T: serde::Serialize,
{
    let cache_path = Path::new(path);
    let cache_dir = cache_path.parent().context("Invalid path")?;
    tokio_fs::create_dir_all(cache_dir).await?;
    let json_str = serde_json::to_string_pretty(data)?;
    tokio_fs::write(cache_path, json_str).await?;
    Ok(())
}
async fn load_market_sectors() -> anyhow::Result<Vec<String>> {
    let dir = DataPaths::new(".")?;
    let cache_file = dir.cache_openfigi_dir().join("marketSecDes.json");
    if !cache_file.exists() {
        // Fall back to a minimal default list when nothing has been cached yet
        return Ok(vec![
            "Comdty".to_string(),
            "Corp".to_string(),
            "Equity".to_string(),
            "Govt".to_string(),
        ]);
    }
    let content = tokio_fs::read_to_string(&cache_file).await?;
    let json: Value = serde_json::from_str(&content)?;
    let sectors: Vec<String> = json["values"]
        .as_array()
        .ok_or_else(|| anyhow!("No values"))?
        .iter()
        .filter_map(|v| v.as_str().map(|s| s.to_string()))
        .collect();
    Ok(sectors)
}
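
// Expected shape of the cached marketSecDes.json: the mapping-values response
// is cached verbatim by get_figi_market_sec_des below, i.e. a JSON object with
// a "values" array, for example:
//   {"values":["Comdty","Corp","Equity","Govt", ...]}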
async fn determine_gleif_date(
    gleif_date: Option<&str>,
    paths: &DataPaths,
) -> anyhow::Result<String> {
    if let Some(d) = gleif_date {
        return Ok(d.to_string());
    }
    // Otherwise pick the newest YYYYMMDD directory under the GLEIF cache;
    // lexicographic order equals chronological order for this format.
    let gleif_dir = paths.cache_gleif_dir();
    let mut entries = tokio_fs::read_dir(gleif_dir).await?;
    let mut dates = Vec::new();
    while let Some(entry) = entries.next_entry().await? {
        let path = entry.path();
        if path.is_dir() {
            if let Some(name) = path.file_name().and_then(|n| n.to_str()) {
                if name.len() == 8 && name.chars().all(|c| c.is_ascii_digit()) {
                    dates.push(name.to_string());
                }
            }
        }
    }
    dates.sort();
    dates.last().cloned().ok_or_else(|| anyhow!("No GLEIF date found"))
}
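
// Assumed on-disk layout (illustrative): one directory per GLEIF snapshot,
// named by date, e.g.
//   <cache_gleif_dir>/20240131/
//   <cache_gleif_dir>/20240229/   <- determine_gleif_date picks this one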
async fn setup_sector_directories(
    date_dir: &Path,
    sector_dirs: &[String],
) -> anyhow::Result<()> {
    let uncategorized_dir = date_dir.join("uncategorized");
    tokio_fs::create_dir_all(&uncategorized_dir).await?;
    for sector in sector_dirs {
        let sector_dir = date_dir.join(sector);
        tokio_fs::create_dir_all(&sector_dir).await?;
    }
    Ok(())
}
/// Loads all OpenFIGI mapping value lists (marketSecDes, micCode, securityType).
///
/// This function fetches the available values for each mapping parameter from the OpenFIGI API
/// and caches them as JSON files in `data/openfigi/`. If the files already exist and are recent
/// (less than 30 days old), they are reused instead of re-fetching.
///
/// # Returns
/// Ok(()) on success.
///
/// # Errors
/// Returns an error if API requests fail, JSON parsing fails, or file I/O fails.
pub async fn load_figi_type_lists() -> anyhow::Result<()> {
logger::log_info("Loading OpenFIGI mapping value lists...").await;
let client = OpenFigiClient::new().await?;
// Create cache directory
let dir = DataPaths::new(".")?;
let cache_dir = dir.cache_openfigi_dir();
tokio_fs::create_dir_all(cache_dir).await
.context("Failed to create data/openfigi directory")?;
// Fetch each type list
get_figi_market_sec_des(&client, cache_dir).await?;
get_figi_mic_code(&client, cache_dir).await?;
get_figi_security_type(&client, cache_dir).await?;
logger::log_info("OpenFIGI mapping value lists loaded successfully").await;
Ok(())
}
/// Fetches and caches the list of valid marketSecDes values.
///
/// # Arguments
/// * `client` - The OpenFIGI client instance.
/// * `cache_dir` - Directory to save the cached JSON file.
///
/// # Returns
/// Ok(()) on success.
///
/// # Errors
/// Returns an error if the API request fails or file I/O fails.
async fn get_figi_market_sec_des(client: &OpenFigiClient, cache_dir: &Path) -> anyhow::Result<()> {
    let cache_file = cache_dir.join("marketSecDes.json");
    // Check if cache exists and is recent (< 30 days old)
    if should_use_cache(&cache_file).await? {
        logger::log_info(" Using cached marketSecDes values").await;
        return Ok(());
    }
    logger::log_info(" Fetching marketSecDes values from OpenFIGI API...").await;
    let resp = client
        .client
        .get("https://api.openfigi.com/v3/mapping/values/marketSecDes")
        .send()
        .await
        .context("Failed to fetch marketSecDes values")?;
    handle_rate_limit(&resp).await?;
    let values: Value = resp
        .json()
        .await
        .context("Failed to parse marketSecDes response")?;
    // Save to cache
    let json_str = serde_json::to_string_pretty(&values)?;
    tokio_fs::write(&cache_file, json_str)
        .await
        .context("Failed to write marketSecDes cache")?;
    logger::log_info(" ✓ Cached marketSecDes values").await;
    // Respect rate limits between consecutive value-list requests
    sleep(Duration::from_millis(if client.has_key { 240 } else { 2400 })).await;
    Ok(())
}
/// Fetches and caches the list of valid micCode values.
///
/// # Arguments
/// * `client` - The OpenFIGI client instance.
/// * `cache_dir` - Directory to save the cached JSON file.
///
/// # Returns
/// Ok(()) on success.
///
/// # Errors
/// Returns an error if the API request fails or file I/O fails.
async fn get_figi_mic_code(client: &OpenFigiClient, cache_dir: &Path) -> anyhow::Result<()> {
    let cache_file = cache_dir.join("micCode.json");
    if should_use_cache(&cache_file).await? {
        logger::log_info(" Using cached micCode values").await;
        return Ok(());
    }
    logger::log_info(" Fetching micCode values from OpenFIGI API...").await;
    let resp = client
        .client
        .get("https://api.openfigi.com/v3/mapping/values/micCode")
        .send()
        .await
        .context("Failed to fetch micCode values")?;
    handle_rate_limit(&resp).await?;
    let values: Value = resp
        .json()
        .await
        .context("Failed to parse micCode response")?;
    let json_str = serde_json::to_string_pretty(&values)?;
    tokio_fs::write(&cache_file, json_str)
        .await
        .context("Failed to write micCode cache")?;
    logger::log_info(" ✓ Cached micCode values").await;
    sleep(Duration::from_millis(if client.has_key { 240 } else { 2400 })).await;
    Ok(())
}
/// Checks if a cache file exists and is less than 30 days old.
///
/// # Arguments
/// * `path` - Path to the cache file.
///
/// # Returns
/// True if the cache should be used, false if it needs refreshing.
async fn should_use_cache(path: &Path) -> anyhow::Result<bool> {
    if !path.exists() {
        return Ok(false);
    }
    let metadata = tokio_fs::metadata(path).await?;
    let modified = metadata.modified()?;
    // Treat clock errors as "infinitely old" so the cache gets refreshed
    let age = modified
        .elapsed()
        .unwrap_or(std::time::Duration::from_secs(u64::MAX));
    // Cache is valid for 30 days
    Ok(age < std::time::Duration::from_secs(30 * 24 * 60 * 60))
}
/// Fetches and caches the list of valid securityType values.
///
/// # Arguments
/// * `client` - The OpenFIGI client instance.
/// * `cache_dir` - Directory to save the cached JSON file.
///
/// # Returns
/// Ok(()) on success.
///
/// # Errors
/// Returns an error if the API request fails or file I/O fails.
async fn get_figi_security_type(client: &OpenFigiClient, cache_dir: &Path) -> anyhow::Result<()> {
    let cache_file = cache_dir.join("securityType.json");
    if should_use_cache(&cache_file).await? {
        logger::log_info(" Using cached securityType values").await;
        return Ok(());
    }
    logger::log_info(" Fetching securityType values from OpenFIGI API...").await;
    let resp = client
        .client
        .get("https://api.openfigi.com/v3/mapping/values/securityType")
        .send()
        .await
        .context("Failed to fetch securityType values")?;
    handle_rate_limit(&resp).await?;
    let values: Value = resp
        .json()
        .await
        .context("Failed to parse securityType response")?;
    let json_str = serde_json::to_string_pretty(&values)?;
    tokio_fs::write(&cache_file, json_str)
        .await
        .context("Failed to write securityType cache")?;
    logger::log_info(" ✓ Cached securityType values").await;
    sleep(Duration::from_millis(if client.has_key { 240 } else { 2400 })).await;
    Ok(())
}
#[derive(Debug)]
pub struct MappingStats {
    pub total_leis: usize,
    pub mapped_leis: usize,
    pub no_result_leis: usize,
    pub unqueried_leis: usize,
    pub mapping_percentage: f64,
    pub queried_percentage: f64,
    pub by_sector: HashMap<String, usize>,
}
/// Get detailed statistics about LEI-FIGI mapping status
pub async fn get_mapping_stats(
    csv_path: &str,
    gleif_date: Option<&str>,
) -> anyhow::Result<MappingStats> {
    let dir = DataPaths::new(".")?;
    let map_cache_dir = dir.cache_gleif_openfigi_map_dir();
    let date = determine_gleif_date(gleif_date, &dir).await?;
    let date_dir = map_cache_dir.join(&date);
    let all_leis = get_all_leis_from_gleif(csv_path).await?;
    let mapped_leis = load_existing_mapped_leis(&date_dir).await?;
    let no_result_leis = load_no_result_leis(&date_dir).await?;
    let total = all_leis.len();
    let mapped = mapped_leis.len();
    let no_results = no_result_leis.len();
    let queried = mapped + no_results;
    let unqueried = total.saturating_sub(queried);
    let mapping_percentage = if total > 0 {
        (mapped as f64 / total as f64) * 100.0
    } else {
        0.0
    };
    let queried_percentage = if total > 0 {
        (queried as f64 / total as f64) * 100.0
    } else {
        0.0
    };
    // Count mapped LEIs per sector directory
    let mut by_sector = HashMap::new();
    if date_dir.exists() {
        let mut entries = tokio_fs::read_dir(&date_dir).await?;
        while let Some(entry) = entries.next_entry().await? {
            let sector_path = entry.path();
            if !sector_path.is_dir() {
                continue;
            }
            let sector_name = sector_path
                .file_name()
                .and_then(|n| n.to_str())
                .unwrap_or("unknown")
                .to_string();
            let jsonl_path = sector_path.join("lei_to_figi.jsonl");
            if !jsonl_path.exists() {
                continue;
            }
            let content = tokio_fs::read_to_string(&jsonl_path).await?;
            let count = content.lines().filter(|l| !l.trim().is_empty()).count();
            by_sector.insert(sector_name, count);
        }
    }
    Ok(MappingStats {
        total_leis: total,
        mapped_leis: mapped,
        no_result_leis: no_results,
        unqueried_leis: unqueried,
        mapping_percentage,
        queried_percentage,
        by_sector,
    })
}
/// Print mapping statistics to console and logs
pub async fn print_mapping_stats(csv_path: &str) -> anyhow::Result<()> {
logger::log_info("=== LEI-FIGI Mapping Status ===").await;
let stats = get_mapping_stats(csv_path, None).await?;
logger::log_info(&format!(
"Total LEIs: {}",
stats.total_leis
)).await;
logger::log_info(&format!(
"├─ Mapped (with FIGI): {} ({:.2}%)",
stats.mapped_leis,
stats.mapping_percentage
)).await;
logger::log_info(&format!(
"├─ No Results (queried, no FIGI): {} ({:.2}%)",
stats.no_result_leis,
(stats.no_result_leis as f64 / stats.total_leis as f64) * 100.0
)).await;
logger::log_info(&format!(
"└─ Not Queried Yet: {} ({:.2}%)",
stats.unqueried_leis,
(stats.unqueried_leis as f64 / stats.total_leis as f64) * 100.0
)).await;
logger::log_info(&format!(
"\nQuery Coverage: {:.2}% ({} / {})",
stats.queried_percentage,
stats.mapped_leis + stats.no_result_leis,
stats.total_leis
)).await;
if !stats.by_sector.is_empty() {
logger::log_info("\nMapped LEIs by sector:").await;
let mut sectors: Vec<_> = stats.by_sector.iter().collect();
sectors.sort_by(|a, b| b.1.cmp(a.1)); // Sort by count descending
for (sector, count) in sectors {
logger::log_info(&format!(" {}: {}", sector, count)).await;
}
}
logger::log_info("==============================").await;
Ok(())
}
/// Quick check if mapping is complete (returns true once every LEI has been
/// queried, either mapped or confirmed to have no results)
pub async fn is_mapping_complete(csv_path: &str) -> anyhow::Result<bool> {
    let dir = DataPaths::new(".")?;
    let map_cache_dir = dir.cache_gleif_openfigi_map_dir();
    let date = determine_gleif_date(None, &dir).await?;
    let date_dir = map_cache_dir.join(&date);
    let unmapped = get_unmapped_leis(csv_path, &date_dir).await?;
    Ok(unmapped.is_empty())
}
/// Load all LEIs that have already been mapped from existing JSONL files
async fn load_existing_mapped_leis(date_dir: &Path) -> anyhow::Result<HashSet<String>> {
    let mut mapped_leis = HashSet::new();
    if !date_dir.exists() {
        return Ok(mapped_leis);
    }
    // Read all sector directories
    let mut entries = tokio_fs::read_dir(date_dir).await?;
    while let Some(entry) = entries.next_entry().await? {
        let sector_path = entry.path();
        if !sector_path.is_dir() {
            continue;
        }
        let jsonl_path = sector_path.join("lei_to_figi.jsonl");
        if !jsonl_path.exists() {
            continue;
        }
        // Read the JSONL file line by line
        let content = tokio_fs::read_to_string(&jsonl_path).await?;
        for line in content.lines() {
            if line.trim().is_empty() {
                continue;
            }
            if let Ok(entry) = serde_json::from_str::<Value>(line) {
                if let Some(lei) = entry["lei"].as_str() {
                    mapped_leis.insert(lei.to_string());
                }
            }
        }
    }
    if !mapped_leis.is_empty() {
        logger::log_info(&format!("Found {} already mapped LEIs", mapped_leis.len())).await;
    }
    Ok(mapped_leis)
}
/// Read GLEIF CSV and return all LEIs (without loading entire file into memory)
async fn get_all_leis_from_gleif(csv_path: &str) -> anyhow::Result<HashSet<String>> {
    let file = std::fs::File::open(csv_path)?;
    let reader = BufReader::new(file);
    let mut all_leis = HashSet::new();
    for (idx, line) in reader.lines().enumerate() {
        if idx == 0 {
            continue; // Skip header
        }
        let line = line?;
        // Naive comma split: assumes LEI and ISIN fields never contain commas
        let parts: Vec<&str> = line.split(',').collect();
        if parts.len() < 2 {
            continue;
        }
        let lei = parts[0].trim().trim_matches('"').to_string();
        if !lei.is_empty() {
            all_leis.insert(lei);
        }
    }
    logger::log_info(&format!("Found {} total LEIs in GLEIF CSV", all_leis.len())).await;
    Ok(all_leis)
}
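
// Assumed CSV layout (illustrative): a header row followed by one LEI-ISIN
// relationship per line, matching the parsing above and in the streamer below:
//   LEI,ISIN
//   <20-char LEI>,<12-char ISIN>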
/// Get unmapped LEIs by comparing GLEIF CSV with existing mappings
async fn get_unmapped_leis(
    csv_path: &str,
    date_dir: &Path,
) -> anyhow::Result<HashSet<String>> {
    let all_leis = get_all_leis_from_gleif(csv_path).await?;
    let mapped_leis = load_existing_mapped_leis(date_dir).await?;
    let no_result_leis = load_no_result_leis(date_dir).await?;
    // Calculate truly unmapped: all - (mapped + no_results)
    let queried_leis: HashSet<String> = mapped_leis.union(&no_result_leis).cloned().collect();
    let unmapped: HashSet<String> = all_leis.difference(&queried_leis).cloned().collect();
    logger::log_info(&format!(
        "LEI Status: Total={}, Mapped={}, No Results={}, Unqueried={}",
        all_leis.len(),
        mapped_leis.len(),
        no_result_leis.len(),
        unmapped.len()
    ))
    .await;
    Ok(unmapped)
}
/// Streams the GLEIF CSV and maps only the LEIs in `filter_leis` (all LEIs when `None`)
pub async fn stream_gleif_csv_and_build_figi_filtered(
    csv_path: &str,
    gleif_date: Option<&str>,
    filter_leis: Option<&HashSet<String>>,
) -> anyhow::Result<()> {
    logger::log_info(&format!("Streaming GLEIF CSV: {}", csv_path)).await;
    let file = std::fs::File::open(csv_path)?;
    let reader = BufReader::new(file);
    let client = OpenFigiClient::new().await?;
    if !client.has_key {
        logger::log_warn("No API key - skipping FIGI mapping").await;
        return Ok(());
    }
    let dir = DataPaths::new(".")?;
    let map_cache_dir = dir.cache_gleif_openfigi_map_dir();
    let date = determine_gleif_date(gleif_date, &dir).await?;
    let date_dir = map_cache_dir.join(&date);
    tokio_fs::create_dir_all(&date_dir).await?;
    let sector_dirs = load_market_sectors().await?;
    setup_sector_directories(&date_dir, &sector_dirs).await?;
    let mut lei_batch: HashMap<String, Vec<String>> = HashMap::new();
    let mut line_count = 0;
    let mut processed_leis = 0;
    let mut skipped_leis = 0;
    for (idx, line) in reader.lines().enumerate() {
        let line = line?;
        if idx == 0 {
            continue; // Skip header
        }
        let parts: Vec<&str> = line.split(',').collect();
        if parts.len() < 2 {
            continue;
        }
        let lei = parts[0].trim().trim_matches('"').to_string();
        let isin = parts[1].trim().trim_matches('"').to_string();
        if lei.is_empty() || isin.is_empty() {
            continue;
        }
        // Apply the LEI filter if provided
        if let Some(filter) = filter_leis {
            if !filter.contains(&lei) {
                skipped_leis += 1;
                continue;
            }
        }
        lei_batch.entry(lei).or_default().push(isin);
        line_count += 1;
        // Process the batch once it is full
        if lei_batch.len() >= LEI_BATCH_SIZE {
            process_and_save_figi_batch(&client, &lei_batch, &date_dir).await?;
            processed_leis += lei_batch.len();
            if processed_leis % 1000 == 0 {
                logger::log_info(&format!("Queried {} LEIs...", processed_leis)).await;
            }
            lei_batch.clear();
            tokio::task::yield_now().await;
        }
    }
    // Process the remaining partial batch
    if !lei_batch.is_empty() {
        process_and_save_figi_batch(&client, &lei_batch, &date_dir).await?;
        processed_leis += lei_batch.len();
    }
    logger::log_info(&format!(
        "✓ Queried {} LEIs ({} LEI-ISIN rows); skipped {} rows outside the filter",
        processed_leis, line_count, skipped_leis
    ))
    .await;
    Ok(())
}
/// Check mapping completion and process only unmapped LEIs
pub async fn ensure_all_leis_mapped(
    csv_path: &str,
    gleif_date: Option<&str>,
) -> anyhow::Result<bool> {
    let dir = DataPaths::new(".")?;
    let map_cache_dir = dir.cache_gleif_openfigi_map_dir();
    let date = determine_gleif_date(gleif_date, &dir).await?;
    let date_dir = map_cache_dir.join(&date);
    // Get unmapped LEIs (excludes both mapped and no-result LEIs)
    let unmapped = get_unmapped_leis(csv_path, &date_dir).await?;
    if unmapped.is_empty() {
        logger::log_info("✓ All LEIs have been queried (mapped or confirmed no results)").await;
        return Ok(true);
    }
    logger::log_info(&format!(
        "Found {} LEIs that need querying - starting mapping...",
        unmapped.len()
    ))
    .await;
    // Process only the unmapped LEIs
    stream_gleif_csv_and_build_figi_filtered(csv_path, gleif_date, Some(&unmapped)).await?;
    // Verify completion
    let still_unmapped = get_unmapped_leis(csv_path, &date_dir).await?;
    if still_unmapped.is_empty() {
        logger::log_info("✓ All LEIs successfully queried").await;
        Ok(true)
    } else {
        logger::log_warn(&format!(
            "{} LEIs still unqueried (API errors or rate limits)",
            still_unmapped.len()
        ))
        .await;
        Ok(false)
    }
}
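
// Illustrative end-to-end driver (hypothetical function name and CSV path,
// shown only as a usage sketch): run the mapping until complete, then report.
#[allow(dead_code)]
async fn _example_mapping_pipeline() -> anyhow::Result<()> {
    let csv_path = "cache/gleif/lei_isin.csv"; // hypothetical location
    let complete = ensure_all_leis_mapped(csv_path, None).await?;
    if !complete {
        // Some LEIs remain unqueried (rate limits or transient errors); a rerun
        // resumes from the persisted JSONL state rather than starting over.
        logger::log_warn("Mapping incomplete; rerun to resume").await;
    }
    print_mapping_stats(csv_path).await?;
    Ok(())
}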
/// Load LEIs that were queried but returned no results
async fn load_no_result_leis(date_dir: &Path) -> anyhow::Result<HashSet<String>> {
    let mut no_result_leis = HashSet::new();
    let no_results_path = date_dir.join("no_results.jsonl");
    if !no_results_path.exists() {
        return Ok(no_result_leis);
    }
    let content = tokio_fs::read_to_string(&no_results_path).await?;
    for line in content.lines() {
        if line.trim().is_empty() {
            continue;
        }
        if let Ok(entry) = serde_json::from_str::<Value>(line) {
            if let Some(lei) = entry["lei"].as_str() {
                no_result_leis.insert(lei.to_string());
            }
        }
    }
    if !no_result_leis.is_empty() {
        logger::log_info(&format!(
            "Found {} LEIs previously queried with no FIGI results",
            no_result_leis.len()
        ))
        .await;
    }
    Ok(no_result_leis)
}
/// Save LEI that was queried but returned no results
async fn append_no_result_lei(date_dir: &Path, lei: &str, isins: &[String]) -> anyhow::Result<()> {
    let no_results_path = date_dir.join("no_results.jsonl");
    let entry = json!({
        "lei": lei,
        "isins": isins,
        "queried_at": chrono::Local::now().format("%Y-%m-%d %H:%M:%S").to_string(),
    });
    let line = serde_json::to_string(&entry)? + "\n";
    let mut file = tokio_fs::OpenOptions::new()
        .create(true)
        .append(true)
        .open(&no_results_path)
        .await?;
    file.write_all(line.as_bytes()).await?;
    Ok(())
}
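
// Shape of each no_results.jsonl record (illustrative values):
//   {"lei":"<20-char LEI>","isins":["<isin>", ...],"queried_at":"2024-01-31 12:00:00"}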