// src/corporate/openfigi.rs - STREAMING VERSION
// Key changes: Never load entire GLEIF CSV or FIGI maps into memory

use crate::util::directories::DataPaths;
use crate::util::logger;
use super::types::*;
use reqwest::Client as HttpClient;
use reqwest::header::{HeaderMap, HeaderValue};
use serde_json::{json, Value};
use std::collections::{HashMap, HashSet};
use std::path::Path;
use std::io::{BufRead, BufReader};
use tokio::time::{sleep, Duration};
use tokio::fs as tokio_fs;
use tokio::io::AsyncWriteExt;
use anyhow::{Context, anyhow};

const LEI_BATCH_SIZE: usize = 100; // Process 100 LEIs at a time

#[derive(Clone)]
pub struct OpenFigiClient {
    client: HttpClient,
    has_key: bool,
}

impl OpenFigiClient {
    pub async fn new() -> anyhow::Result<Self> {
        let api_key = dotenvy::var("OPENFIGI_API_KEY").ok();
        let has_key = api_key.is_some();

        let mut builder = HttpClient::builder()
            .user_agent("Mozilla/5.0 (compatible; OpenFIGI-Rust/1.0)")
            .timeout(Duration::from_secs(30));

        if let Some(key) = &api_key {
            let mut headers = HeaderMap::new();
            headers.insert("X-OPENFIGI-APIKEY", HeaderValue::from_str(key)?);
            builder = builder.default_headers(headers);
        }

        let client = builder.build().context("Failed to build HTTP client")?;
        logger::log_info(&format!("OpenFIGI client: {}",
            if has_key { "with API key" } else { "no key" })).await;

        Ok(Self { client, has_key })
    }

    pub async fn map_isins_to_figi_infos(&self, isins: &[String]) -> anyhow::Result<Vec<FigiInfo>> {
        if isins.is_empty() {
            return Ok(vec![]);
        }

        let mut all_figi_infos = Vec::new();
        let chunk_size = if self.has_key { 100 } else { 5 };
        let inter_sleep = if self.has_key {
            Duration::from_millis(240)
        } else {
            Duration::from_millis(2400)
        };

        for chunk in isins.chunks(chunk_size) {
            let jobs: Vec<Value> = chunk.iter()
                .map(|isin| json!({
                    "idType": "ID_ISIN",
                    "idValue": isin,
                }))
                .collect();

            let mut retry_count = 0;
            let max_retries = 5;
            let mut backoff_ms = 1000u64;

            loop {
                let resp_result = self.client
                    .post("https://api.openfigi.com/v3/mapping")
                    .header("Content-Type", "application/json")
                    .json(&jobs)
                    .send()
                    .await;

                let resp = match resp_result {
                    Ok(r) => r,
                    Err(e) => {
                        retry_count += 1;
                        if retry_count >= max_retries {
                            let err_msg = format!("Failed to send mapping request after {} retries: {}", max_retries, e);
                            logger::log_error(&err_msg).await;
                            return Err(anyhow!(err_msg));
                        }
                        let warn_msg = format!("Transient error sending mapping request (attempt {}/{}): {}", retry_count, max_retries, e);
                        logger::log_warn(&warn_msg).await;
                        let retry_msg = format!(" Retrying in {}ms...", backoff_ms);
                        logger::log_info(&retry_msg).await;
                        sleep(Duration::from_millis(backoff_ms)).await;
                        backoff_ms = (backoff_ms * 2).min(60000); // Cap at 60s
                        continue;
                    }
                };

                let status = resp.status();
                let headers = resp.headers().clone();
                let body = resp.text().await?;

                if status == 429 {
                    let reset_sec = headers
                        .get("ratelimit-reset")
                        .and_then(|v| v.to_str().ok())
                        .and_then(|s| s.parse::<u64>().ok())
                        .unwrap_or(10);
                    sleep(Duration::from_secs(reset_sec.max(10))).await;
                    continue;
                } else if !status.is_success() {
                    if status.is_server_error() && retry_count < max_retries {
                        retry_count += 1;
                        sleep(Duration::from_millis(backoff_ms)).await;
                        backoff_ms = (backoff_ms * 2).min(60000);
                        continue;
                    }
                    return Err(anyhow!("OpenFIGI error {}: {}", status, body));
                }

                let results: Vec<Value> = serde_json::from_str(&body)?;

                for (isin, result) in chunk.iter().zip(results) {
                    if let Some(data) = result["data"].as_array() {
                        for item in data {
                            if let Some(figi) = item["figi"].as_str() {
                                all_figi_infos.push(FigiInfo {
                                    isin: isin.clone(),
                                    figi: figi.to_string(),
                                    name: item["name"].as_str().unwrap_or("").to_string(),
                                    ticker: item["ticker"].as_str().unwrap_or("").to_string(),
                                    exch_code: item["exchCode"].as_str().unwrap_or("").to_string(),
                                    composite_figi: item["compositeFIGI"].as_str().unwrap_or("").to_string(),
                                    security_type: item["securityType"].as_str().unwrap_or("").to_string(),
                                    market_sector: item["marketSector"].as_str().unwrap_or("").to_string(),
                                    share_class_figi: item["shareClassFIGI"].as_str().unwrap_or("").to_string(),
                                    security_type2: item["securityType2"].as_str().unwrap_or("").to_string(),
                                    security_description: item["securityDescription"].as_str().unwrap_or("").to_string(),
                                });
                            }
                        }
                    }
                }

                break;
            }

            sleep(inter_sleep).await;
        }

        Ok(all_figi_infos)
    }
}
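
// Usage sketch (illustrative only): map a handful of ISINs and inspect the
// returned FIGI records. The ISIN below is just an example value, not data
// this module depends on.
//
//     let client = OpenFigiClient::new().await?;
//     let isins = vec!["US0378331005".to_string()];
//     for info in client.map_isins_to_figi_infos(&isins).await? {
//         println!("{} -> {} ({})", info.isin, info.figi, info.exch_code);
//     }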

async fn process_and_save_figi_batch(
    client: &OpenFigiClient,
    lei_batch: &HashMap<String, Vec<String>>,
    date_dir: &Path,
) -> anyhow::Result<()> {
    for (lei, isins) in lei_batch {
        let unique_isins: Vec<_> = isins.iter()
            .cloned()
            .collect::<HashSet<_>>()
            .into_iter()
            .collect();

        let figi_infos = client.map_isins_to_figi_infos(&unique_isins).await?;

        if figi_infos.is_empty() {
            // No FIGIs found - save to no_results.jsonl to avoid re-querying
            append_no_result_lei(date_dir, lei, &unique_isins).await?;
            continue;
        }

        // Save FIGIs by sector as before
        save_figi_infos_by_sector(lei, &figi_infos, date_dir).await?;
    }

    Ok(())
}

async fn save_figi_infos_by_sector(
    lei: &str,
    figi_infos: &[FigiInfo],
    date_dir: &Path,
) -> anyhow::Result<()> {
    let mut by_sector: HashMap<String, Vec<FigiInfo>> = HashMap::new();

    for figi_info in figi_infos {
        let sector = if figi_info.market_sector.is_empty() {
            "uncategorized".to_string()
        } else {
            figi_info.market_sector.clone()
        };
        by_sector.entry(sector).or_default().push(figi_info.clone());
    }

    for (sector, figis) in by_sector {
        let sector_dir = date_dir.join(&sector);
        let path = sector_dir.join("lei_to_figi.jsonl");
        append_lei_to_figi_jsonl(&path, lei, &figis).await?;
    }

    Ok(())
}

async fn append_lei_to_figi_jsonl(path: &Path, lei: &str, figis: &[FigiInfo]) -> anyhow::Result<()> {
    let entry = json!({
        "lei": lei,
        "figis": figis,
    });

    let line = serde_json::to_string(&entry)? + "\n";

    let mut file = tokio_fs::OpenOptions::new()
        .create(true)
        .append(true)
        .open(path)
        .await?;

    file.write_all(line.as_bytes()).await?;
    Ok(())
}
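
// Each appended line is one self-contained JSON object, e.g. (illustrative
// values, abbreviated field list):
//
//     {"lei":"...","figis":[{"isin":"...","figi":"...", ...}]}
//
// Appending one object per line is what lets the readers below recover the
// set of already-mapped LEIs without parsing the file as a single JSON
// document.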

/// STREAMING: build securities without loading every FIGI map into memory;
/// only the Equity sector file and the common-stock map are held at once.
pub async fn build_securities_from_figi_streaming(
    date_dir: &Path,
) -> anyhow::Result<()> {
    logger::log_info("Building securities (streaming mode)...").await;

    // Load existing results, if any, so reruns are incremental
    let mut commons = load_from_cache_if_exists::<HashMap<String, CompanyInfo>>(
        "data/corporate/by_name/common_stocks.json"
    ).await?;

    let equity_file = date_dir.join("Equity").join("lei_to_figi.jsonl");

    if !equity_file.exists() {
        logger::log_warn("No Equity FIGI file found").await;
        return Ok(());
    }

    let content = tokio_fs::read_to_string(&equity_file).await?;
    let mut processed = 0;
    let mut stats = ProcessingStats::new(commons.len(), 0, 0);

    for line in content.lines() {
        if line.trim().is_empty() {
            continue;
        }

        let entry: Value = serde_json::from_str(line)?;
        let figi_infos: Vec<FigiInfo> = serde_json::from_value(entry["figis"].clone())?;

        // Process only common stocks
        let common_stocks: Vec<_> = figi_infos.iter()
            .filter(|f| f.security_type == "Common Stock")
            .cloned()
            .collect();

        if !common_stocks.is_empty() {
            process_common_stocks(&mut commons, &common_stocks, &mut stats);
        }

        processed += 1;
        if processed % 100 == 0 {
            tokio::task::yield_now().await;
        }
    }

    logger::log_info(&format!("Processed {} FIGI entries", processed)).await;
    save_to_cache("data/corporate/by_name/common_stocks.json", &commons).await?;

    Ok(())
}
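
// Note: `read_to_string` above still buffers the whole Equity file. If that
// file grows large, a line-buffered variant via tokio's AsyncBufReadExt is a
// drop-in alternative (sketch, untested here):
//
//     use tokio::io::{AsyncBufReadExt, BufReader};
//
//     let file = tokio_fs::File::open(&equity_file).await?;
//     let mut lines = BufReader::new(file).lines();
//     while let Some(line) = lines.next_line().await? {
//         // ...same per-line processing as above...
//     }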

/// Handles rate-limit responses from the OpenFIGI API.
///
/// If a 429 status is received, this function sleeps for the duration specified
/// in the `ratelimit-reset` header (or 10 seconds by default) and then returns
/// an error so the caller knows to retry.
///
/// # Arguments
/// * `resp` - The HTTP response to check.
///
/// # Returns
/// Ok(()) if the response was not rate-limited or otherwise erroneous.
///
/// # Errors
/// Returns an error after waiting out a rate limit, or immediately for any
/// other client or server error status.
async fn handle_rate_limit(resp: &reqwest::Response) -> anyhow::Result<()> {
    let status = resp.status();

    if status == 429 {
        let headers = resp.headers();
        let reset_sec = headers
            .get("ratelimit-reset")
            .and_then(|v| v.to_str().ok())
            .and_then(|s| s.parse::<u64>().ok())
            .unwrap_or(10);

        logger::log_info(&format!(" Rate limited, waiting {}s", reset_sec)).await;
        sleep(std::time::Duration::from_secs(reset_sec.max(10))).await;

        return Err(anyhow!("Rate limited, please retry"));
    } else if status.is_client_error() || status.is_server_error() {
        return Err(anyhow!("OpenFIGI API error: {}", status));
    }

    Ok(())
}
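
// Caller pattern (sketch): because a 429 surfaces as an Err *after* the wait,
// a caller that wants automatic retry can loop on the request, e.g.:
//
//     let values: Value = loop {
//         let resp = client.client.get(url).send().await?;
//         match handle_rate_limit(&resp).await {
//             Ok(()) => break resp.json().await?,
//             Err(e) if e.to_string().contains("Rate limited") => continue,
//             Err(e) => return Err(e),
//         }
//     };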

fn process_common_stocks(
    companies: &mut HashMap<String, CompanyInfo>,
    figi_infos: &[FigiInfo],
    stats: &mut ProcessingStats,
) {
    let name = figi_infos[0].name.clone();
    if name.is_empty() {
        return;
    }

    let grouped_by_isin = group_by_isin(figi_infos);

    if let Some(existing) = companies.get_mut(&name) {
        let mut updated = false;
        for (isin, new_figis) in grouped_by_isin {
            if let Some(existing_figis) = existing.securities.get_mut(&isin) {
                let merged = merge_figi_list(existing_figis, &new_figis);
                if merged.len() > existing_figis.len() {
                    *existing_figis = merged;
                    updated = true;
                }
            } else {
                existing.securities.insert(isin.clone(), new_figis);
                updated = true;
            }
        }

        if existing.primary_isin.is_empty() {
            if let Some(first_isin) = existing.securities.keys().next() {
                existing.primary_isin = first_isin.clone();
            }
        }

        if updated {
            stats.companies_updated += 1;
        }
    } else {
        let primary_isin = grouped_by_isin.keys().next().cloned().unwrap_or_default();

        companies.insert(name.clone(), CompanyInfo {
            name,
            primary_isin,
            securities: grouped_by_isin,
        });

        stats.companies_added += 1;
    }
}

fn group_by_isin(figi_infos: &[FigiInfo]) -> HashMap<String, Vec<FigiInfo>> {
    let mut grouped: HashMap<String, Vec<FigiInfo>> = HashMap::new();

    for figi_info in figi_infos {
        grouped.entry(figi_info.isin.clone())
            .or_default()
            .push(figi_info.clone());
    }

    for figis in grouped.values_mut() {
        figis.sort_by(|a, b| a.figi.cmp(&b.figi));
    }

    grouped
}

fn merge_figi_list(existing: &[FigiInfo], new_figis: &[FigiInfo]) -> Vec<FigiInfo> {
    let mut merged = existing.to_vec();
    let existing_figis: HashSet<String> = existing.iter()
        .map(|f| f.figi.clone())
        .collect();

    for new_figi in new_figis {
        if !existing_figis.contains(&new_figi.figi) {
            merged.push(new_figi.clone());
        }
    }

    merged.sort_by(|a, b| a.figi.cmp(&b.figi));
    merged
}

#[derive(Debug)]
struct ProcessingStats {
    initial_companies: usize,
    companies_added: usize,
    companies_updated: usize,
}

impl ProcessingStats {
    fn new(companies: usize, _warrants: usize, _options: usize) -> Self {
        Self {
            initial_companies: companies,
            companies_added: 0,
            companies_updated: 0,
        }
    }
}

async fn load_from_cache_if_exists<T>(path: &str) -> anyhow::Result<T>
where
    T: serde::de::DeserializeOwned + Default,
{
    let cache_file = Path::new(path);

    if !cache_file.exists() {
        return Ok(T::default());
    }

    let content = tokio_fs::read_to_string(cache_file).await?;
    Ok(serde_json::from_str(&content)?)
}

async fn save_to_cache<T>(path: &str, data: &T) -> anyhow::Result<()>
where
    T: serde::Serialize,
{
    let cache_path = Path::new(path);
    let cache_dir = cache_path.parent().context("Invalid path")?;

    tokio_fs::create_dir_all(cache_dir).await?;
    let json_str = serde_json::to_string_pretty(data)?;
    tokio_fs::write(cache_path, json_str).await?;

    Ok(())
}
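
// Durability note (sketch, not wired in): `save_to_cache` truncates the target
// in place, so a crash mid-write can leave a corrupt cache. Writing to a
// temporary sibling and renaming over the original is the usual fix; the
// `.tmp` suffix here is just an illustrative choice:
//
//     let tmp = cache_path.with_extension("json.tmp");
//     tokio_fs::write(&tmp, &json_str).await?;
//     tokio_fs::rename(&tmp, cache_path).await?;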

async fn load_market_sectors() -> anyhow::Result<Vec<String>> {
    let dir = DataPaths::new(".")?;
    let cache_file = dir.cache_openfigi_dir().join("marketSecDes.json");

    if !cache_file.exists() {
        return Ok(vec![
            "Comdty".to_string(),
            "Corp".to_string(),
            "Equity".to_string(),
            "Govt".to_string(),
        ]);
    }

    let content = tokio_fs::read_to_string(&cache_file).await?;
    let json: Value = serde_json::from_str(&content)?;

    let sectors: Vec<String> = json["values"]
        .as_array()
        .ok_or_else(|| anyhow!("No values"))?
        .iter()
        .filter_map(|v| v.as_str().map(|s| s.to_string()))
        .collect();

    Ok(sectors)
}

async fn determine_gleif_date(
    gleif_date: Option<&str>,
    paths: &DataPaths,
) -> anyhow::Result<String> {
    if let Some(d) = gleif_date {
        return Ok(d.to_string());
    }

    let gleif_dir = paths.cache_gleif_dir();
    let mut entries = tokio_fs::read_dir(gleif_dir).await?;
    let mut dates = Vec::new();

    while let Some(entry) = entries.next_entry().await? {
        let path = entry.path();
        if path.is_dir() {
            if let Some(name) = path.file_name().and_then(|n| n.to_str()) {
                if name.len() == 8 && name.chars().all(|c| c.is_numeric()) {
                    dates.push(name.to_string());
                }
            }
        }
    }

    dates.sort();
    dates.last().cloned().ok_or_else(|| anyhow!("No GLEIF date found"))
}

async fn setup_sector_directories(
    date_dir: &Path,
    sector_dirs: &[String],
) -> anyhow::Result<()> {
    let uncategorized_dir = date_dir.join("uncategorized");
    tokio_fs::create_dir_all(&uncategorized_dir).await?;

    for sector in sector_dirs {
        let sector_dir = date_dir.join(sector);
        tokio_fs::create_dir_all(&sector_dir).await?;
    }

    Ok(())
}

/// Loads all OpenFIGI mapping value lists (marketSecDes, micCode, securityType).
///
/// This function fetches the available values for each mapping parameter from the OpenFIGI API
/// and caches them as JSON files in `data/openfigi/`. If the files already exist and are recent
/// (less than 30 days old), they are reused instead of re-fetched.
///
/// # Returns
/// Ok(()) on success.
///
/// # Errors
/// Returns an error if API requests fail, JSON parsing fails, or file I/O fails.
pub async fn load_figi_type_lists() -> anyhow::Result<()> {
    logger::log_info("Loading OpenFIGI mapping value lists...").await;

    let client = OpenFigiClient::new().await?;

    // Create cache directory
    let dir = DataPaths::new(".")?;
    let cache_dir = dir.cache_openfigi_dir();
    tokio_fs::create_dir_all(cache_dir).await
        .context("Failed to create data/openfigi directory")?;

    // Fetch each type list
    get_figi_market_sec_des(&client, cache_dir).await?;
    get_figi_mic_code(&client, cache_dir).await?;
    get_figi_security_type(&client, cache_dir).await?;

    logger::log_info("OpenFIGI mapping value lists loaded successfully").await;
    Ok(())
}

/// Fetches and caches the list of valid marketSecDes values.
///
/// # Arguments
/// * `client` - The OpenFIGI client instance.
/// * `cache_dir` - Directory to save the cached JSON file.
///
/// # Returns
/// Ok(()) on success.
///
/// # Errors
/// Returns an error if the API request fails or file I/O fails.
async fn get_figi_market_sec_des(client: &OpenFigiClient, cache_dir: &Path) -> anyhow::Result<()> {
    let cache_file = cache_dir.join("marketSecDes.json");

    // Check if cache exists and is recent (< 30 days old)
    if should_use_cache(&cache_file).await? {
        logger::log_info(" Using cached marketSecDes values").await;
        return Ok(());
    }

    logger::log_info(" Fetching marketSecDes values from OpenFIGI API...").await;

    let resp = client.client
        .get("https://api.openfigi.com/v3/mapping/values/marketSecDes")
        .send()
        .await
        .context("Failed to fetch marketSecDes values")?;

    handle_rate_limit(&resp).await?;

    let values: Value = resp.json().await
        .context("Failed to parse marketSecDes response")?;

    // Save to cache
    let json_str = serde_json::to_string_pretty(&values)?;
    tokio_fs::write(&cache_file, json_str).await
        .context("Failed to write marketSecDes cache")?;

    logger::log_info(" ✓ Cached marketSecDes values").await;

    // Respect rate limits
    sleep(Duration::from_millis(if client.has_key { 240 } else { 2400 })).await;

    Ok(())
}

/// Fetches and caches the list of valid micCode values.
///
/// # Arguments
/// * `client` - The OpenFIGI client instance.
/// * `cache_dir` - Directory to save the cached JSON file.
///
/// # Returns
/// Ok(()) on success.
///
/// # Errors
/// Returns an error if the API request fails or file I/O fails.
async fn get_figi_mic_code(client: &OpenFigiClient, cache_dir: &Path) -> anyhow::Result<()> {
    let cache_file = cache_dir.join("micCode.json");

    if should_use_cache(&cache_file).await? {
        logger::log_info(" Using cached micCode values").await;
        return Ok(());
    }

    logger::log_info(" Fetching micCode values from OpenFIGI API...").await;

    let resp = client.client
        .get("https://api.openfigi.com/v3/mapping/values/micCode")
        .send()
        .await
        .context("Failed to fetch micCode values")?;

    handle_rate_limit(&resp).await?;

    let values: Value = resp.json().await
        .context("Failed to parse micCode response")?;

    let json_str = serde_json::to_string_pretty(&values)?;
    tokio_fs::write(&cache_file, json_str).await
        .context("Failed to write micCode cache")?;

    logger::log_info(" ✓ Cached micCode values").await;

    sleep(Duration::from_millis(if client.has_key { 240 } else { 2400 })).await;

    Ok(())
}

/// Checks if a cache file exists and is less than 30 days old.
///
/// # Arguments
/// * `path` - Path to the cache file.
///
/// # Returns
/// True if the cache should be used, false if it needs refreshing.
async fn should_use_cache(path: &Path) -> anyhow::Result<bool> {
    if !path.exists() {
        return Ok(false);
    }

    let metadata = tokio_fs::metadata(path).await?;
    let modified = metadata.modified()?;
    let age = modified.elapsed().unwrap_or(std::time::Duration::from_secs(u64::MAX));

    // Cache is valid for 30 days
    Ok(age < std::time::Duration::from_secs(30 * 24 * 60 * 60))
}

/// Fetches and caches the list of valid securityType values.
///
/// # Arguments
/// * `client` - The OpenFIGI client instance.
/// * `cache_dir` - Directory to save the cached JSON file.
///
/// # Returns
/// Ok(()) on success.
///
/// # Errors
/// Returns an error if the API request fails or file I/O fails.
async fn get_figi_security_type(client: &OpenFigiClient, cache_dir: &Path) -> anyhow::Result<()> {
    let cache_file = cache_dir.join("securityType.json");

    if should_use_cache(&cache_file).await? {
        logger::log_info(" Using cached securityType values").await;
        return Ok(());
    }

    logger::log_info(" Fetching securityType values from OpenFIGI API...").await;

    let resp = client.client
        .get("https://api.openfigi.com/v3/mapping/values/securityType")
        .send()
        .await
        .context("Failed to fetch securityType values")?;

    handle_rate_limit(&resp).await?;

    let values: Value = resp.json().await
        .context("Failed to parse securityType response")?;

    let json_str = serde_json::to_string_pretty(&values)?;
    tokio_fs::write(&cache_file, json_str).await
        .context("Failed to write securityType cache")?;

    logger::log_info(" ✓ Cached securityType values").await;

    sleep(Duration::from_millis(if client.has_key { 240 } else { 2400 })).await;

    Ok(())
}

#[derive(Debug)]
pub struct MappingStats {
    pub total_leis: usize,
    pub mapped_leis: usize,
    pub no_result_leis: usize,
    pub unqueried_leis: usize,
    pub mapping_percentage: f64,
    pub queried_percentage: f64,
    pub by_sector: HashMap<String, usize>,
}

/// Get detailed statistics about LEI-FIGI mapping status
pub async fn get_mapping_stats(
    csv_path: &str,
    gleif_date: Option<&str>,
) -> anyhow::Result<MappingStats> {
    let dir = DataPaths::new(".")?;
    let map_cache_dir = dir.cache_gleif_openfigi_map_dir();

    let date = determine_gleif_date(gleif_date, &dir).await?;
    let date_dir = map_cache_dir.join(&date);

    let all_leis = get_all_leis_from_gleif(csv_path).await?;
    let mapped_leis = load_existing_mapped_leis(&date_dir).await?;
    let no_result_leis = load_no_result_leis(&date_dir).await?;

    let total = all_leis.len();
    let mapped = mapped_leis.len();
    let no_results = no_result_leis.len();
    let queried = mapped + no_results;
    let unqueried = total.saturating_sub(queried);

    let mapping_percentage = if total > 0 {
        (mapped as f64 / total as f64) * 100.0
    } else {
        0.0
    };

    let queried_percentage = if total > 0 {
        (queried as f64 / total as f64) * 100.0
    } else {
        0.0
    };

    // Count by sector
    let mut by_sector = HashMap::new();

    if date_dir.exists() {
        let mut entries = tokio_fs::read_dir(&date_dir).await?;

        while let Some(entry) = entries.next_entry().await? {
            let sector_path = entry.path();
            if !sector_path.is_dir() {
                continue;
            }

            let sector_name = sector_path
                .file_name()
                .and_then(|n| n.to_str())
                .unwrap_or("unknown")
                .to_string();

            let jsonl_path = sector_path.join("lei_to_figi.jsonl");
            if !jsonl_path.exists() {
                continue;
            }

            let content = tokio_fs::read_to_string(&jsonl_path).await?;
            let count = content.lines().filter(|l| !l.trim().is_empty()).count();
            by_sector.insert(sector_name, count);
        }
    }

    Ok(MappingStats {
        total_leis: total,
        mapped_leis: mapped,
        no_result_leis: no_results,
        unqueried_leis: unqueried,
        mapping_percentage,
        queried_percentage,
        by_sector,
    })
}

/// Print mapping statistics to console and logs
pub async fn print_mapping_stats(csv_path: &str) -> anyhow::Result<()> {
    logger::log_info("=== LEI-FIGI Mapping Status ===").await;

    let stats = get_mapping_stats(csv_path, None).await?;

    logger::log_info(&format!(
        "Total LEIs: {}",
        stats.total_leis
    )).await;

    logger::log_info(&format!(
        "├─ Mapped (with FIGI): {} ({:.2}%)",
        stats.mapped_leis,
        stats.mapping_percentage
    )).await;

    logger::log_info(&format!(
        "├─ No Results (queried, no FIGI): {} ({:.2}%)",
        stats.no_result_leis,
        (stats.no_result_leis as f64 / stats.total_leis as f64) * 100.0
    )).await;

    logger::log_info(&format!(
        "└─ Not Queried Yet: {} ({:.2}%)",
        stats.unqueried_leis,
        (stats.unqueried_leis as f64 / stats.total_leis as f64) * 100.0
    )).await;

    logger::log_info(&format!(
        "\nQuery Coverage: {:.2}% ({} / {})",
        stats.queried_percentage,
        stats.mapped_leis + stats.no_result_leis,
        stats.total_leis
    )).await;

    if !stats.by_sector.is_empty() {
        logger::log_info("\nMapped LEIs by sector:").await;
        let mut sectors: Vec<_> = stats.by_sector.iter().collect();
        sectors.sort_by(|a, b| b.1.cmp(a.1)); // Sort by count descending

        for (sector, count) in sectors {
            logger::log_info(&format!(" {}: {}", sector, count)).await;
        }
    }

    logger::log_info("==============================").await;

    Ok(())
}

/// Quick check if mapping is complete (returns true if every LEI has been queried)
pub async fn is_mapping_complete(csv_path: &str) -> anyhow::Result<bool> {
    let dir = DataPaths::new(".")?;
    let map_cache_dir = dir.cache_gleif_openfigi_map_dir();

    let date = determine_gleif_date(None, &dir).await?;
    let date_dir = map_cache_dir.join(&date);

    let unmapped = get_unmapped_leis(csv_path, &date_dir).await?;
    Ok(unmapped.is_empty())
}

/// Load all LEIs that have already been mapped from existing JSONL files
async fn load_existing_mapped_leis(date_dir: &Path) -> anyhow::Result<HashSet<String>> {
    let mut mapped_leis = HashSet::new();

    if !date_dir.exists() {
        return Ok(mapped_leis);
    }

    // Read all sector directories
    let mut entries = tokio_fs::read_dir(date_dir).await?;

    while let Some(entry) = entries.next_entry().await? {
        let sector_path = entry.path();
        if !sector_path.is_dir() {
            continue;
        }

        let jsonl_path = sector_path.join("lei_to_figi.jsonl");
        if !jsonl_path.exists() {
            continue;
        }

        // Read JSONL file line by line
        let content = tokio_fs::read_to_string(&jsonl_path).await?;
        for line in content.lines() {
            if line.trim().is_empty() {
                continue;
            }

            if let Ok(entry) = serde_json::from_str::<Value>(line) {
                if let Some(lei) = entry["lei"].as_str() {
                    mapped_leis.insert(lei.to_string());
                }
            }
        }
    }

    if !mapped_leis.is_empty() {
        logger::log_info(&format!("Found {} already mapped LEIs", mapped_leis.len())).await;
    }

    Ok(mapped_leis)
}

/// Read the GLEIF CSV and return all LEIs (buffered line by line, so the
/// whole file is never held in memory at once)
async fn get_all_leis_from_gleif(csv_path: &str) -> anyhow::Result<HashSet<String>> {
    let file = std::fs::File::open(csv_path)?;
    let reader = BufReader::new(file);

    let mut all_leis = HashSet::new();

    for (idx, line) in reader.lines().enumerate() {
        if idx == 0 {
            continue; // Skip header
        }

        let line = line?;
        let parts: Vec<&str> = line.split(',').collect();

        if parts.len() < 2 {
            continue;
        }

        let lei = parts[0].trim().trim_matches('"').to_string();

        if !lei.is_empty() {
            all_leis.insert(lei);
        }
    }

    logger::log_info(&format!("Found {} total LEIs in GLEIF CSV", all_leis.len())).await;
    Ok(all_leis)
}
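
// Caveat: the `line.split(',')` parsing here (and in the streaming function
// below) assumes LEI and ISIN values never contain embedded commas, which
// holds for those identifiers but would break on quoted CSV fields that do.
// A sketch of a stricter variant using the `csv` crate (assumed as a
// dependency; it is not currently one of this module's imports):
//
//     let mut rdr = csv::ReaderBuilder::new()
//         .has_headers(true)
//         .from_path(csv_path)?;
//     for record in rdr.records() {
//         let record = record?;
//         let lei = record.get(0).unwrap_or("").trim();
//         // ...
//     }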

/// Get unmapped LEIs by comparing the GLEIF CSV with existing mappings
async fn get_unmapped_leis(
    csv_path: &str,
    date_dir: &Path,
) -> anyhow::Result<HashSet<String>> {
    let all_leis = get_all_leis_from_gleif(csv_path).await?;
    let mapped_leis = load_existing_mapped_leis(date_dir).await?;
    let no_result_leis = load_no_result_leis(date_dir).await?;

    // Calculate truly unmapped: all - (mapped + no_results)
    let queried_leis: HashSet<String> = mapped_leis
        .union(&no_result_leis)
        .cloned()
        .collect();

    let unmapped: HashSet<String> = all_leis
        .difference(&queried_leis)
        .cloned()
        .collect();

    let total = all_leis.len();
    let mapped = mapped_leis.len();
    let no_results = no_result_leis.len();
    let unqueried = unmapped.len();

    logger::log_info(&format!(
        "LEI Status: Total={}, Mapped={}, No Results={}, Unqueried={}",
        total, mapped, no_results, unqueried
    )).await;

    Ok(unmapped)
}

/// Modified version that only processes the specified LEIs
pub async fn stream_gleif_csv_and_build_figi_filtered(
    csv_path: &str,
    gleif_date: Option<&str>,
    filter_leis: Option<&HashSet<String>>,
) -> anyhow::Result<()> {
    logger::log_info(&format!("Streaming GLEIF CSV: {}", csv_path)).await;

    let file = std::fs::File::open(csv_path)?;
    let reader = BufReader::new(file);

    let client = OpenFigiClient::new().await?;
    if !client.has_key {
        logger::log_warn("No API key - skipping FIGI mapping").await;
        return Ok(());
    }

    let dir = DataPaths::new(".")?;
    let map_cache_dir = dir.cache_gleif_openfigi_map_dir();

    let date = determine_gleif_date(gleif_date, &dir).await?;
    let date_dir = map_cache_dir.join(&date);
    tokio_fs::create_dir_all(&date_dir).await?;

    let sector_dirs = load_market_sectors().await?;
    setup_sector_directories(&date_dir, &sector_dirs).await?;

    let mut lei_batch: HashMap<String, Vec<String>> = HashMap::new();
    let mut line_count = 0;
    let mut processed_leis = 0;
    let mut skipped_leis = 0;

    for (idx, line) in reader.lines().enumerate() {
        let line = line?;

        if idx == 0 { continue; }

        let parts: Vec<&str> = line.split(',').collect();
        if parts.len() < 2 { continue; }

        let lei = parts[0].trim().trim_matches('"').to_string();
        let isin = parts[1].trim().trim_matches('"').to_string();

        if lei.is_empty() || isin.is_empty() {
            continue;
        }

        // Apply filter if provided
        if let Some(filter) = filter_leis {
            if !filter.contains(&lei) {
                skipped_leis += 1;
                continue;
            }
        }

        lei_batch.entry(lei).or_default().push(isin);
        line_count += 1;

        // Process batch when full
        if lei_batch.len() >= LEI_BATCH_SIZE {
            process_and_save_figi_batch(&client, &lei_batch, &date_dir).await?;
            processed_leis += lei_batch.len();

            if processed_leis % 1000 == 0 {
                logger::log_info(&format!("Queried {} LEIs...", processed_leis)).await;
            }

            lei_batch.clear();
            tokio::task::yield_now().await;
        }
    }

    // Process remaining
    if !lei_batch.is_empty() {
        process_and_save_figi_batch(&client, &lei_batch, &date_dir).await?;
        processed_leis += lei_batch.len();
    }

    logger::log_info(&format!(
        "✓ Queried {} LEIs, skipped {} already processed",
        processed_leis,
        skipped_leis
    )).await;

    Ok(())
}

/// Check mapping completion and process only unmapped LEIs
pub async fn ensure_all_leis_mapped(
    csv_path: &str,
    gleif_date: Option<&str>,
) -> anyhow::Result<bool> {
    let dir = DataPaths::new(".")?;
    let map_cache_dir = dir.cache_gleif_openfigi_map_dir();

    let date = determine_gleif_date(gleif_date, &dir).await?;
    let date_dir = map_cache_dir.join(&date);

    // Get unmapped LEIs (excludes both mapped and no-result LEIs)
    let unmapped = get_unmapped_leis(csv_path, &date_dir).await?;

    if unmapped.is_empty() {
        logger::log_info("✓ All LEIs have been queried (mapped or confirmed no results)").await;
        return Ok(true);
    }

    logger::log_info(&format!("Found {} LEIs that need querying - starting mapping...", unmapped.len())).await;

    // Process only the unmapped LEIs
    stream_gleif_csv_and_build_figi_filtered(csv_path, gleif_date, Some(&unmapped)).await?;

    // Verify completion
    let still_unmapped = get_unmapped_leis(csv_path, &date_dir).await?;

    if still_unmapped.is_empty() {
        logger::log_info("✓ All LEIs successfully queried").await;
        Ok(true)
    } else {
        logger::log_warn(&format!(
            "⚠ {} LEIs still unqueried (API errors or rate limits)",
            still_unmapped.len()
        )).await;
        Ok(false)
    }
}
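
// End-to-end sketch (illustrative): the intended call order for a fresh run,
// assuming `csv_path` points at the GLEIF ISIN-to-LEI relationship CSV and
// `date_dir` is the dated map-cache directory computed as above.
//
//     load_figi_type_lists().await?;
//     if ensure_all_leis_mapped(csv_path, None).await? {
//         build_securities_from_figi_streaming(&date_dir).await?;
//     }
//     print_mapping_stats(csv_path).await?;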

/// Load LEIs that were queried but returned no results
async fn load_no_result_leis(date_dir: &Path) -> anyhow::Result<HashSet<String>> {
    let mut no_result_leis = HashSet::new();

    let no_results_path = date_dir.join("no_results.jsonl");
    if !no_results_path.exists() {
        return Ok(no_result_leis);
    }

    let content = tokio_fs::read_to_string(&no_results_path).await?;
    for line in content.lines() {
        if line.trim().is_empty() {
            continue;
        }

        if let Ok(entry) = serde_json::from_str::<Value>(line) {
            if let Some(lei) = entry["lei"].as_str() {
                no_result_leis.insert(lei.to_string());
            }
        }
    }

    if !no_result_leis.is_empty() {
        logger::log_info(&format!(
            "Found {} LEIs previously queried with no FIGI results",
            no_result_leis.len()
        )).await;
    }

    Ok(no_result_leis)
}

/// Save an LEI that was queried but returned no results
async fn append_no_result_lei(date_dir: &Path, lei: &str, isins: &[String]) -> anyhow::Result<()> {
    let no_results_path = date_dir.join("no_results.jsonl");

    let entry = json!({
        "lei": lei,
        "isins": isins,
        "queried_at": chrono::Local::now().format("%Y-%m-%d %H:%M:%S").to_string(),
    });

    let line = serde_json::to_string(&entry)? + "\n";

    let mut file = tokio_fs::OpenOptions::new()
        .create(true)
        .append(true)
        .open(&no_results_path)
        .await?;

    file.write_all(line.as_bytes()).await?;
    Ok(())
}
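
// Unit tests (sketch) for the two pure helpers above. These construct FigiInfo
// directly, which assumes its fields are visible within the crate, as the
// struct literal in map_isins_to_figi_infos implies.
#[cfg(test)]
mod tests {
    use super::*;

    // Minimal FigiInfo for the tests; identifier values are arbitrary.
    fn fi(isin: &str, figi: &str) -> FigiInfo {
        FigiInfo {
            isin: isin.to_string(),
            figi: figi.to_string(),
            name: String::new(),
            ticker: String::new(),
            exch_code: String::new(),
            composite_figi: String::new(),
            security_type: String::new(),
            market_sector: String::new(),
            share_class_figi: String::new(),
            security_type2: String::new(),
            security_description: String::new(),
        }
    }

    #[test]
    fn merge_figi_list_deduplicates_and_sorts() {
        let existing = vec![fi("US0000000001", "BBG000000002")];
        let incoming = vec![fi("US0000000001", "BBG000000002"), fi("US0000000001", "BBG000000001")];
        let merged = merge_figi_list(&existing, &incoming);
        assert_eq!(merged.len(), 2); // duplicate FIGI dropped
        assert_eq!(merged[0].figi, "BBG000000001"); // sorted by FIGI
    }

    #[test]
    fn group_by_isin_groups_and_sorts_within_groups() {
        let infos = vec![fi("US1", "B"), fi("US1", "A"), fi("US2", "C")];
        let grouped = group_by_isin(&infos);
        assert_eq!(grouped.len(), 2);
        let us1: Vec<_> = grouped["US1"].iter().map(|f| f.figi.as_str()).collect();
        assert_eq!(us1, vec!["A", "B"]);
    }
}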