Migrated checkpoint handling in integrity.rs to the SSOT (single source of truth) principle

This commit is contained in:
2026-01-11 13:05:31 +01:00
parent 0487c2ec49
commit aff340ee2f
15 changed files with 880 additions and 579 deletions

View File

@@ -1,159 +1,21 @@
// src/corporate/openfigi.rs - STREAMING VERSION
// src/corporate/update_openfigi.rs - STREAMING VERSION
// Key changes: Never load entire GLEIF CSV or FIGI maps into memory
use crate::util::directories::DataPaths;
use crate::util::integrity::{DataStage, StateManager, directory_reference};
use crate::util::logger;
use crate::scraper::openfigi::{OpenFigiClient};
use super::types::*;
use reqwest::Client as HttpClient;
use reqwest::header::{HeaderMap, HeaderValue};
use serde_json::{json, Value};
use std::collections::{HashMap, HashSet};
use std::path::Path;
use std::io::{BufRead, BufReader};
use tokio::time::{sleep, Duration};
use tokio::fs as tokio_fs;
use tokio::io::AsyncWriteExt;
use anyhow::{Context, anyhow};
const LEI_BATCH_SIZE: usize = 100; // Process 100 LEIs at a time
/// Wrapper around an HTTP client used for requests to the OpenFIGI API.
///
/// Instances are created with `OpenFigiClient::new`, which decides whether an
/// API key is attached to every request.
#[derive(Clone)]
pub struct OpenFigiClient {
/// Underlying HTTP client (`HttpClient` is this file's alias for `reqwest::Client`).
client: HttpClient,
/// `true` when `new` found an `OPENFIGI_API_KEY` value; code in this file
/// reads it to choose batch sizes and delays between requests.
has_key: bool,
}
impl OpenFigiClient {
/// Builds a client configured for the OpenFIGI API.
///
/// Reads `OPENFIGI_API_KEY` through `dotenvy::var`. When a key is present it is
/// installed as the default `X-OPENFIGI-APIKEY` header on every request. The
/// client always uses a fixed user agent and a 30 second timeout, and the
/// chosen mode ("with API key" / "no key") is logged.
///
/// # Errors
/// Returns an error if the key cannot be turned into a header value or if the
/// HTTP client cannot be built.
pub async fn new() -> anyhow::Result<Self> {
// A missing or unreadable variable is not an error; it just means "no key"
let api_key = dotenvy::var("OPENFIGI_API_KEY").ok();
let has_key = api_key.is_some();
let mut builder = HttpClient::builder()
.user_agent("Mozilla/5.0 (compatible; OpenFIGI-Rust/1.0)")
.timeout(Duration::from_secs(30));
if let Some(key) = &api_key {
let mut headers = HeaderMap::new();
headers.insert("X-OPENFIGI-APIKEY", HeaderValue::from_str(key)?);
builder = builder.default_headers(headers);
}
let client = builder.build().context("Failed to build HTTP client")?;
logger::log_info(&format!("OpenFIGI client: {}",
if has_key { "with API key" } else { "no key" })).await;
Ok(Self { client, has_key })
}
/// Maps ISINs to FIGI records through the OpenFIGI `/v3/mapping` endpoint.
///
/// The input is sent in chunks of 100 ISINs when an API key is configured and
/// 5 otherwise, with a pause of 240 ms (key) or 2400 ms (no key) after every
/// chunk. Each chunk has its own retry counter and backoff:
///
/// * send failures are retried with exponential backoff (1000 ms, doubling,
///   capped at 60000 ms) until the counter reaches 5;
/// * HTTP 429 waits for the `ratelimit-reset` header value in seconds
///   (10 if missing or unparsable, never less than 10) and retries without
///   touching the retry counter;
/// * 5xx responses are retried with the same backoff while the counter is
///   below 5;
/// * any other non-success status aborts with an error.
///
/// # Arguments
/// * `isins` - ISINs to look up; an empty slice returns an empty vector
///   without any request.
///
/// # Returns
/// One `FigiInfo` per returned item that carries a `figi` string. Other string
/// fields that are missing from an item are stored as empty strings.
///
/// # Errors
/// Returns an error when sending keeps failing, when reading the response body
/// fails, on a non-retryable HTTP status, or when the body is not a JSON array.
pub async fn map_isins_to_figi_infos(&self, isins: &[String]) -> anyhow::Result<Vec<FigiInfo>> {
if isins.is_empty() {
return Ok(vec![]);
}
let mut all_figi_infos = Vec::new();
// Batch size and pacing depend on whether an API key is in use
let chunk_size = if self.has_key { 100 } else { 5 };
let inter_sleep = if self.has_key {
Duration::from_millis(240)
} else {
Duration::from_millis(2400)
};
for chunk in isins.chunks(chunk_size) {
// One mapping job per ISIN in this chunk
let jobs: Vec<Value> = chunk.iter()
.map(|isin| json!({
"idType": "ID_ISIN",
"idValue": isin,
}))
.collect();
// Retry state is reset for every chunk
let mut retry_count = 0;
let max_retries = 5;
let mut backoff_ms = 1000u64;
// Repeats the request for this chunk until it succeeds or a fatal error is returned
loop {
let resp_result = self.client
.post("https://api.openfigi.com/v3/mapping")
.header("Content-Type", "application/json")
.json(&jobs)
.send()
.await;
let resp = match resp_result {
Ok(r) => r,
Err(e) => {
// The request could not be sent or no response arrived: count it, then back off or give up
retry_count += 1;
if retry_count >= max_retries {
let err_msg = format!("Failed to send mapping request after {} retries: {}", max_retries, e);
logger::log_error(&err_msg).await;
return Err(anyhow!(err_msg));
}
let warn_msg = format!("Transient error sending mapping request (attempt {}/{}): {}", retry_count, max_retries, e);
logger::log_warn(&warn_msg).await;
let retry_msg = format!(" Retrying in {}ms...", backoff_ms);
logger::log_info(&retry_msg).await;
sleep(Duration::from_millis(backoff_ms)).await;
backoff_ms = (backoff_ms * 2).min(60000); // Cap at 60s
continue;
}
};
let status = resp.status();
// Headers are cloned because reading the body below consumes the response
let headers = resp.headers().clone();
let body = resp.text().await?;
if status == 429 {
// Rate limited: wait for the advertised reset (at least 10 s) and retry;
// this path does not increment retry_count, so it is not bounded by max_retries
let reset_sec = headers
.get("ratelimit-reset")
.and_then(|v| v.to_str().ok())
.and_then(|s| s.parse::<u64>().ok())
.unwrap_or(10);
sleep(Duration::from_secs(reset_sec.max(10))).await;
continue;
} else if !status.is_success() {
// Only server errors are retried; everything else fails immediately
if status.is_server_error() && retry_count < max_retries {
retry_count += 1;
sleep(Duration::from_millis(backoff_ms)).await;
backoff_ms = (backoff_ms * 2).min(60000);
continue;
}
return Err(anyhow!("OpenFIGI error {}: {}", status, body));
}
let results: Vec<Value> = serde_json::from_str(&body)?;
// Results are paired with ISINs by position; surplus entries on either side are dropped by zip
// NOTE(review): assumes the API returns one result per job in request order — confirm
for (isin, result) in chunk.iter().zip(results) {
// Results without a `data` array contribute nothing
if let Some(data) = result["data"].as_array() {
for item in data {
// Items without a string `figi` are skipped
if let Some(figi) = item["figi"].as_str() {
all_figi_infos.push(FigiInfo {
isin: isin.clone(),
figi: figi.to_string(),
name: item["name"].as_str().unwrap_or("").to_string(),
ticker: item["ticker"].as_str().unwrap_or("").to_string(),
exch_code: item["exchCode"].as_str().unwrap_or("").to_string(),
composite_figi: item["compositeFIGI"].as_str().unwrap_or("").to_string(),
security_type: item["securityType"].as_str().unwrap_or("").to_string(),
market_sector: item["marketSector"].as_str().unwrap_or("").to_string(),
share_class_figi: item["shareClassFIGI"].as_str().unwrap_or("").to_string(),
security_type2: item["securityType2"].as_str().unwrap_or("").to_string(),
security_description: item["securityDescription"].as_str().unwrap_or("").to_string(),
});
}
}
}
}
// Chunk handled successfully; leave the retry loop
break;
}
// Pause after every chunk (including the last one) to pace requests
sleep(inter_sleep).await;
}
Ok(all_figi_infos)
}
}
async fn process_and_save_figi_batch(
client: &OpenFigiClient,
lei_batch: &HashMap<String, Vec<String>>,
@@ -224,41 +86,6 @@ async fn append_lei_to_figi_jsonl(path: &Path, lei: &str, figis: &[FigiInfo]) ->
Ok(())
}
/// Checks an OpenFIGI response for rate limiting and other HTTP errors.
///
/// On a 429 status this function waits for the number of seconds given in the
/// `ratelimit-reset` header (10 seconds if the header is missing or cannot be
/// parsed, and never less than 10 seconds) and then returns an error. The
/// request itself is NOT retried here; the caller decides what to do next.
///
/// # Arguments
/// * `resp` - The HTTP response to check.
///
/// # Returns
/// `Ok(())` when the status is neither a client error nor a server error.
///
/// # Errors
/// * `Rate limited, please retry` after the wait, when the status is 429.
/// * `OpenFIGI API error: <status>` for any other 4xx or 5xx status.
async fn handle_rate_limit(resp: &reqwest::Response) -> anyhow::Result<()> {
    let status = resp.status();

    if status == 429 {
        let reset_sec = resp
            .headers()
            .get("ratelimit-reset")
            .and_then(|v| v.to_str().ok())
            .and_then(|s| s.parse::<u64>().ok())
            .unwrap_or(10);

        // Enforce the 10 s floor once so the log reports the wait that really happens
        let wait_sec = reset_sec.max(10);
        logger::log_info(&format!(" Rate limited—waiting {}s", wait_sec)).await;
        sleep(std::time::Duration::from_secs(wait_sec)).await;

        return Err(anyhow!("Rate limited, please retry"));
    }

    if status.is_client_error() || status.is_server_error() {
        return Err(anyhow!("OpenFIGI API error: {}", status));
    }

    Ok(())
}
/// Loads or builds securities data by streaming through FIGI mapping files.
///
/// Implements abort-safe incremental persistence with checkpoints and replay logs.
@@ -276,7 +103,7 @@ pub async fn update_securities(date_dir: &Path) -> anyhow::Result<()> {
let dir = DataPaths::new(".")?;
let state_path = dir.data_dir().join("state.jsonl");
let manager = StateManager::new(&state_path, &dir.data_dir().to_path_buf());
let manager = StateManager::new(&state_path, &dir.data_dir().to_path_buf())?;
let step_name = "securities_data_complete";
let data_dir = dir.data_dir();
@@ -432,7 +259,6 @@ async fn track_securities_completion(
"securities_data_complete".to_string(),
content_reference,
DataStage::Data,
vec!["lei_figi_mapping_complete".to_string()], // Depends on LEI mapping
None, // Use default TTL (7 days)
).await?;
@@ -1110,220 +936,6 @@ async fn setup_sector_directories(
Ok(())
}
/// Loads all OpenFIGI mapping value lists (marketSecDes, micCode, securityType).
///
/// Makes sure the directory returned by `paths.cache_openfigi_dir()` exists,
/// creates an `OpenFigiClient`, and then fetches the three value lists one
/// after another. Each per-list helper stores its result as a JSON file in that
/// directory and skips the download when `should_use_cache` accepts the
/// existing file (less than 30 days old).
///
/// # Arguments
/// * `paths` - Supplies the OpenFIGI cache directory.
///
/// # Returns
/// Ok(()) on success.
///
/// # Errors
/// Returns an error if the cache directory cannot be created, the client cannot
/// be constructed, or any of the three fetch helpers fails.
pub async fn load_figi_type_lists(paths: &DataPaths) -> anyhow::Result<()> {
    logger::log_info("Loading OpenFIGI mapping value lists...").await;

    let cache_openfigi_dir = paths.cache_openfigi_dir();
    tokio_fs::create_dir_all(cache_openfigi_dir).await
        .context("Failed to create data/openfigi directory")?;

    let client = OpenFigiClient::new().await?;

    // Fetch each type list sequentially; the helpers pace their own requests
    get_figi_market_sec_des(&client, cache_openfigi_dir).await?;
    get_figi_mic_code(&client, cache_openfigi_dir).await?;
    get_figi_security_type(&client, cache_openfigi_dir).await?;

    logger::log_info("OpenFIGI mapping value lists loaded successfully").await;
    Ok(())
}
/// Downloads the valid `marketSecDes` values and stores them as `marketSecDes.json`.
///
/// The download is skipped when `should_use_cache` accepts the existing file.
/// After a successful download the function pauses (240 ms with an API key,
/// 2400 ms without) before returning.
///
/// # Arguments
/// * `client` - The OpenFIGI client instance.
/// * `cache_dir` - Directory that holds the cached JSON file.
///
/// # Returns
/// Ok(()) when the cache was reused or freshly written.
///
/// # Errors
/// Returns an error if the cache check, the request, the status check, JSON
/// handling, or the file write fails.
async fn get_figi_market_sec_des(client: &OpenFigiClient, cache_dir: &Path) -> anyhow::Result<()> {
    let target = cache_dir.join("marketSecDes.json");

    // A sufficiently fresh cache file short-circuits the download
    let cache_is_fresh = should_use_cache(&target).await?;
    if cache_is_fresh {
        logger::log_info(" Using cached marketSecDes values").await;
        return Ok(());
    }

    logger::log_info(" Fetching marketSecDes values from OpenFIGI API...").await;

    let request = client
        .client
        .get("https://api.openfigi.com/v3/mapping/values/marketSecDes");
    let response = request
        .send()
        .await
        .context("Failed to fetch marketSecDes values")?;

    // Bail out on rate limiting or any other HTTP error before reading the body
    handle_rate_limit(&response).await?;

    let payload: Value = response
        .json()
        .await
        .context("Failed to parse marketSecDes response")?;

    // Persist the response pretty-printed
    let pretty = serde_json::to_string_pretty(&payload)?;
    tokio_fs::write(&target, pretty)
        .await
        .context("Failed to write marketSecDes cache")?;
    logger::log_info(" ✓ Cached marketSecDes values").await;

    // Pace follow-up requests
    let pause_ms: u64 = if client.has_key { 240 } else { 2400 };
    sleep(Duration::from_millis(pause_ms)).await;

    Ok(())
}
/// Downloads the valid `micCode` values and stores them as `micCode.json`.
///
/// The download is skipped when `should_use_cache` accepts the existing file.
/// After a successful download the function pauses (240 ms with an API key,
/// 2400 ms without) before returning.
///
/// # Arguments
/// * `client` - The OpenFIGI client instance.
/// * `cache_dir` - Directory that holds the cached JSON file.
///
/// # Returns
/// Ok(()) when the cache was reused or freshly written.
///
/// # Errors
/// Returns an error if the cache check, the request, the status check, JSON
/// handling, or the file write fails.
async fn get_figi_mic_code(client: &OpenFigiClient, cache_dir: &Path) -> anyhow::Result<()> {
    let target = cache_dir.join("micCode.json");

    // Reuse the cached file when it is still considered fresh
    let cache_is_fresh = should_use_cache(&target).await?;
    if cache_is_fresh {
        logger::log_info(" Using cached micCode values").await;
        return Ok(());
    }

    logger::log_info(" Fetching micCode values from OpenFIGI API...").await;

    let request = client
        .client
        .get("https://api.openfigi.com/v3/mapping/values/micCode");
    let response = request
        .send()
        .await
        .context("Failed to fetch micCode values")?;

    // Bail out on rate limiting or any other HTTP error before reading the body
    handle_rate_limit(&response).await?;

    let payload: Value = response
        .json()
        .await
        .context("Failed to parse micCode response")?;

    // Persist the response pretty-printed
    let pretty = serde_json::to_string_pretty(&payload)?;
    tokio_fs::write(&target, pretty)
        .await
        .context("Failed to write micCode cache")?;
    logger::log_info(" ✓ Cached micCode values").await;

    // Pace follow-up requests
    let pause_ms: u64 = if client.has_key { 240 } else { 2400 };
    sleep(Duration::from_millis(pause_ms)).await;

    Ok(())
}
/// Checks if a cache file exists and is less than 30 days old.
///
/// The file is inspected with a single asynchronous metadata lookup. If that
/// lookup fails for any reason (most commonly because the file does not exist)
/// the cache is treated as absent, which matches what `Path::exists` reports in
/// the same situations without blocking the async runtime or stat-ing twice.
///
/// # Arguments
/// * `path` - Path to the cache file.
///
/// # Returns
/// True if the cache should be used, false if it needs refreshing.
///
/// # Errors
/// Returns an error if the platform cannot report the file's modification time.
async fn should_use_cache(path: &Path) -> anyhow::Result<bool> {
    let metadata = match tokio_fs::metadata(path).await {
        Ok(metadata) => metadata,
        // Missing or inaccessible file: nothing to reuse
        Err(_) => return Ok(false),
    };

    let modified = metadata.modified()?;
    // A modification time in the future makes `elapsed` fail; treat such a
    // file as infinitely old so it gets refreshed
    let age = modified
        .elapsed()
        .unwrap_or(std::time::Duration::from_secs(u64::MAX));

    // Cache is valid for 30 days
    Ok(age < std::time::Duration::from_secs(30 * 24 * 60 * 60))
}
/// Downloads the valid `securityType` values and stores them as `securityType.json`.
///
/// The download is skipped when `should_use_cache` accepts the existing file.
/// After a successful download the function pauses (240 ms with an API key,
/// 2400 ms without) before returning.
///
/// # Arguments
/// * `client` - The OpenFIGI client instance.
/// * `cache_dir` - Directory that holds the cached JSON file.
///
/// # Returns
/// Ok(()) when the cache was reused or freshly written.
///
/// # Errors
/// Returns an error if the cache check, the request, the status check, JSON
/// handling, or the file write fails.
async fn get_figi_security_type(client: &OpenFigiClient, cache_dir: &Path) -> anyhow::Result<()> {
    let target = cache_dir.join("securityType.json");

    // Reuse the cached file when it is still considered fresh
    let cache_is_fresh = should_use_cache(&target).await?;
    if cache_is_fresh {
        logger::log_info(" Using cached securityType values").await;
        return Ok(());
    }

    logger::log_info(" Fetching securityType values from OpenFIGI API...").await;

    let request = client
        .client
        .get("https://api.openfigi.com/v3/mapping/values/securityType");
    let response = request
        .send()
        .await
        .context("Failed to fetch securityType values")?;

    // Bail out on rate limiting or any other HTTP error before reading the body
    handle_rate_limit(&response).await?;

    let payload: Value = response
        .json()
        .await
        .context("Failed to parse securityType response")?;

    // Persist the response pretty-printed
    let pretty = serde_json::to_string_pretty(&payload)?;
    tokio_fs::write(&target, pretty)
        .await
        .context("Failed to write securityType cache")?;
    logger::log_info(" ✓ Cached securityType values").await;

    // Pace follow-up requests
    let pause_ms: u64 = if client.has_key { 240 } else { 2400 };
    sleep(Duration::from_millis(pause_ms)).await;

    Ok(())
}
#[derive(Debug)]
pub struct MappingStats {
pub total_leis: usize,
@@ -1621,7 +1233,7 @@ pub async fn update_lei_mapping(
) -> anyhow::Result<bool> {
let dir = DataPaths::new(".")?;
let state_path = dir.cache_dir().join("state.jsonl");
let manager = StateManager::new(&state_path, &dir.cache_dir().to_path_buf());
let manager = StateManager::new(&state_path, &dir.cache_dir().to_path_buf())?;
let step_name = "lei_figi_mapping_complete";
let map_cache_dir = dir.cache_gleif_openfigi_map_dir();
@@ -1694,7 +1306,6 @@ async fn track_lei_mapping_completion(
"lei_figi_mapping_complete".to_string(),
content_reference,
DataStage::Cache, // 24-hour TTL for API data
vec![], // No dependencies
None, // Use default TTL
).await?;