moved helper functions into helpers.rs

This commit is contained in:
2026-01-12 22:06:13 +01:00
parent 29d8f1d89e
commit 98e1bca12f
12 changed files with 436 additions and 292 deletions

View File

@@ -1,7 +1,8 @@
// src/corporate/update_openfigi.rs - STREAMING VERSION
// Key changes: Never load entire GLEIF CSV or FIGI maps into memory
use super::types::*;
use super::helpers::{find_most_recent_figi_date_dir, determine_gleif_date};
use super::bond_processing::*;
use super::option_processing::*;
use crate::util::directories::DataPaths;
use crate::util::integrity::{DataStage, StateManager, directory_reference};
use crate::util::logger;
@@ -15,76 +16,6 @@ use anyhow::{Context, anyhow};
const LEI_BATCH_SIZE: usize = 100; // Process 100 LEIs at a time
/// Resolves every LEI in `lei_batch` to its FIGI records and persists the outcome.
///
/// For each LEI the ISIN list is de-duplicated and sent to
/// `client.map_isins_to_figi_infos`. A non-empty answer is written out per
/// market sector via `save_figi_infos_by_sector`; an empty answer is recorded
/// through `append_no_result_lei` so the LEI does not have to be queried again.
///
/// # Errors
/// Stops at and returns the first error raised by the client call or by either
/// persistence helper.
async fn process_and_save_figi_batch(
    client: &OpenFigiClient,
    lei_batch: &HashMap<String, Vec<String>>,
    date_dir: &Path,
) -> anyhow::Result<()> {
    for (lei, isins) in lei_batch {
        // Round-trip through a set to drop duplicate ISINs before querying.
        let distinct: HashSet<_> = isins.iter().cloned().collect();
        let unique_isins: Vec<_> = distinct.into_iter().collect();

        let figi_infos = client.map_isins_to_figi_infos(&unique_isins).await?;

        if !figi_infos.is_empty() {
            // At least one FIGI came back: store the records grouped by sector.
            save_figi_infos_by_sector(lei, &figi_infos, date_dir).await?;
        } else {
            // Nothing found: remember this LEI in no_results.jsonl to avoid re-querying.
            append_no_result_lei(date_dir, lei, &unique_isins).await?;
        }
    }

    Ok(())
}
/// Buckets the FIGI records of one LEI by market sector and appends each bucket
/// to `<date_dir>/<sector>/lei_to_figi.jsonl`.
///
/// Records whose `market_sector` is empty are filed under `"uncategorized"`.
///
/// # Errors
/// Returns the first error produced by `append_lei_to_figi_jsonl`.
async fn save_figi_infos_by_sector(
    lei: &str,
    figi_infos: &[FigiInfo],
    date_dir: &Path,
) -> anyhow::Result<()> {
    let mut buckets: HashMap<String, Vec<FigiInfo>> = HashMap::new();

    for info in figi_infos {
        let mut sector_name = info.market_sector.clone();
        if sector_name.is_empty() {
            sector_name = "uncategorized".to_string();
        }
        buckets.entry(sector_name).or_default().push(info.clone());
    }

    for (sector_name, sector_figis) in buckets {
        let jsonl_path = date_dir.join(&sector_name).join("lei_to_figi.jsonl");
        append_lei_to_figi_jsonl(&jsonl_path, lei, &sector_figis).await?;
    }

    Ok(())
}
/// Appends one JSON line of the form `{"lei": ..., "figis": [...]}` to `path`.
///
/// The file is opened in append mode and created if it does not exist yet.
///
/// # Errors
/// Returns an error if serialization fails or if the file cannot be opened or
/// written.
async fn append_lei_to_figi_jsonl(path: &Path, lei: &str, figis: &[FigiInfo]) -> anyhow::Result<()> {
    let record = json!({
        "lei": lei,
        "figis": figis,
    });

    // One record per line: serialize, then terminate with a newline.
    let mut serialized = serde_json::to_string(&record)?;
    serialized.push('\n');

    let mut open_options = tokio_fs::OpenOptions::new();
    open_options.create(true).append(true);

    let mut out = open_options.open(path).await?;
    out.write_all(serialized.as_bytes()).await?;

    Ok(())
}
/// Loads or builds securities data by streaming through FIGI mapping files.
///
/// Implements abort-safe incremental persistence with checkpoints and replay logs.
@@ -97,13 +28,16 @@ async fn append_lei_to_figi_jsonl(path: &Path, lei: &str, figis: &[FigiInfo]) ->
///
/// # Errors
/// Returns an error if file I/O fails or JSON parsing fails.
pub async fn update_securities(date_dir: &Path) -> anyhow::Result<()> {
pub async fn update_securities() -> anyhow::Result<()> {
logger::log_info("Building securities data from FIGI mappings...").await;
let dir = DataPaths::new(".")?;
let manager = StateManager::new(&dir.integrity_dir()).await?;
let step_name = "securities_data_complete";
let date_dir = find_most_recent_figi_date_dir(&dir).await?
.ok_or_else(|| anyhow!("No FIGI date directory found"))?;
let data_dir = dir.data_dir();
let output_dir = data_dir.join("figi_securities");
tokio_fs::create_dir_all(&output_dir).await
@@ -515,11 +449,11 @@ async fn process_lei_figi_file_batched(
let batch_size = 100;
let mut processed_count = 0;
let mut common_batch: Vec<CompanyInfo> = Vec::new();
let mut warrants_batch: Vec<WarrantInfo> = Vec::new();
let mut common_batch: Vec<CompanyData> = Vec::new();
let mut warrants_batch: Vec<WarrantData> = Vec::new();
let mut options_batch: Vec<OptionData> = Vec::new();
let mut corporate_bonds_batch: Vec<CorporateBondInfo> = Vec::new();
let mut government_bonds_batch: Vec<GovernmentBondInfo> = Vec::new();
let mut corporate_bonds_batch: Vec<CorporateBondData> = Vec::new();
let mut government_bonds_batch: Vec<GovernmentBondData> = Vec::new();
for (line_num, line) in content.lines().enumerate() {
if line.trim().is_empty() {
@@ -529,7 +463,7 @@ async fn process_lei_figi_file_batched(
let entry: Value = serde_json::from_str(line)
.context(format!("Failed to parse JSON on line {}", line_num + 1))?;
let figis: Vec<FigiInfo> = serde_json::from_value(entry["figis"].clone())
let figis: Vec<FigiData> = serde_json::from_value(entry["figis"].clone())
.context("Invalid 'figis' field")?;
if figis.is_empty() {
@@ -552,7 +486,7 @@ async fn process_lei_figi_file_batched(
if !warrant_securities.is_empty() {
for entry in prepare_warrant_entries(&warrant_securities, existing_warrants) {
// Add to existing set immediately
let key = format!("{}::{}", entry.underlying_company_name, entry.warrant_type);
let key = entry.company_id.clone();
existing_warrants.insert(key);
warrants_batch.push(entry);
}
@@ -561,7 +495,7 @@ async fn process_lei_figi_file_batched(
if !option_securities.is_empty() {
for entry in prepare_option_entries(&option_securities, existing_options) {
// Add to existing set immediately
let key = format!("{}::{}", entry.underlying_company_name, entry.option_type);
let key = entry.company_name.clone();
existing_options.insert(key);
options_batch.push(entry);
}
@@ -680,9 +614,9 @@ async fn write_batch_with_fsync<T: serde::Serialize>(
/// Prepares a common stock entry if it doesn't exist
fn prepare_common_stock_entry(
figi_infos: &[FigiInfo],
figi_infos: &[FigiData],
existing_keys: &HashSet<String>,
) -> Option<CompanyInfo> {
) -> Option<CompanyData> {
let name = figi_infos[0].name.clone();
if name.is_empty() || existing_keys.contains(&name) {
return None;
@@ -690,42 +624,78 @@ fn prepare_common_stock_entry(
let grouped_by_isin = group_figis_by_isin(figi_infos);
let primary_isin = grouped_by_isin.keys().next().cloned().unwrap_or_default();
let id = format!("company_{}", std::time::SystemTime::now()
.duration_since(std::time::UNIX_EPOCH)
.unwrap_or_default()
.as_nanos());
Some(CompanyInfo {
Some(CompanyData {
id,
name,
primary_isin,
securities: grouped_by_isin,
yahoo_company_data: None,
})
}
/// Prepares warrant entries for batching
fn prepare_warrant_entries(
warrant_securities: &[FigiInfo],
warrant_securities: &[FigiData],
existing_keys: &HashSet<String>,
) -> Vec<WarrantInfo> {
) -> Vec<WarrantData> {
let mut entries = Vec::new();
// Group by underlying company
let mut grouped: HashMap<String, Vec<(String, FigiData)>> = HashMap::new();
for figi in warrant_securities {
let (underlying, issuer, warrant_type) = parse_warrant_name(&figi.name);
let (underlying, _issuer, warrant_type) = parse_warrant_name(&figi.name);
if underlying.is_empty() {
continue;
}
let key = format!("{}::{}", underlying, warrant_type);
if existing_keys.contains(&key) {
grouped.entry(underlying.clone())
.or_default()
.push((warrant_type, figi.clone()));
}
// Create WarrantData for each underlying company
for (underlying_company, contracts) in grouped {
if existing_keys.contains(&underlying_company) {
continue;
}
let warrant_info = WarrantInfo {
underlying_company_name: underlying.clone(),
issuer_company_name: issuer,
warrant_type: warrant_type.clone(),
warrants: {
let mut map = HashMap::new();
map.insert(figi.isin.clone(), vec![figi.clone()]);
map
},
let company_id = format!("warrant_{}", std::time::SystemTime::now()
.duration_since(std::time::UNIX_EPOCH)
.unwrap_or_default()
.as_nanos());
let mut warrants_by_type: HashMap<String, WarrantDetails> = HashMap::new();
for (warrant_type, figi) in contracts {
let (_, issuer, _) = parse_warrant_name(&figi.name);
let warrant_detail = WarrantDetails {
company_id: company_id.clone(),
company_name: underlying_company.clone(),
issuer_company_name: issuer,
warrant_type: warrant_type.clone(),
warrants: {
let mut map = HashMap::new();
map.insert(figi.isin.clone(), vec![figi.clone()]);
map
},
};
let key = format!("{}_{}", underlying_company, warrant_type);
warrants_by_type.insert(key, warrant_detail);
}
let warrant_info = WarrantData {
company_id,
company_name: underlying_company.clone(),
warrants: warrants_by_type,
};
entries.push(warrant_info);
@@ -735,36 +705,105 @@ fn prepare_warrant_entries(
}
/// Prepares option entries for batching
///
/// Groups option contracts by underlying company, extracts strike prices and expiration dates,
/// and builds OptionChain structures organizing calls and puts by expiration date.
///
/// # Arguments
/// * `option_securities` - List of FigiData objects for option contracts
/// * `existing_keys` - Set of already-processed keys (format: "company_name")
///
/// # Returns
/// Vector of OptionData entries, one per unique underlying company
fn prepare_option_entries(
option_securities: &[FigiInfo],
option_securities: &[FigiData],
existing_keys: &HashSet<String>,
) -> Vec<OptionData> {
let mut entries = Vec::new();
// Group by underlying company
let mut grouped: HashMap<String, Vec<(String, FigiData)>> = HashMap::new();
for figi in option_securities {
let (underlying, issuer, option_type) = parse_option_name(&figi.name);
let (underlying, _issuer, option_type) = parse_option_name(&figi.name);
if underlying.is_empty() {
continue;
}
let key = format!("{}::{}", underlying, option_type);
if existing_keys.contains(&key) {
grouped.entry(underlying.clone())
.or_default()
.push((option_type, figi.clone()));
}
// Create OptionData for each underlying company
for (underlying_company, contracts) in grouped {
if existing_keys.contains(&underlying_company) {
continue;
}
let option_info = OptionData {
underlying_company_name: underlying.clone(),
issuer_company_name: issuer,
option_type: option_type.clone(),
options: {
let mut map = HashMap::new();
map.insert(figi.isin.clone(), vec![figi.clone()]);
map
},
// Build OptionContracts and extract strikes/expirations
let mut option_contracts: HashMap<i64, (Vec<OptionContract>, Vec<OptionContract>)> = HashMap::new();
let mut all_strikes: std::collections::HashSet<u64> = std::collections::HashSet::new();
for (option_type, figi) in contracts {
// Parse strike price from ticker if available
let strike = parse_strike_from_ticker(&figi.ticker).unwrap_or(0.0);
let expiration = parse_expiration_from_ticker(&figi.ticker).unwrap_or(0);
if strike > 0.0 && expiration > 0 {
all_strikes.insert((strike * 100.0) as u64);
let contract = OptionContract {
strike,
last_price: None,
bid: None,
ask: None,
volume: None,
open_interest: None,
implied_volatility: None,
};
let entry = option_contracts.entry(expiration).or_insert((Vec::new(), Vec::new()));
match option_type.as_str() {
"call" => entry.0.push(contract),
"put" => entry.1.push(contract),
_ => {}
}
}
}
// Build OptionChains from contracts
let mut option_chains = Vec::new();
let mut expiration_dates = Vec::new();
for (expiration, (calls, puts)) in option_contracts {
expiration_dates.push(expiration);
option_chains.push(OptionChain {
expiration_date: expiration,
calls,
puts,
});
}
expiration_dates.sort();
option_chains.sort_by_key(|oc| oc.expiration_date);
let strikes: Vec<f64> = all_strikes
.iter()
.map(|s| *s as f64 / 100.0)
.collect::<Vec<_>>();
let option_data = OptionData {
company_id: underlying_company.clone(),
company_name: underlying_company.clone(),
expiration_dates,
strikes,
option: option_chains,
timestamp: chrono::Utc::now().timestamp(),
};
entries.push(option_info);
entries.push(option_data);
}
entries
@@ -782,13 +821,13 @@ fn prepare_option_entries(
/// # Returns
/// Vector of CorporateBondData entries, one per unique issuer
fn prepare_corporate_bond_entries(
corporate_bond_securities: &[FigiInfo],
corporate_bond_securities: &[FigiData],
existing_keys: &HashSet<String>,
) -> Vec<CorporateBondInfo> {
) -> Vec<CorporateBondData> {
let mut entries = Vec::new();
// Group bonds by issuer (company name)
let mut grouped: HashMap<String, Vec<FigiInfo>> = HashMap::new();
let mut grouped: HashMap<String, Vec<FigiData>> = HashMap::new();
for figi in corporate_bond_securities {
let issuer = figi.name.clone();
@@ -819,7 +858,7 @@ fn prepare_corporate_bond_entries(
}
}
let bond_info = CorporateBondInfo {
let bond_info = CorporateBondData {
underlying_company_name: issuer.clone(),
bonds: bonds_by_isin,
bond_details: bond_details_map,
@@ -844,13 +883,13 @@ fn prepare_corporate_bond_entries(
/// # Returns
/// Vector of GovernmentBondData entries, one per unique issuer
fn prepare_government_bond_entries(
government_bond_securities: &[FigiInfo],
government_bond_securities: &[FigiData],
existing_keys: &HashSet<String>,
) -> Vec<GovernmentBondInfo> {
) -> Vec<GovernmentBondData> {
let mut entries = Vec::new();
// Group bonds by issuer (country/entity name)
let mut grouped: HashMap<String, Vec<FigiInfo>> = HashMap::new();
let mut grouped: HashMap<String, Vec<FigiData>> = HashMap::new();
for figi in government_bond_securities {
let issuer = figi.name.clone();
@@ -884,7 +923,7 @@ fn prepare_government_bond_entries(
}
}
let bond_info = GovernmentBondInfo {
let bond_info = GovernmentBondData {
issuer_name: issuer.clone(),
issuer_type,
bonds: bonds_by_isin,
@@ -898,12 +937,12 @@ fn prepare_government_bond_entries(
}
/// Groups a FigiData list by security type
fn group_securities(figis: &[FigiInfo]) -> (Vec<FigiInfo>, Vec<FigiInfo>, Vec<FigiInfo>, Vec<FigiInfo>, Vec<FigiInfo>) {
let mut common_stocks:Vec<FigiInfo> = Vec::new();
let mut warrants:Vec<FigiInfo> = Vec::new();
let mut options:Vec<FigiInfo> = Vec::new();
let mut corporate_bonds:Vec<FigiInfo> = Vec::new();
let mut government_bonds:Vec<FigiInfo> = Vec::new();
fn group_securities(figis: &[FigiData]) -> (Vec<FigiData>, Vec<FigiData>, Vec<FigiData>, Vec<FigiData>, Vec<FigiData>) {
let mut common_stocks:Vec<FigiData> = Vec::new();
let mut warrants:Vec<FigiData> = Vec::new();
let mut options:Vec<FigiData> = Vec::new();
let mut corporate_bonds:Vec<FigiData> = Vec::new();
let mut government_bonds:Vec<FigiData> = Vec::new();
for figi in figis {
match figi.security_type.as_str() {
@@ -923,8 +962,8 @@ fn group_securities(figis: &[FigiInfo]) -> (Vec<FigiInfo>, Vec<FigiInfo>, Vec<Fi
}
/// Groups FigiData by ISIN
fn group_figis_by_isin(figi_infos: &[FigiInfo]) -> HashMap<String, Vec<FigiInfo>> {
let mut grouped: HashMap<String, Vec<FigiInfo>> = HashMap::new();
fn group_figis_by_isin(figi_infos: &[FigiData]) -> HashMap<String, Vec<FigiData>> {
let mut grouped: HashMap<String, Vec<FigiData>> = HashMap::new();
for figi_info in figi_infos {
grouped.entry(figi_info.isin.clone())
@@ -994,33 +1033,6 @@ fn parse_warrant_name(name: &str) -> (String, Option<String>, String) {
(name.to_string(), None, warrant_type)
}
/// Parse option name to extract underlying company, issuer, and option type
///
/// Keyword matching ("CALL", "PUT", " ON ") is ASCII case-insensitive.
///
/// # Returns
/// A tuple `(underlying, issuer, option_type)`:
/// * `underlying` - the trimmed text after the first " on " marker, or the
///   whole (untrimmed) name when no marker is present
/// * `issuer` - always `None`; this parser does not extract an issuer
/// * `option_type` - `"call"`, `"put"`, or `"unknown"`
///
/// Examples:
/// - "December 25 Calls on ALPHA GA" -> ("ALPHA GA", None, "call")
/// - "January 26 Puts on TESLA INC" -> ("TESLA INC", None, "put")
fn parse_option_name(name: &str) -> (String, Option<String>, String) {
    // ASCII-only uppercasing leaves every byte offset identical to `name`, so a
    // position found in `name_upper` can be used to slice `name`. Full Unicode
    // `to_uppercase()` may change byte lengths (e.g. "ﬁ" -> "FI"), which made the
    // slice below return the wrong text or panic on a bad index.
    let name_upper = name.to_ascii_uppercase();

    // Detect option type
    let option_type = if name_upper.contains("CALL") {
        "call"
    } else if name_upper.contains("PUT") {
        "put"
    } else {
        "unknown"
    }
    .to_string();

    // Try to extract underlying after "on"; fall back to the entire name.
    const MARKER: &str = " ON ";
    let underlying = match name_upper.find(MARKER) {
        // The marker ends in an ASCII space, so the slice start is a char boundary.
        Some(pos) => name[pos + MARKER.len()..].trim(),
        None => name,
    };

    (underlying.to_string(), None, option_type)
}
/// Statistics tracker for streaming processing
#[derive(Debug)]
struct StreamingStats {
@@ -1104,33 +1116,6 @@ async fn load_market_sectors() -> anyhow::Result<Vec<String>> {
Ok(sectors)
}
/// Resolves which GLEIF date to use.
///
/// When `gleif_date` is provided it is returned unchanged. Otherwise the
/// directory returned by `paths.cache_gleif_dir()` is scanned and the greatest
/// (by string ordering) sub-directory name consisting of exactly eight ASCII
/// digits is returned. NOTE(review): the eight digits are presumably a
/// `YYYYMMDD` date; the format is not validated beyond length and digits.
///
/// # Errors
/// Returns an error if the directory cannot be read, if iterating its entries
/// fails, or if no matching sub-directory exists ("No GLEIF date found").
async fn determine_gleif_date(
    gleif_date: Option<&str>,
    paths: &DataPaths,
) -> anyhow::Result<String> {
    if let Some(d) = gleif_date {
        return Ok(d.to_string());
    }

    let gleif_dir = paths.cache_gleif_dir();
    let mut entries = tokio_fs::read_dir(gleif_dir).await?;
    let mut dates = Vec::new();

    while let Some(entry) = entries.next_entry().await? {
        let path = entry.path();
        if !path.is_dir() {
            continue;
        }

        if let Some(name) = path.file_name().and_then(|n| n.to_str()) {
            // `len()` counts bytes, so pair it with an ASCII-digit check.
            // `char::is_numeric` also accepts non-ASCII numerals, which let
            // names such as four 2-byte digits pass as an 8-character date.
            if name.len() == 8 && name.bytes().all(|b| b.is_ascii_digit()) {
                dates.push(name.to_string());
            }
        }
    }

    // Fixed-width digit strings sort chronologically under string ordering.
    dates.sort();
    dates.last().cloned().ok_or_else(|| anyhow!("No GLEIF date found"))
}
async fn setup_sector_directories(
date_dir: &Path,
sector_dirs: &[String],
@@ -1570,4 +1555,74 @@ async fn append_no_result_lei(date_dir: &Path, lei: &str, isins: &[String]) -> a
file.write_all(line.as_bytes()).await?;
Ok(())
}
}
/// Resolves every LEI in `lei_batch` to its FIGI records and persists the outcome.
///
/// For each LEI the ISIN list is de-duplicated and sent to
/// `client.map_isins_to_figi_infos`. A non-empty answer is written out per
/// market sector via `save_figi_infos_by_sector`; an empty answer is recorded
/// through `append_no_result_lei` so the LEI does not have to be queried again.
///
/// # Errors
/// Stops at and returns the first error raised by the client call or by either
/// persistence helper.
async fn process_and_save_figi_batch(
    client: &OpenFigiClient,
    lei_batch: &HashMap<String, Vec<String>>,
    date_dir: &Path,
) -> anyhow::Result<()> {
    for (lei, isins) in lei_batch {
        // Round-trip through a set to drop duplicate ISINs before querying.
        let distinct: HashSet<_> = isins.iter().cloned().collect();
        let unique_isins: Vec<_> = distinct.into_iter().collect();

        let figi_infos = client.map_isins_to_figi_infos(&unique_isins).await?;

        if !figi_infos.is_empty() {
            // At least one FIGI came back: store the records grouped by sector.
            save_figi_infos_by_sector(lei, &figi_infos, date_dir).await?;
        } else {
            // Nothing found: remember this LEI in no_results.jsonl to avoid re-querying.
            append_no_result_lei(date_dir, lei, &unique_isins).await?;
        }
    }

    Ok(())
}
/// Buckets the FIGI records of one LEI by market sector and appends each bucket
/// to `<date_dir>/<sector>/lei_to_figi.jsonl`.
///
/// Records whose `market_sector` is empty are filed under `"uncategorized"`.
///
/// # Errors
/// Returns the first error produced by `append_lei_to_figi_jsonl`.
async fn save_figi_infos_by_sector(
    lei: &str,
    figi_infos: &[FigiData],
    date_dir: &Path,
) -> anyhow::Result<()> {
    let mut buckets: HashMap<String, Vec<FigiData>> = HashMap::new();

    for info in figi_infos {
        let mut sector_name = info.market_sector.clone();
        if sector_name.is_empty() {
            sector_name = "uncategorized".to_string();
        }
        buckets.entry(sector_name).or_default().push(info.clone());
    }

    for (sector_name, sector_figis) in buckets {
        let jsonl_path = date_dir.join(&sector_name).join("lei_to_figi.jsonl");
        append_lei_to_figi_jsonl(&jsonl_path, lei, &sector_figis).await?;
    }

    Ok(())
}
/// Appends one JSON line of the form `{"lei": ..., "figis": [...]}` to `path`.
///
/// The file is opened in append mode and created if it does not exist yet.
///
/// # Errors
/// Returns an error if serialization fails or if the file cannot be opened or
/// written.
async fn append_lei_to_figi_jsonl(path: &Path, lei: &str, figis: &[FigiData]) -> anyhow::Result<()> {
    let record = json!({
        "lei": lei,
        "figis": figis,
    });

    // One record per line: serialize, then terminate with a newline.
    let mut serialized = serde_json::to_string(&record)?;
    serialized.push('\n');

    let mut open_options = tokio_fs::OpenOptions::new();
    open_options.create(true).append(true);

    let mut out = open_options.open(path).await?;
    out.write_all(serialized.as_bytes()).await?;

    Ok(())
}