added data streaming instead of loading
@@ -1,8 +1,8 @@
// src/corporate/openfigi.rs
use super::{types::*};
use crate::util::directories::DataPaths;
use crate::util::logger;
use reqwest::Client as HttpClient;
use reqwest::header::{HeaderMap, HeaderValue};
use serde_json::{json, Value};
@@ -15,6 +15,7 @@ use tokio::time::{sleep, Duration};
use tokio::fs as tokio_fs;
use tokio::io::AsyncWriteExt;
use anyhow::{Context, anyhow};
use std::io::BufRead;

#[derive(Clone)]
pub struct OpenFigiClient {
@@ -933,97 +934,6 @@ async fn remove_leis_batch_from_gleif_csv(gleif_cache_dir: &Path, leis_to_remove
    Ok(())
}

/// Loads or builds HashMaps for companies, warrants, and options.
///
/// This function:
/// 1. Attempts to load existing data from cache
/// 2. Processes new FIGI data and classifies by securityType:
///    - "Common Stock" → companies HashMap (grouped by ISIN)
///    - "Equity WRT" → warrants HashMap (parsed from name)
///    - "Equity Option" → options HashMap (parsed from name)
/// 3. Updates/extends existing entries
/// 4. Saves results to separate JSON files
///
/// # Arguments
/// * `figi_to_lei` - HashMap mapping LEI to Vec<FigiInfo>.
///
/// # Returns
/// A tuple of (companies, warrants, options) HashMaps.
///
/// # Errors
/// Returns an error if file I/O fails or JSON serialization fails.
pub async fn load_or_build_all_securities(
    figi_to_lei: &HashMap<String, Vec<FigiInfo>>
) -> anyhow::Result<(
    HashMap<String, CompanyInfo>,
    HashMap<String, HashMap<String, WarrantInfo>>,
    HashMap<String, HashMap<String, OptionInfo>>
)> {
    // Load existing data
    let mut commons = load_from_cache("data/corporate/by_name/common_stocks.json").await?
        .unwrap_or_else(HashMap::new);
    let mut warrants = load_from_cache("data/corporate/by_name/warrants.json").await?
        .unwrap_or_else(HashMap::new);
    let mut options = load_from_cache("data/corporate/by_name/options.json").await?
        .unwrap_or_else(HashMap::new);
    /*let mut preferred = load_from_cache("data/corporate/by_name/preferred.json").await?
        .unwrap_or_else(HashMap::new);*/

    println!("Loaded existing data:");
    println!(" - Companies: {}", commons.len());
    println!(" - Warrants: {}", warrants.len());
    println!(" - Options: {}", options.len());

    let mut stats = ProcessingStats::new(commons.len(), warrants.len(), options.len());

    println!("Processing {} LEI entries from FIGI data...", figi_to_lei.len());

    for (_lei, figi_infos) in figi_to_lei.iter() {
        if figi_infos.is_empty() {
            continue;
        }

        // Group FigiInfos by security type
        let mut common_stocks = Vec::new();
        let mut warrant_securities = Vec::new();
        let mut option_securities = Vec::new();

        for figi_info in figi_infos {
            match figi_info.security_type.as_str() {
                "Common Stock" => common_stocks.push(figi_info.clone()),
                "Equity WRT" => warrant_securities.push(figi_info.clone()),
                "Equity Option" => option_securities.push(figi_info.clone()),
                _ => {} // Ignore other types
            }
        }

        // Process common stocks -> companies
        if !common_stocks.is_empty() {
            process_common_stocks(&mut commons, &common_stocks, &mut stats);
        }

        // Process warrants
        if !warrant_securities.is_empty() {
            process_warrants(&mut warrants, &warrant_securities, &mut stats);
        }

        // Process options
        if !option_securities.is_empty() {
            process_options(&mut options, &option_securities, &mut stats);
        }
    }

    stats.print_summary(commons.len(), warrants.len(), options.len());

    // Save all three HashMaps
    save_to_cache("data/corporate/by_name/common_stocks.json", &commons).await?;
    save_to_cache("data/corporate/by_name/warrants.json", &warrants).await?;
    save_to_cache("data/corporate/by_name/options.json", &options).await?;

    Ok((commons, warrants, options))
}

/// Statistics tracker for processing
#[derive(Debug)]
struct ProcessingStats {
@@ -1583,5 +1493,299 @@ async fn handle_rate_limit(resp: &reqwest::Response) -> anyhow::Result<()> {
        return Err(anyhow!("OpenFIGI API error: {}", status));
    }

    Ok(())
}

pub async fn stream_gleif_csv<F>(
    csv_path: &str,
    mut callback: F
) -> anyhow::Result<usize>
where
    F: FnMut(String, String) -> anyhow::Result<()>,
{
    logger::log_info(&format!("Streaming GLEIF CSV: {}", csv_path)).await;

    let file = std::fs::File::open(csv_path)
        .context("Failed to open GLEIF CSV")?;

    let reader = std::io::BufReader::new(file);
    let mut count = 0;

    for (idx, line) in reader.lines().enumerate() {
        let line = line.context("Failed to read line")?;

        // Skip header
        if idx == 0 {
            continue;
        }

        // Parse CSV line
        let parts: Vec<&str> = line.split(',').collect();
        if parts.len() < 2 {
            continue;
        }

        let lei = parts[0].trim().trim_matches('"').to_string();
        let isin = parts[1].trim().trim_matches('"').to_string();

        if !lei.is_empty() && !isin.is_empty() {
            callback(lei, isin)?;
            count += 1;
        }

        // Yield periodically
        if count % 10000 == 0 {
            tokio::task::yield_now().await;
        }
    }

    logger::log_info(&format!("Streamed {} LEI-ISIN pairs", count)).await;
    Ok(count)
}

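A minimal usage sketch (illustration only, not part of this commit): collect the streamed pairs into a LEI → ISINs map. The CSV path is a placeholder; the callback signature matches the function above.

async fn collect_lei_to_isins() -> anyhow::Result<std::collections::HashMap<String, Vec<String>>> {
    let mut lei_to_isins: std::collections::HashMap<String, Vec<String>> =
        std::collections::HashMap::new();

    // Each callback invocation receives one validated LEI-ISIN pair.
    let count = stream_gleif_csv("cache/gleif/isin_lei.csv", |lei, isin| {
        lei_to_isins.entry(lei).or_default().push(isin);
        Ok(())
    })
    .await?;

    println!("collected {} pairs across {} LEIs", count, lei_to_isins.len());
    Ok(lei_to_isins)
}
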
/// Process FIGI mappings in batches instead of all at once
pub async fn process_figi_mappings_streaming(
    lei_to_isins_stream: impl Iterator<Item = (String, Vec<String>)>,
    gleif_date: Option<&str>,
    batch_size: usize,
) -> anyhow::Result<()> {
    let dir = DataPaths::new(".")?;
    let map_cache_dir = dir.cache_gleif_openfigi_map_dir();

    let date = determine_gleif_date(gleif_date, &dir).await?;
    let date_dir = map_cache_dir.join(&date);
    tokio_fs::create_dir_all(&date_dir).await?;

    // Setup sector directories
    let sector_dirs = load_market_sectors().await?;
    setup_sector_directories(&date_dir, &sector_dirs).await?;

    let client = OpenFigiClient::new().await?;
    if !client.has_key {
        logger::log_warn("No API key - limited FIGI mapping").await;
        return Ok(());
    }

    // Process in batches
    let mut batch = Vec::new();
    let mut processed = 0;

    for (lei, isins) in lei_to_isins_stream {
        batch.push((lei, isins));

        if batch.len() >= batch_size {
            process_figi_batch(&client, &batch, &date_dir, &sector_dirs).await?;
            processed += batch.len();

            logger::log_info(&format!("Processed {} LEIs so far...", processed)).await;
            batch.clear();

            // Yield to prevent blocking
            tokio::task::yield_now().await;
        }
    }

    // Process remaining
    if !batch.is_empty() {
        process_figi_batch(&client, &batch, &date_dir, &sector_dirs).await?;
        processed += batch.len();
    }

    logger::log_info(&format!("Total processed: {} LEIs", processed)).await;
    Ok(())
}

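Continuing the sketch above (again illustration only): the collected map can drive the batch processor directly. The batch size of 100 is an arbitrary choice; passing None makes determine_gleif_date fall back to the most recent cached GLEIF date.

// `lei_to_isins` as built in the previous sketch.
process_figi_mappings_streaming(lei_to_isins.into_iter(), None, 100).await?;
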
async fn process_figi_batch(
    client: &OpenFigiClient,
    batch: &[(String, Vec<String>)],
    date_dir: &Path,
    sector_dirs: &[String],
) -> anyhow::Result<()> {
    for (lei, isins) in batch {
        let unique_isins: Vec<_> = isins.iter()
            .cloned()
            .collect::<std::collections::HashSet<_>>()
            .into_iter()
            .collect();

        let figi_infos = client.map_isins_to_figi_infos(&unique_isins).await?;

        if figi_infos.is_empty() {
            continue;
        }

        // Save to appropriate sector files
        save_figi_infos_by_sector(lei, &figi_infos, date_dir, sector_dirs).await?;
    }

    Ok(())
}

async fn save_figi_infos_by_sector(
    lei: &str,
    figi_infos: &[FigiInfo],
    date_dir: &Path,
    _sector_dirs: &[String],
) -> anyhow::Result<()> {
    let mut by_sector: HashMap<String, Vec<FigiInfo>> = HashMap::new();

    for figi_info in figi_infos {
        let sector = if figi_info.market_sector.is_empty() {
            "uncategorized".to_string()
        } else {
            figi_info.market_sector.clone()
        };

        by_sector.entry(sector).or_default().push(figi_info.clone());
    }

    // Save to sector files
    for (sector, figis) in by_sector {
        let sector_dir = date_dir.join(&sector);
        let path = sector_dir.join("lei_to_figi.jsonl");
        append_lei_to_figi_jsonl(&path, lei, &figis).await?;
    }

    Ok(())
}

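append_lei_to_figi_jsonl is called above but is not part of this diff. A minimal sketch of what it must do, inferred from the JSONL parsing in load_or_build_all_securities_streaming below (one {"lei", "figis"} object per line) and assuming FigiInfo derives Serialize:

// Hypothetical sketch, not the actual implementation.
async fn append_lei_to_figi_jsonl_sketch(
    path: &Path,
    lei: &str,
    figis: &[FigiInfo],
) -> anyhow::Result<()> {
    // One compact JSON object per line keeps the file readable line-by-line.
    let line = json!({ "lei": lei, "figis": figis }).to_string();

    let mut file = tokio_fs::OpenOptions::new()
        .create(true)
        .append(true)
        .open(path)
        .await?;
    file.write_all(line.as_bytes()).await?;
    file.write_all(b"\n").await?;
    Ok(())
}
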
/// Modified load_or_build_all_securities to process in streaming fashion
pub async fn load_or_build_all_securities_streaming(
    date_dir: &Path,
) -> anyhow::Result<(
    HashMap<String, CompanyInfo>,
    HashMap<String, HashMap<String, WarrantInfo>>,
    HashMap<String, HashMap<String, OptionInfo>>
)> {
    // Load existing data
    let mut commons: HashMap<String, CompanyInfo> =
        load_from_cache("data/corporate/by_name/common_stocks.json")
            .await?
            .unwrap_or_default();
    let mut warrants: HashMap<String, HashMap<String, WarrantInfo>> =
        load_from_cache("data/corporate/by_name/warrants.json")
            .await?
            .unwrap_or_default();
    let mut options: HashMap<String, HashMap<String, OptionInfo>> =
        load_from_cache("data/corporate/by_name/options.json")
            .await?
            .unwrap_or_default();

    println!("Loaded existing data:");
    println!(" - Companies: {}", commons.len());
    println!(" - Warrants: {}", warrants.len());
    println!(" - Options: {}", options.len());

    let mut stats = ProcessingStats::new(commons.len(), warrants.len(), options.len());

    // Stream through JSONL files in date_dir
    let equity_file = date_dir.join("Equity").join("lei_to_figi.jsonl");

    if equity_file.exists() {
        logger::log_info(&format!("Streaming FIGIs from {:?}", equity_file)).await;

        let content = tokio_fs::read_to_string(&equity_file).await?;
        let mut processed = 0;

        for line in content.lines() {
            if line.trim().is_empty() {
                continue;
            }

            let entry: serde_json::Value = serde_json::from_str(line)?;
            let _lei = entry["lei"].as_str().unwrap_or("");
            let figi_infos: Vec<FigiInfo> = serde_json::from_value(
                entry["figis"].clone()
            )?;

            // Process this batch
            process_figi_infos_batch(
                &figi_infos,
                &mut commons,
                &mut warrants,
                &mut options,
                &mut stats
            );

            processed += 1;
            if processed % 100 == 0 {
                tokio::task::yield_now().await;
            }
        }
    }

    stats.print_summary(commons.len(), warrants.len(), options.len());

    // Save incrementally
    save_to_cache("data/corporate/by_name/common_stocks.json", &commons).await?;
    save_to_cache("data/corporate/by_name/warrants.json", &warrants).await?;
    save_to_cache("data/corporate/by_name/options.json", &options).await?;

    Ok((commons, warrants, options))
}

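A minimal call-site sketch (the date directory is a placeholder, not a value from this commit); each line of Equity/lei_to_figi.jsonl is expected to hold one {"lei": ..., "figis": [...]} object, matching the parsing above:

async fn rebuild_securities_for_date() -> anyhow::Result<()> {
    let dir = DataPaths::new(".")?;
    // Placeholder date; in practice this comes from determine_gleif_date.
    let date_dir = dir.cache_gleif_openfigi_map_dir().join("2024-01-01");

    let (commons, warrants, options) =
        load_or_build_all_securities_streaming(&date_dir).await?;

    println!(
        "rebuilt: {} companies, {} warrant issuers, {} option issuers",
        commons.len(), warrants.len(), options.len()
    );
    Ok(())
}
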
fn process_figi_infos_batch(
    figi_infos: &[FigiInfo],
    commons: &mut HashMap<String, CompanyInfo>,
    warrants: &mut HashMap<String, HashMap<String, WarrantInfo>>,
    options: &mut HashMap<String, HashMap<String, OptionInfo>>,
    stats: &mut ProcessingStats,
) {
    let mut common_stocks = Vec::new();
    let mut warrant_securities = Vec::new();
    let mut option_securities = Vec::new();

    for figi_info in figi_infos {
        match figi_info.security_type.as_str() {
            "Common Stock" => common_stocks.push(figi_info.clone()),
            "Equity WRT" => warrant_securities.push(figi_info.clone()),
            "Equity Option" => option_securities.push(figi_info.clone()),
            _ => {}
        }
    }

    if !common_stocks.is_empty() {
        process_common_stocks(commons, &common_stocks, stats);
    }

    if !warrant_securities.is_empty() {
        process_warrants(warrants, &warrant_securities, stats);
    }

    if !option_securities.is_empty() {
        process_options(options, &option_securities, stats);
    }
}

// Helper functions
async fn determine_gleif_date(
    gleif_date: Option<&str>,
    paths: &DataPaths,
) -> anyhow::Result<String> {
    if let Some(d) = gleif_date {
        Ok(d.to_string())
    } else {
        match find_most_recent_gleif_date(paths.cache_gleif_dir()).await? {
            Some(d) => Ok(d),
            None => Err(anyhow!("No GLEIF CSV file found")),
        }
    }
}

async fn setup_sector_directories(
    date_dir: &Path,
    sector_dirs: &[String],
) -> anyhow::Result<()> {
    // Create uncategorized folder
    let uncategorized_dir = date_dir.join("uncategorized");
    tokio_fs::create_dir_all(&uncategorized_dir).await?;

    // Create sector folders
    for sector in sector_dirs {
        let sector_dir = date_dir.join(sector);
        tokio_fs::create_dir_all(&sector_dir).await?;
    }

    Ok(())
}