Moved data capturing into the cache folder

This commit is contained in:
2025-12-05 22:32:42 +01:00
parent 58a498e694
commit a6823dc938
6 changed files with 230 additions and 80 deletions

View File

@@ -1,15 +1,18 @@
use crate::util::directories::DataPaths;
// src/corporate/openfigi.rs
use super::{types::*};
use reqwest::Client as HttpClient;
use reqwest::header::{HeaderMap, HeaderValue};
use serde_json::{json, Value};
use std::collections::{HashMap, HashSet};
use std::fs::{File, OpenOptions};
use std::io::{BufRead, BufReader, Write};
use std::path::Path;
use std::fs::File;
use std::io::Write;
use std::path::{Path, PathBuf};
use std::time::Instant;
use tokio::time::{sleep, Duration};
use tokio::fs as tokio_fs;
use tokio::io::AsyncWriteExt;
use anyhow::{Context, anyhow};
#[derive(Clone)]
@@ -103,7 +106,7 @@ impl OpenFigiClient {
.map(|isin| json!({
"idType": "ID_ISIN",
"idValue": isin,
"marketSecDes": "Equity",
//"marketSecDes": "Equity",
}))
.collect();
@@ -195,9 +198,69 @@ impl OpenFigiClient {
}
}
/// Extracts the date from a GLEIF CSV filename in the format "isin-lei-DDMMYYYY.csv".
///
/// # Arguments
///
/// * `filename` - The GLEIF CSV filename (e.g., "isin-lei-24112025.csv")
///
/// # Returns
///
/// A string in the format "DDMMYYYY" (e.g., "24112025") if successfully parsed, otherwise the original filename.
fn extract_gleif_date_from_filename(filename: &str) -> String {
    // Pattern: isin-lei-DDMMYYYY.csv
    if let Some(start_idx) = filename.find("isin-lei-") {
        let rest = &filename[start_idx + "isin-lei-".len()..];
        // Use `get` instead of `&rest[0..8]`: indexing by bytes panics when a
        // multi-byte character straddles the byte-8 boundary, whereas `get`
        // returns None and we fall through to the original filename.
        if let Some(date) = rest.get(0..8) {
            // Require ASCII digits specifically; `char::is_numeric` also
            // accepts non-ASCII Unicode digits, which are never valid here.
            if date.bytes().all(|b| b.is_ascii_digit()) {
                return date.to_string();
            }
        }
    }
    // Not in the expected format: return the input unchanged.
    filename.to_string()
}
/// Finds the most recent GLEIF CSV file in the cache/gleif directory.
///
/// Returns the extracted date in format "DDMMYYYY" from the filename.
/// If no GLEIF file is found, returns None.
async fn find_most_recent_gleif_date(gleif_cache_dir: &Path) -> anyhow::Result<Option<String>> {
    let mut entries = tokio_fs::read_dir(gleif_cache_dir)
        .await
        .context("Failed to read gleif cache directory")?;

    // Collect every candidate GLEIF CSV filename in the cache directory.
    let mut csv_files = Vec::new();
    while let Some(entry) = entries.next_entry().await? {
        let path = entry.path();
        if let Some(filename) = path.file_name() {
            let filename_str = filename.to_string_lossy();
            if filename_str.ends_with(".csv") && filename_str.contains("isin-lei-") {
                csv_files.push(filename_str.to_string());
            }
        }
    }

    if csv_files.is_empty() {
        return Ok(None);
    }

    // BUG FIX: filenames embed the date as DDMMYYYY, which does NOT sort
    // lexicographically in date order (a plain sort ranks files by day of
    // month first). Compare on a rearranged YYYYMMDD key so "most recent"
    // really means the latest date; malformed names sort last (empty key).
    let most_recent = csv_files
        .iter()
        .max_by_key(|f| {
            let d = extract_gleif_date_from_filename(f);
            if d.len() == 8 && d.bytes().all(|b| b.is_ascii_digit()) {
                format!("{}{}{}", &d[4..8], &d[2..4], &d[0..2]) // YYYY MM DD
            } else {
                String::new()
            }
        })
        .expect("csv_files is non-empty (checked above)");

    let date = extract_gleif_date_from_filename(most_recent);
    println!(" Found GLEIF data dated: {}", date);
    Ok(Some(date))
}
/// Builds a LEI-to-FigiInfo map from the LEI-ISIN mapping, filtering for equities via OpenFIGI.
///
/// (Previously, entries were stored at a fixed path "data/corporate/by_lei/lei_to_figi.jsonl".)
/// The mapping is stored in cache/glei_openfigi/{GLEIF_DATE}/lei_to_figi.jsonl, where GLEIF_DATE
/// is extracted from the GLEIF CSV filename (format: DDMMYYYY). If no specific date is provided,
/// the most recent GLEIF file in cache/gleif is used.
///
/// Attempts to load existing entries from the date-based jsonl file (JSON Lines format,
/// one LEI entry per line: {"lei": "ABC", "figis": [FigiInfo...]}). For any missing LEIs (compared to
/// `lei_to_isins`), fetches their FigiInfos and appends to the .jsonl file incrementally.
///
@@ -209,6 +272,7 @@ impl OpenFigiClient {
/// # Arguments
///
/// * `lei_to_isins` - HashMap of LEI to Vec<ISIN> (used for fetching missing entries).
/// * `gleif_date` - Optional date in format "DDMMYYYY". If None, uses the most recent GLEIF file.
///
/// # Returns
///
@@ -218,12 +282,29 @@ impl OpenFigiClient {
///
/// Returns an error if file I/O fails, JSON serialization/deserialization fails,
/// or if OpenFIGI queries fail during fetching.
pub async fn build_lei_to_figi_infos(lei_to_isins: &HashMap<String, Vec<String>>) -> anyhow::Result<HashMap<String, Vec<FigiInfo>>> {
let data_dir = Path::new("data/corporate/by_lei");
tokio_fs::create_dir_all(data_dir).await.context("Failed to create data directory")?;
let path = data_dir.join("lei_to_figi.jsonl");
let mut lei_to_figis: HashMap<String, Vec<FigiInfo>> = load_lei_to_figi_jsonl(&path)?;
pub async fn build_lei_to_figi_infos(lei_to_isins: &HashMap<String, Vec<String>>, gleif_date: Option<&str>) -> anyhow::Result<HashMap<String, Vec<FigiInfo>>> {
let dir = DataPaths::new(".")?;
let gleif_cache_dir = dir.cache_gleif_dir();
let map_cache_dir = dir.cache_gleif_openfigi_map_dir();
// Determine the GLEIF date to use
let date = if let Some(d) = gleif_date {
d.to_string()
} else {
// Find the most recent GLEIF file
match find_most_recent_gleif_date(gleif_cache_dir).await? {
Some(d) => d,
None => return Err(anyhow!("No GLEIF CSV file found in cache/gleif directory")),
}
};
// Create date-based subdirectory in the mapping cache
let date_dir = map_cache_dir.join(&date);
tokio_fs::create_dir_all(&date_dir).await.context("Failed to create date directory")?;
let path = date_dir.join("lei_to_figi.jsonl");
println!("Using LEI→FIGI mapping at: {}", path.display());
let mut lei_to_figis: HashMap<String, Vec<FigiInfo>> = load_lei_to_figi_jsonl(&path).await?;
let client = OpenFigiClient::new()?;
if !client.has_key {
@@ -258,7 +339,7 @@ pub async fn build_lei_to_figi_infos(lei_to_isins: &HashMap<String, Vec<String>>
}
// Append to .jsonl incrementally
append_lei_to_figi_jsonl(&path, &lei, &figis).context("Failed to append to JSONL")?;
append_lei_to_figi_jsonl(&path, &lei, &figis).await.context("Failed to append to JSONL")?;
// Insert into in-memory map
lei_to_figis.insert(lei.clone(), figis);
@@ -290,18 +371,16 @@ pub async fn build_lei_to_figi_infos(lei_to_isins: &HashMap<String, Vec<String>>
/// # Errors
///
/// Returns an error if the file cannot be opened or if any line fails to parse as JSON.
fn load_lei_to_figi_jsonl(path: &Path) -> anyhow::Result<HashMap<String, Vec<FigiInfo>>> {
async fn load_lei_to_figi_jsonl(path: &Path) -> anyhow::Result<HashMap<String, Vec<FigiInfo>>> {
let mut map = HashMap::new();
if !path.exists() {
return Ok(map);
}
let file = File::open(path).context("Failed to open JSONL file for reading")?;
let reader = BufReader::new(file);
let content = tokio_fs::read_to_string(path).await.context("Failed to read JSONL file")?;
for (line_num, line) in reader.lines().enumerate() {
let line = line.context(format!("Failed to read line {}", line_num + 1))?;
for (line_num, line) in content.lines().enumerate() {
if line.trim().is_empty() {
continue;
}
@@ -328,20 +407,24 @@ fn load_lei_to_figi_jsonl(path: &Path) -> anyhow::Result<HashMap<String, Vec<Fig
/// # Errors
///
/// Returns an error if the file cannot be opened for append or if serialization fails.
fn append_lei_to_figi_jsonl(path: &Path, lei: &str, figis: &[FigiInfo]) -> anyhow::Result<()> {
let mut file = OpenOptions::new()
.create(true)
.append(true)
.open(path)
.context("Failed to open JSONL file for append")?;
async fn append_lei_to_figi_jsonl(path: &Path, lei: &str, figis: &[FigiInfo]) -> anyhow::Result<()> {
let entry = json!({
"lei": lei,
"figis": figis,
});
let line = serde_json::to_string(&entry).context("Failed to serialize entry")? + "\n";
file.write_all(line.as_bytes()).context("Failed to write to JSONL file")?;
let mut file = tokio_fs::OpenOptions::new()
.create(true)
.append(true)
.open(path)
.await
.context("Failed to open JSONL file for append")?;
file.write_all(line.as_bytes())
.await
.context("Failed to write to JSONL file")?;
Ok(())
}
@@ -373,19 +456,22 @@ pub async fn load_or_build_all_securities(
HashMap<String, HashMap<String, OptionInfo>>
)> {
// Load existing data
let mut companies = load_from_cache("data/corporate/by_name/common_stocks.json").await?
let mut commons = load_from_cache("data/corporate/by_name/common_stocks.json").await?
.unwrap_or_else(HashMap::new);
let mut warrants = load_from_cache("data/corporate/by_name/warrants.json").await?
.unwrap_or_else(HashMap::new);
let mut options = load_from_cache("data/corporate/by_name/options.json").await?
.unwrap_or_else(HashMap::new);
/*let mut preferred = load_from_cache("data/corporate/by_name/preferred.json").await?
.unwrap_or_else(HashMap::new);*/
println!("Loaded existing data:");
println!(" - Companies: {}", companies.len());
println!(" - Companies: {}", commons.len());
println!(" - Warrants: {}", warrants.len());
println!(" - Options: {}", options.len());
let mut stats = ProcessingStats::new(companies.len(), warrants.len(), options.len());
let mut stats = ProcessingStats::new(commons.len(), warrants.len(), options.len());
println!("Processing {} LEI entries from FIGI data...", figi_to_lei.len());
@@ -410,7 +496,7 @@ pub async fn load_or_build_all_securities(
// Process common stocks -> companies
if !common_stocks.is_empty() {
process_common_stocks(&mut companies, &common_stocks, &mut stats);
process_common_stocks(&mut commons, &common_stocks, &mut stats);
}
// Process warrants
@@ -424,14 +510,14 @@ pub async fn load_or_build_all_securities(
}
}
stats.print_summary(companies.len(), warrants.len(), options.len());
stats.print_summary(commons.len(), warrants.len(), options.len());
// Save all three HashMaps
save_to_cache("data/corporate/by_name/common_stocks.json", &companies).await?;
save_to_cache("data/corporate/by_name/common_stocks.json", &commons).await?;
save_to_cache("data/corporate/by_name/warrants.json", &warrants).await?;
save_to_cache("data/corporate/by_name/options.json", &options).await?;
Ok((companies, warrants, options))
Ok((commons, warrants, options))
}
/// Statistics tracker for processing
@@ -795,7 +881,8 @@ pub async fn load_figi_type_lists() -> anyhow::Result<()> {
let client = OpenFigiClient::new()?;
// Create cache directory
let cache_dir = Path::new("data/openfigi");
let dir = DataPaths::new(".")?;
let cache_dir = dir.cache_openfigi_dir();
tokio_fs::create_dir_all(cache_dir).await
.context("Failed to create data/openfigi directory")?;