This commit is contained in:
2026-01-12 01:01:19 +01:00
parent bd74f36f4c
commit 659757482d
13 changed files with 526 additions and 93 deletions

View File

@@ -102,13 +102,13 @@ pub async fn update_securities(date_dir: &Path) -> anyhow::Result<()> {
logger::log_info("Building securities data from FIGI mappings...").await;
let dir = DataPaths::new(".")?;
let state_path = dir.data_dir().join("state.jsonl");
let manager = StateManager::new(&dir.integrity_dir())?;
let manager = StateManager::new(&dir.integrity_dir()).await?;
let step_name = "securities_data_complete";
let data_dir = dir.data_dir();
let corporate_data_dir = data_dir.join("corporate");
let output_dir = corporate_data_dir.join("by_name");
let economic_data_dir = data_dir.join("economic");
let output_dir = data_dir.join("by_name");
tokio_fs::create_dir_all(&output_dir).await
.context("Failed to create corporate/by_name directory")?;
@@ -130,6 +130,10 @@ pub async fn update_securities(date_dir: &Path) -> anyhow::Result<()> {
let warrants_log = output_dir.join("warrants.log.jsonl");
let options_checkpoint = output_dir.join("options.jsonl");
let options_log = output_dir.join("options.log.jsonl");
let corporate_bonds_checkpoint = output_dir.join("corporate_bonds.jsonl");
let corporate_bonds_log = output_dir.join("corporate_bonds.log.jsonl");
let government_bonds_checkpoint = output_dir.join("government_bonds.jsonl");
let government_bonds_log = output_dir.join("government_bonds.log.jsonl");
// Track which sectors have been fully processed
let processed_sectors_file = output_dir.join("state.jsonl");
@@ -176,15 +180,19 @@ pub async fn update_securities(date_dir: &Path) -> anyhow::Result<()> {
let mut existing_companies = load_checkpoint_and_replay(&common_checkpoint, &common_log, "name").await?;
let mut existing_warrants = load_checkpoint_and_replay_nested(&warrants_checkpoint, &warrants_log).await?;
let mut existing_options = load_checkpoint_and_replay_nested(&options_checkpoint, &options_log).await?;
let mut existing_corporate_bonds = load_checkpoint_and_replay_nested(&corporate_bonds_checkpoint, &corporate_bonds_log).await?;
let mut existing_government_bonds = load_checkpoint_and_replay_nested(&government_bonds_checkpoint, &government_bonds_log).await?;
logger::log_info(&format!(" Existing entries - Companies: {}, Warrants: {}, Options: {}",
existing_companies.len(), existing_warrants.len(), existing_options.len())).await;
logger::log_info(&format!(" Existing entries - Companies: {}, Warrants: {}, Options: {}, Corporate Bonds: {}, Government Bonds: {}",
existing_companies.len(), existing_warrants.len(), existing_options.len(), existing_corporate_bonds.len(), existing_government_bonds.len())).await;
// Process statistics
let mut stats = StreamingStats::new(
existing_companies.len(),
existing_warrants.len(),
existing_options.len()
existing_options.len(),
existing_corporate_bonds.len(),
existing_government_bonds.len()
);
logger::log_info(&format!(" Found {} sectors to process", sectors_to_process.len())).await;
@@ -834,20 +842,29 @@ struct StreamingStats {
initial_companies: usize,
initial_warrants: usize,
initial_options: usize,
initial_corporate_bonds: usize,
initial_government_bonds: usize,
companies_added: usize,
warrants_added: usize,
options_added: usize,
corporate_bonds_added: usize,
government_bonds_added: usize,
}
impl StreamingStats {
fn new(companies: usize, warrants: usize, options: usize) -> Self {
fn new(companies: usize, warrants: usize, options: usize, corporate_bonds: usize, government_bonds: usize) -> Self {
Self {
initial_companies: companies,
initial_warrants: warrants,
initial_options: options,
initial_corporate_bonds: corporate_bonds,
initial_government_bonds: government_bonds,
companies_added: 0,
warrants_added: 0,
options_added: 0,
corporate_bonds_added: 0,
government_bonds_added: 0,
}
}
@@ -865,6 +882,14 @@ impl StreamingStats {
println!(" - Initial: {}", self.initial_options);
println!(" - Added: {}", self.options_added);
println!(" - Total: {}", self.initial_options + self.options_added);
println!("Corporate Bonds:");
println!(" - Initial: {}", self.initial_corporate_bonds);
println!(" - Added: {}", self.corporate_bonds_added);
println!(" - Total: {}", self.initial_corporate_bonds + self.corporate_bonds_added);
println!("Government Bonds:");
println!(" - Initial: {}", self.initial_government_bonds);
println!(" - Added: {}", self.government_bonds_added);
println!(" - Total: {}", self.initial_government_bonds + self.government_bonds_added);
}
}
@@ -1078,17 +1103,17 @@ async fn load_existing_mapped_leis(date_dir: &Path) -> anyhow::Result<HashSet<St
/// Read GLEIF CSV and return all LEIs (the file is read fully into memory, then scanned line by line)
async fn get_all_leis_from_gleif(csv_path: &str) -> anyhow::Result<HashSet<String>> {
let file = std::fs::File::open(csv_path)?;
let reader = BufReader::new(file);
let content = tokio::fs::read_to_string(csv_path)
.await
.context(format!("Failed to read GLEIF CSV file: {}", csv_path))?;
let mut all_leis = HashSet::new();
for (idx, line) in reader.lines().enumerate() {
for (idx, line) in content.lines().enumerate() {
if idx == 0 {
continue; // Skip header
}
let line = line?;
let parts: Vec<&str> = line.split(',').collect();
if parts.len() < 2 {
@@ -1147,8 +1172,9 @@ pub async fn stream_gleif_csv_and_build_figi_filtered(
) -> anyhow::Result<()> {
logger::log_info(&format!("Streaming GLEIF CSV: {}", csv_path)).await;
let file = std::fs::File::open(csv_path)?;
let reader = BufReader::new(file);
let content = tokio::fs::read_to_string(csv_path)
.await
.context(format!("Failed to read GLEIF CSV file: {}", csv_path))?;
let client = OpenFigiClient::new().await?;
if !client.has_key {
@@ -1171,9 +1197,7 @@ pub async fn stream_gleif_csv_and_build_figi_filtered(
let mut processed_leis = 0;
let mut skipped_leis = 0;
for (idx, line) in reader.lines().enumerate() {
let line = line?;
for (idx, line) in content.lines().enumerate() {
if idx == 0 { continue; }
let parts: Vec<&str> = line.split(',').collect();
@@ -1232,8 +1256,7 @@ pub async fn update_lei_mapping(
gleif_date: Option<&str>,
) -> anyhow::Result<bool> {
let dir = DataPaths::new(".")?;
let state_path = dir.cache_dir().join("state.jsonl");
let manager = StateManager::new(&dir.integrity_dir())?;
let manager = StateManager::new(&dir.integrity_dir()).await?;
let step_name = "lei_figi_mapping_complete";
let map_cache_dir = dir.cache_gleif_openfigi_map_dir();
@@ -1251,7 +1274,7 @@ pub async fn update_lei_mapping(
if unmapped.is_empty() {
logger::log_info("✓ All LEIs have been queried (mapped or confirmed no results)").await;
track_lei_mapping_completion(&manager, &date_dir).await?;
track_lei_mapping_completion(&manager, &dir.integrity_dir()).await?;
logger::log_info(" ✓ LEI-FIGI mapping marked as complete with integrity tracking").await;
return Ok(true);