commented unused function

This commit is contained in:
2025-12-09 16:56:45 +01:00
parent c00bfd8687
commit f95e9e2427
4 changed files with 308 additions and 181 deletions


@@ -6,6 +6,8 @@ use super::{types::*};
use reqwest::Client as HttpClient;
use reqwest::header::{HeaderMap, HeaderValue};
use serde_json::{json, Value};
+use csv::{ReaderBuilder, StringRecord, WriterBuilder};
+use chrono::NaiveDate;
use std::collections::{HashMap, HashSet};
use std::path::{Path};
use std::time::Instant;
@@ -203,12 +205,12 @@ impl OpenFigiClient {
name: item["name"].as_str().unwrap_or("").to_string(),
ticker: item["ticker"].as_str().unwrap_or("").to_string(),
exch_code: item["exchCode"].as_str().unwrap_or("").to_string(),
-compositeFIGI: item["compositeFIGI"].as_str().unwrap_or("").to_string(),
-securityType: sec_type.to_string(),
-marketSector: market_sec.to_string(),
-shareClassFIGI: item["shareClassFIGI"].as_str().unwrap_or("").to_string(),
-securityType2: item["securityType2"].as_str().unwrap_or("").to_string(),
-securityDescription: item["securityDescription"].as_str().unwrap_or("").to_string(),
+composite_figi: item["compositeFIGI"].as_str().unwrap_or("").to_string(),
+security_type: sec_type.to_string(),
+market_sector: market_sec.to_string(),
+share_class_figi: item["shareClassFIGI"].as_str().unwrap_or("").to_string(),
+security_type2: item["securityType2"].as_str().unwrap_or("").to_string(),
+security_description: item["securityDescription"].as_str().unwrap_or("").to_string(),
};
all_figi_infos.push(figi_info);
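
The hunk above renames the FigiInfo fields from camelCase to snake_case while the camelCase OpenFIGI JSON keys ("compositeFIGI", "securityType2", ...) are still read by hand through item[...]. As an aside, not part of this commit, serde's rename attributes could express the same key-to-field mapping declaratively; the struct below is a hypothetical sketch, with container-level default mirroring the unwrap_or("") fallbacks:

use serde::Deserialize;

#[derive(Debug, Default, Deserialize)]
#[serde(rename_all = "camelCase", default)]
struct FigiRecord {
    name: String,
    ticker: String,
    exch_code: String,                 // JSON key "exchCode"
    #[serde(rename = "compositeFIGI")] // all-caps "FIGI" needs an explicit rename
    composite_figi: String,
    #[serde(rename = "shareClassFIGI")]
    share_class_figi: String,
    security_type2: String,            // JSON key "securityType2"
    security_description: String,      // JSON key "securityDescription"
}

fn main() -> serde_json::Result<()> {
    let rec: FigiRecord = serde_json::from_str(
        r#"{"name":"Example Corp","ticker":"EXMP","exchCode":"US","compositeFIGI":"BBG000000000"}"#,
    )?;
    assert_eq!(rec.composite_figi, "BBG000000000");
    assert_eq!(rec.share_class_figi, ""); // missing keys fall back to Default, like unwrap_or("")
    Ok(())
}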
@@ -318,12 +320,43 @@ async fn load_market_sectors() -> anyhow::Result<Vec<String>> {
/// Returns the extracted date in format "DDMMYYYY" from the filename.
/// If no GLEIF file is found, returns None.
async fn find_most_recent_gleif_date(gleif_cache_dir: &Path) -> anyhow::Result<Option<String>> {
+// First check for subdirectories named as DDMMYYYY and pick the most recent date
+let mut dir_entries = tokio_fs::read_dir(gleif_cache_dir)
+.await
+.context("Failed to read gleif cache directory")?;
+let mut found_dates: Vec<NaiveDate> = Vec::new();
+while let Some(entry) = dir_entries.next_entry().await? {
+let path = entry.path();
+if path.is_dir() {
+if let Some(name) = path.file_name().and_then(|n| n.to_str()) {
+// Expect folder name in DDMMYYYY
+if name.len() == 8 && name.chars().all(|c| c.is_numeric()) {
+if let Ok(nd) = NaiveDate::parse_from_str(name, "%d%m%Y") {
+found_dates.push(nd);
+}
+}
+}
+}
+}
+if !found_dates.is_empty() {
+found_dates.sort();
+if let Some(most_recent) = found_dates.last() {
+let date_str = most_recent.format("%d%m%Y").to_string();
+let msg = format!(" Found GLEIF data dated (from subdirs): {}", date_str);
+logger::log_info(&msg).await;
+return Ok(Some(date_str));
+}
+}
+// Fallback: look for CSV files in the directory as before
let mut entries = tokio_fs::read_dir(gleif_cache_dir)
.await
.context("Failed to read gleif cache directory")?;
let mut csv_files = Vec::new();
while let Some(entry) = entries.next_entry().await? {
let path = entry.path();
if let Some(filename) = path.file_name() {
@@ -333,20 +366,20 @@ async fn find_most_recent_gleif_date(gleif_cache_dir: &Path) -> anyhow::Result<O
}
}
}
if csv_files.is_empty() {
return Ok(None);
}
// Sort files in reverse order (most recent first) based on date in filename
csv_files.sort();
csv_files.reverse();
let most_recent = &csv_files[0];
let date = extract_gleif_date_from_filename(most_recent);
let msg = format!(" Found GLEIF data dated: {}", date);
logger::log_info(&msg).await;
Ok(Some(date))
}
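
The parse-sort-format round trip added above is what makes the "most recent" pick correct: DDMMYYYY strings do not order chronologically as text (lexically "01012026" sorts before "24112025" even though it is a later date), so the code goes through chrono::NaiveDate. A minimal standalone sketch of that selection step, using a hypothetical helper name:

use chrono::NaiveDate;

fn most_recent_ddmmyyyy(names: &[&str]) -> Option<String> {
    let mut dates: Vec<NaiveDate> = names
        .iter()
        .filter_map(|n| NaiveDate::parse_from_str(n, "%d%m%Y").ok()) // non-dates are skipped
        .collect();
    dates.sort();
    dates.last().map(|d| d.format("%d%m%Y").to_string())
}

fn main() {
    // "01012026" is the newer date but sorts first as a string.
    let picked = most_recent_ddmmyyyy(&["24112025", "01012026", "not-a-date"]);
    assert_eq!(picked.as_deref(), Some("01012026"));
}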
@@ -434,7 +467,7 @@ async fn build_lei_to_figi_infos_internal(lei_to_isins: &HashMap<String, Vec<Str
let dir = DataPaths::new(".")?;
let gleif_cache_dir = dir.cache_gleif_dir();
let map_cache_dir = dir.cache_gleif_openfigi_map_dir();
// Determine the GLEIF date to use
let date = if let Some(d) = gleif_date {
let msg = format!("Using provided GLEIF date: {}", d);
@@ -443,7 +476,7 @@ async fn build_lei_to_figi_infos_internal(lei_to_isins: &HashMap<String, Vec<Str
} else {
// Find the most recent GLEIF file
logger::log_info("Searching for most recent GLEIF file...").await;
-match find_most_recent_gleif_date(gleif_cache_dir).await? {
+match find_most_recent_gleif_date(&gleif_cache_dir).await? {
Some(d) => d,
None => {
let err = "No GLEIF CSV file found in cache/gleif directory";
@@ -452,7 +485,10 @@ async fn build_lei_to_figi_infos_internal(lei_to_isins: &HashMap<String, Vec<Str
},
}
};
+// Create date-based subdirectory in the gleif cache
+let gleif_date_dir = gleif_cache_dir.join(&date);
// Create date-based subdirectory in the mapping cache
let msg = format!("Creating date directory for: {}", date);
logger::log_info(&msg).await;
@@ -500,6 +536,7 @@ async fn build_lei_to_figi_infos_internal(lei_to_isins: &HashMap<String, Vec<Str
let mut processed = sector_maps.values().map(|m| m.len()).sum::<usize>();
let total = leis.len();
let mut no_hit_leis = Vec::new(); // Track LEIs with no data found (no_hit)
+let mut leis_to_delete_batch = Vec::new(); // Batch delete every 100 LEIs
let msg = format!("Total LEIs to process: {}, already processed: {}", total, processed);
@@ -535,12 +572,18 @@ async fn build_lei_to_figi_infos_internal(lei_to_isins: &HashMap<String, Vec<Str
let no_hit_msg = format!(" no_hit: LEI {} returned no FIGIs", lei);
logger::log_warn(&no_hit_msg).await;
no_hit_leis.push(lei.clone());
+leis_to_delete_batch.push(lei.clone());
-// Remove immediately from GLEIF CSV to prevent progress loss on interrupt
-if let Err(e) = remove_lei_from_gleif_csv_single(gleif_cache_dir, &lei).await {
-let warn_msg = format!("Warning: Failed to remove LEI {} from GLEIF CSV: {}", lei, e);
-eprintln!("{}", warn_msg);
-logger::log_warn(&warn_msg).await;
+// Delete every 100 no_hit LEIs to prevent progress loss on interrupt
+if leis_to_delete_batch.len() >= 100 {
+let batch_msg = format!("Batch deleting {} LEIs from GLEIF CSV...", leis_to_delete_batch.len());
+logger::log_info(&batch_msg).await;
+if let Err(e) = remove_leis_batch_from_gleif_csv(&gleif_date_dir, &leis_to_delete_batch).await {
+let warn_msg = format!("Warning: Failed to batch remove LEIs from GLEIF CSV: {}", e);
+eprintln!("{}", warn_msg);
+logger::log_warn(&warn_msg).await;
+}
+leis_to_delete_batch.clear();
+}
continue;
@@ -554,7 +597,7 @@ async fn build_lei_to_figi_infos_internal(lei_to_isins: &HashMap<String, Vec<Str
let mut uncategorized_figis = Vec::new();
for figi_info in all_figi_infos {
-let sector = figi_info.marketSector.clone();
+let sector = figi_info.market_sector.clone();
if sector.is_empty() {
// Case 2: Hit but no marketSecDes - save to uncategorized
@@ -610,9 +653,20 @@ async fn build_lei_to_figi_infos_internal(lei_to_isins: &HashMap<String, Vec<Str
tokio::time::sleep(Duration::from_millis(100)).await;
}
+// Delete any remaining LEIs in the batch
+if !leis_to_delete_batch.is_empty() {
+let batch_msg = format!("Final batch: Deleting {} LEIs from GLEIF CSV...", leis_to_delete_batch.len());
+logger::log_info(&batch_msg).await;
+if let Err(e) = remove_leis_batch_from_gleif_csv(gleif_cache_dir, &leis_to_delete_batch).await {
+let warn_msg = format!("Warning: Failed to delete final batch from GLEIF CSV: {}", e);
+eprintln!("{}", warn_msg);
+logger::log_warn(&warn_msg).await;
+}
+}
// Log final summary for no_hit LEIs (they've already been removed incrementally)
if !no_hit_leis.is_empty() {
-let no_hit_summary = format!("no_hit (removed incrementally from GLEIF): {} LEIs", no_hit_leis.len());
+let no_hit_summary = format!("no_hit (removed in batches from GLEIF): {} LEIs", no_hit_leis.len());
println!("{}", no_hit_summary);
logger::log_info(&no_hit_summary).await;
}
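
The change above replaces per-LEI CSV rewrites with a size-threshold flush: no_hit LEIs accumulate in leis_to_delete_batch, the batch is written out once it reaches 100 entries so an interrupt loses at most one batch of progress, and a final drain after the loop handles the remainder. A minimal sketch of the pattern, with a hypothetical flush callback standing in for remove_leis_batch_from_gleif_csv:

fn process_with_batched_flush<F: FnMut(&[String])>(items: Vec<String>, mut flush: F) {
    const BATCH_SIZE: usize = 100;
    let mut pending: Vec<String> = Vec::new();
    for item in items {
        pending.push(item);
        if pending.len() >= BATCH_SIZE {
            flush(&pending); // bounds the work lost on interrupt
            pending.clear();
        }
    }
    if !pending.is_empty() {
        flush(&pending); // final drain, mirroring the post-loop batch above
    }
}

fn main() {
    let items: Vec<String> = (0..250).map(|i| format!("LEI{i:017}")).collect();
    let mut flushes = 0;
    process_with_batched_flush(items, |batch| {
        flushes += 1;
        assert!(batch.len() <= 100);
    });
    assert_eq!(flushes, 3); // 100 + 100 + 50
}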
@@ -696,81 +750,10 @@ async fn append_lei_to_figi_jsonl(path: &Path, lei: &str, figis: &[FigiInfo]) ->
Ok(())
}
-/// Removes a single invalid LEI from the GLEIF CSV file immediately.
+/// Removes multiple invalid LEIs from the GLEIF CSV file in a single batch operation.
///
-/// This function is called after each no_hit detection to prevent progress loss on interrupt.
-/// It reads the GLEIF CSV, filters out the specific LEI, and overwrites the file.
-///
-/// # Arguments
-///
-/// * `gleif_cache_dir` - Path to the cache/gleif directory
-/// * `lei` - The LEI string to remove
-///
-/// # Returns
-/// Ok(()) if successful, Err if file operations fail.
-async fn remove_lei_from_gleif_csv_single(gleif_cache_dir: &Path, lei: &str) -> anyhow::Result<()> {
-// Find the most recent GLEIF CSV file
-let mut entries = tokio_fs::read_dir(gleif_cache_dir)
-.await
-.context("Failed to read gleif cache directory")?;
-let mut csv_files = Vec::new();
-while let Some(entry) = entries.next_entry().await? {
-let path = entry.path();
-if let Some(filename) = path.file_name() {
-let filename_str = filename.to_string_lossy();
-if filename_str.ends_with(".csv") && filename_str.contains("isin-lei-") {
-csv_files.push(path);
-}
-}
-}
-if csv_files.is_empty() {
-return Ok(());
-}
-// Sort and get the most recent
-csv_files.sort();
-csv_files.reverse();
-let gleif_file = &csv_files[0];
-// Read the CSV file
-let content = tokio_fs::read_to_string(gleif_file)
-.await
-.context("Failed to read GLEIF CSV")?;
-// Filter out line with this LEI
-let filtered_lines: Vec<&str> = content
-.lines()
-.filter(|line| {
-// GLEIF CSV format: ISIN,LEI
-let parts: Vec<&str> = line.split(',').collect();
-if parts.len() >= 2 {
-parts[1] != lei
-} else {
-true // Keep lines that don't match the format (e.g., header)
-}
-})
-.collect();
-// Only write if something was actually removed
-if filtered_lines.len() < content.lines().count() {
-let new_content = filtered_lines.join("\n") + "\n";
-tokio_fs::write(gleif_file, new_content)
-.await
-.context("Failed to write filtered GLEIF CSV")?;
-}
-Ok(())
-}
-/// Removes invalid LEIs from the GLEIF CSV file.
///
/// When an API call succeeds but returns no data (no_hit), the LEI is considered invalid
/// and should be removed from the GLEIF CSV to prevent re-scraping on future runs.
///
-/// This function reads the GLEIF CSV, filters out the specified LEIs, and overwrites the file.
+/// This function is more efficient than removing LEIs one at a time.
+/// It reads the GLEIF CSV once, filters out all specified LEIs, and overwrites the file once.
///
/// # Arguments
///
@@ -779,8 +762,10 @@ async fn remove_lei_from_gleif_csv_single(gleif_cache_dir: &Path, lei: &str) ->
///
/// # Returns
/// Ok(()) if successful, Err if file operations fail.
-async fn remove_leis_from_gleif_csv(gleif_cache_dir: &Path, leis_to_remove: &[String]) -> anyhow::Result<()> {
-logger::log_info(&format!("Removing {} invalid LEIs from GLEIF CSV...", leis_to_remove.len())).await;
+async fn remove_leis_batch_from_gleif_csv(gleif_cache_dir: &Path, leis_to_remove: &[String]) -> anyhow::Result<()> {
+if leis_to_remove.is_empty() {
+return Ok(());
+}
// Find the most recent GLEIF CSV file
let mut entries = tokio_fs::read_dir(gleif_cache_dir)
@@ -788,7 +773,7 @@ async fn remove_leis_from_gleif_csv(gleif_cache_dir: &Path, leis_to_remove: &[St
.context("Failed to read gleif cache directory")?;
let mut csv_files = Vec::new();
while let Some(entry) = entries.next_entry().await? {
let path = entry.path();
if let Some(filename) = path.file_name() {
@@ -800,53 +785,151 @@ async fn remove_leis_from_gleif_csv(gleif_cache_dir: &Path, leis_to_remove: &[St
}
if csv_files.is_empty() {
logger::log_warn("No GLEIF CSV files found for removal operation").await;
logger::log_warn("No GLEIF CSV files found for batch removal operation").await;
return Ok(());
}
-// Sort and get the most recent
+// Prefer an original (non-_clean) GLEIF CSV if available; otherwise use the most recent file.
csv_files.sort();
csv_files.reverse();
-let gleif_file = &csv_files[0];
-let debug_msg = format!("Reading GLEIF file: {}", gleif_file.display());
+let mut gleif_file: &std::path::PathBuf = &csv_files[0];
+// Try to find the most recent filename that does NOT end with "_clean.csv"
+if let Some(non_clean) = csv_files.iter().find(|p| {
+p.file_name()
+.and_then(|n| n.to_str())
+.map(|s| !s.to_lowercase().ends_with("_clean.csv"))
+.unwrap_or(false)
+}) {
+gleif_file = non_clean;
+}
+// Prepare clean file path: insert "_clean" before extension
+let orig_path = gleif_file;
+let file_name = orig_path.file_name().and_then(|n| n.to_str()).unwrap_or("gleif.csv");
+let mut stem = orig_path.file_stem().and_then(|s| s.to_str()).unwrap_or("isin-lei").to_string();
+let parent = orig_path.parent().unwrap_or_else(|| Path::new("."));
+// Avoid creating a double "_clean_clean.csv". If stem already ends with "_clean", keep it.
+if stem.to_lowercase().ends_with("_clean") {
+// stem is already clean; keep same filename (no double suffix)
+// e.g., stem="isin-lei-24112025_clean" -> clean_name="isin-lei-24112025_clean.csv"
+} else {
+stem = format!("{}_clean", stem);
+}
+let clean_name = format!("{}.csv", stem);
+let clean_path = parent.join(&clean_name);
+// If a clean file already exists, operate on it; otherwise read original and write clean file
+let source_path = if clean_path.exists() { &clean_path } else { orig_path };
+let debug_msg = format!("Reading GLEIF source for batch removal: {} (writing to {})", source_path.display(), clean_path.display());
logger::log_info(&debug_msg).await;
-// Read the CSV file
-let content = tokio_fs::read_to_string(gleif_file)
-.await
-.context("Failed to read GLEIF CSV")?;
-let original_lines = content.lines().count();
-// Convert LEIs to remove into a HashSet for faster lookup
-let remove_set: std::collections::HashSet<_> = leis_to_remove.iter().cloned().collect();
-// Filter out lines with LEIs to remove
-let filtered_lines: Vec<&str> = content
-.lines()
-.filter(|line| {
-// GLEIF CSV format: ISIN,LEI
-let parts: Vec<&str> = line.split(',').collect();
-if parts.len() >= 2 {
-!remove_set.contains(parts[1])
-} else {
-true // Keep lines that don't match the format (e.g., header)
+// Cleanup any accidental double-clean files in the same directory: if a file ends with
+// "_clean_clean.csv" replace it with single "_clean.csv" or remove it if target exists.
+if let Ok(mut dir_entries) = tokio_fs::read_dir(parent).await {
+while let Ok(Some(entry)) = dir_entries.next_entry().await {
+if let Some(name) = entry.file_name().to_str().map(|s| s.to_string()) {
+if name.to_lowercase().ends_with("_clean_clean.csv") {
+let offending = entry.path();
+let candidate = offending.file_name().and_then(|n| n.to_str()).unwrap_or("");
+let target_name = candidate.replacen("_clean_clean.csv", "_clean.csv", 1);
+let target_path = parent.join(target_name);
+if !target_path.exists() {
+// Rename offending -> target
+let _ = tokio_fs::rename(&offending, &target_path).await;
+let msg = format!("Renamed {} -> {}", offending.display(), target_path.display());
+logger::log_info(&msg).await;
+} else {
+// Target exists already; remove offending duplicate
+let _ = tokio_fs::remove_file(&offending).await;
+let msg = format!("Removed duplicate {}", offending.display());
+logger::log_info(&msg).await;
+}
+}
+}
-})
-.collect();
-let removed_count = original_lines - filtered_lines.len();
-// Write back the filtered content
-let new_content = filtered_lines.join("\n") + "\n";
-tokio_fs::write(gleif_file, new_content)
+}
+}
+// Read file into memory and parse with csv crate for robust handling of quoted fields
+let content = tokio_fs::read_to_string(source_path)
.await
-.context("Failed to write filtered GLEIF CSV")?;
-let success_msg = format!("✓ Removed {} invalid LEIs from GLEIF CSV (was {} lines, now {} lines)", leis_to_remove.len(), original_lines, filtered_lines.len());
+.context("Failed to read GLEIF CSV source")?;
+// Convert LEIs to remove into a HashSet (normalized)
+let remove_set: std::collections::HashSet<String> = leis_to_remove
+.iter()
+.map(|s| s.trim().trim_matches('"').to_uppercase())
+.collect();
+// Build CSV reader: try with headers first; allow flexible records
+let mut reader = ReaderBuilder::new()
+.has_headers(true)
+.flexible(true)
+.from_reader(content.as_bytes());
+// Remember headers (if present) and then iterate records.
+let headers_record = match reader.headers() {
+Ok(h) => Some(h.clone()),
+Err(_) => None,
+};
+// We'll collect kept records and count original rows
+let mut kept_records: Vec<StringRecord> = Vec::new();
+let mut original_count: usize = 0;
+let mut removed_count: usize = 0;
+// For robustness, search all columns for a matching LEI instead of relying on a single column index.
+for result in reader.records() {
+let record = result.context("Failed to parse CSV record")?;
+original_count += 1;
+// Check every field for a match in the remove set
+let mut matched = false;
+for field in record.iter() {
+let norm = field.trim().trim_matches('"').to_uppercase();
+if remove_set.contains(&norm) {
+matched = true;
+break;
+}
+}
+if matched {
+removed_count += 1;
+} else {
+kept_records.push(record.clone());
+}
+}
+let new_count = kept_records.len();
+// Write back using csv writer to preserve quoting/format into clean file
+let mut wtr = WriterBuilder::new().has_headers(true).from_writer(vec![]);
+// If original had headers, write them back
+if let Some(headers) = headers_record {
+wtr.write_record(headers.iter())?;
+}
+for rec in &kept_records {
+wtr.write_record(rec.iter())?;
+}
+let out_bytes = wtr.into_inner().context("Failed to finalize CSV writer")?;
+let out_str = String::from_utf8(out_bytes).context("CSV output not valid UTF-8")?;
+tokio_fs::write(&clean_path, out_str)
+.await
+.context("Failed to write filtered GLEIF CSV clean file")?;
+let success_msg = format!(
+"✓ Batch attempted to remove {} LEIs from GLEIF CSV (was {} records, now {} records, removed {} rows) -> {}",
+leis_to_remove.len(), original_count, new_count, removed_count, clean_path.display()
+);
println!("{}", success_msg);
logger::log_info(&success_msg).await;
Ok(())
}
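
Stripped to its core, the new function is a read-filter-rewrite pass: parse records with the csv crate, normalize every field (trim, strip quotes, uppercase) against a HashSet of LEIs, and rewrite the survivors through a csv writer. A self-contained sketch of that pass under the same assumptions (header row present, a LEI may appear in any column); the helper name and sample rows are hypothetical:

use std::collections::HashSet;
use csv::{ReaderBuilder, WriterBuilder};

fn filter_out_leis(input: &str, remove: &HashSet<String>) -> anyhow::Result<String> {
    let mut reader = ReaderBuilder::new()
        .has_headers(true)
        .flexible(true)
        .from_reader(input.as_bytes());
    let headers = reader.headers()?.clone();
    let mut wtr = WriterBuilder::new().from_writer(vec![]);
    wtr.write_record(headers.iter())?;
    for rec in reader.records() {
        let rec = rec?;
        // Drop the row if any normalized field matches a LEI in the remove set.
        let hit = rec
            .iter()
            .any(|f| remove.contains(&f.trim().trim_matches('"').to_uppercase()));
        if !hit {
            wtr.write_record(rec.iter())?;
        }
    }
    let bytes = wtr.into_inner().map_err(|e| anyhow::anyhow!("finalize writer: {e}"))?;
    Ok(String::from_utf8(bytes)?)
}

fn main() -> anyhow::Result<()> {
    let csv = "ISIN,LEI\nXS0000000001,TESTLEI0000000000001\nXS0000000002,TESTLEI0000000000002\n";
    let remove: HashSet<String> = ["TESTLEI0000000000001".to_string()].into_iter().collect();
    let out = filter_out_leis(csv, &remove)?;
    assert_eq!(out.lines().count(), 2); // header + the one surviving row
    Ok(())
}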
@@ -907,7 +990,7 @@ pub async fn load_or_build_all_securities(
let mut option_securities = Vec::new();
for figi_info in figi_infos {
-match figi_info.securityType.as_str() {
+match figi_info.security_type.as_str() {
"Common Stock" => common_stocks.push(figi_info.clone()),
"Equity WRT" => warrant_securities.push(figi_info.clone()),
"Equity Option" => option_securities.push(figi_info.clone()),