Files
WebScraper/src/corporate/update.rs
2025-12-07 17:38:32 +01:00

211 lines
7.9 KiB
Rust

// src/corporate/update.rs
use super::{scraper::*, storage::*, helpers::*, types::*, aggregation::*, openfigi::*};
use crate::config::Config;
use crate::util::directories::DataPaths;
use crate::util::logger;
use crate::webdriver::webdriver::ChromeDriverPool;
use chrono::Local;
use std::collections::{HashMap};
use std::sync::Arc;
/// Main function: Full update for all companies (LEI-based) with optimized parallel execution.
///
/// This function coordinates the entire update process:
/// - Loads GLEIF mappings
/// - Builds FIGI-LEI map
/// - Loads existing events
/// - Processes each company: discovers exchanges via FIGI, fetches prices & earnings, aggregates data
/// - Uses the provided shared ChromeDriver pool for efficient parallel scraping
/// - Saves optimized events
///
/// # Arguments
/// * `config` - The application configuration.
/// * `pool` - Shared pool of ChromeDriver instances for scraping.
///
/// # Errors
/// Returns an error if any step in the update process fails.
pub async fn run_full_update(config: &Config, pool: &Arc<ChromeDriverPool>) -> anyhow::Result<()> {
let msg = "=== Starting LEI-based corporate full update ===";
println!("{}", msg);
logger::log_info(msg).await;
// Initialize paths
let paths = DataPaths::new(".")?;
// 1. Load fresh GLEIF ISIN ↔ LEI mapping
logger::log_info("Corporate Update: Loading GLEIF ISIN ↔ LEI mapping...").await;
let lei_to_isins: HashMap<String, Vec<String>> = match load_isin_lei_csv().await {
Ok(map) => {
let msg = format!("Corporate Update: Loaded GLEIF mapping with {} LEI entries", map.len());
println!("{}", msg);
logger::log_info(&msg).await;
map
}
Err(e) => {
let msg = format!("Corporate Update: Warning - Could not load GLEIF ISIN↔LEI mapping: {}", e);
eprintln!("{}", msg);
logger::log_warn(&msg).await;
HashMap::new()
}
};
// 2. Load OpenFIGI mapping value lists (cached)
logger::log_info("Corporate Update: Loading OpenFIGI type lists...").await;
if let Err(e) = load_figi_type_lists().await {
let msg = format!("Corporate Update: Warning - Could not load OpenFIGI type lists: {}", e);
eprintln!("{}", msg);
logger::log_warn(&msg).await;
}
logger::log_info("Corporate Update: OpenFIGI type lists loaded").await;
// 3. Build FIGI → LEI map
logger::log_info("Corporate Update: Building FIGI → LEI map...").await;
let figi_to_lei:HashMap<String, Vec<FigiInfo>> = match build_lei_to_figi_infos(&lei_to_isins, None).await {
Ok(map) => {
let msg = format!("Corporate Update: Built FIGI map with {} entries", map.len());
println!("{}", msg);
logger::log_info(&msg).await;
map
}
Err(e) => {
let msg = format!("Corporate Update: Warning - Could not build FIGI→LEI map: {}", e);
eprintln!("{}", msg);
logger::log_warn(&msg).await;
HashMap::new()
}
};
// 4. Load or build companies
logger::log_info("Corporate Update: Loading/building company securities...").await;
let securities = load_or_build_all_securities(&figi_to_lei).await?;
let msg = format!("Corporate Update: Processing {} companies", securities.0.len());
println!("{}", msg);
logger::log_info(&msg).await;
// HashMap<Name, HashMap<ISIN, Ticker>> - unique pairs only
let companies: HashMap<String, HashMap<String, String>> = securities.0
.iter()
.fold(HashMap::new(), |mut acc, security| {
let mut isin_ticker_pairs: HashMap<String, String> = HashMap::new();
// Collect all unique ISIN-Ticker pairs
for figi_infos in security.1.securities.values() {
for figi_info in figi_infos {
if !figi_info.isin.is_empty() && !figi_info.ticker.is_empty() {
isin_ticker_pairs.insert(figi_info.isin.clone(), figi_info.ticker.clone());
}
}
}
// Only add if there are pairs
if !isin_ticker_pairs.is_empty() {
acc.insert(security.1.name.clone(), isin_ticker_pairs);
}
acc
});
logger::log_info(&format!("Corporate Update: Saving {} companies to JSONL", companies.len())).await;
save_companies_to_jsonl(&paths, &companies).await.expect("Failed to save companies List.");
logger::log_info("Corporate Update: Companies saved successfully").await;
// 5. Load existing earnings events (for change detection)
logger::log_info("Corporate Update: Loading existing events...").await;
let existing_events = match load_existing_events(&paths).await {
Ok(events) => {
let msg = format!("Corporate Update: Loaded {} existing events", events.len());
println!("{}", msg);
logger::log_info(&msg).await;
events
}
Err(e) => {
let msg = format!("Corporate Update: Warning - Could not load existing events: {}", e);
eprintln!("{}", msg);
logger::log_warn(&msg).await;
HashMap::new()
}
};
// 5. Use the provided pool (no need to create a new one)
let pool_size = pool.get_number_of_instances(); // Use the size from the shared pool
logger::log_info(&format!("Corporate Update: Using pool size: {}", pool_size)).await;
// Process companies in parallel using the shared pool
/*let results: Vec<_> = stream::iter(companies.into_iter())
.map(|company| {
let pool_clone = pool.clone();
async move {
process_company_data(&company, &pool_clone, &mut existing_events).await
}
})
.buffer_unordered(pool_size)
.collect().await;
// Handle results (e.g., collect changes)
let mut all_changes = Vec::new();
for result in results {
if let Ok(ProcessResult { changes }) = result {
all_changes.extend(changes);
}
}*/
logger::log_info(&format!("Corporate Update: Saving {} events to optimized storage", existing_events.len())).await;
save_optimized_events(&paths, existing_events).await?;
logger::log_info("Corporate Update: Events saved successfully").await;
//save_changes(&all_changes).await?;
let msg = "✓ Corporate update complete";
println!("{}", msg);
logger::log_info(msg).await;
Ok(())
}
/// Result of merging a batch of events via [`process_batch`].
pub struct ProcessResult {
/// Changes that were detected while the batch was merged into the existing events.
pub changes: Vec<CompanyEventChange>,
}
/// Merges `new_events` into `existing` and collects the changes detected on the way.
///
/// For every event in `new_events`:
/// * If `existing` already holds an entry under the same key (as computed by
///   `event_key`), the differences reported by `detect_changes` are recorded
///   and the stored event is replaced by the new one.
/// * Otherwise, if some stored event has the same ticker and date (necessarily
///   under a different key), that stale entry is removed and the new event is
///   stored. A `"time"` change is recorded only when the event date is
///   strictly after `today`.
/// * Otherwise the event is simply inserted.
///
/// # Arguments
/// * `new_events` - Events to merge.
/// * `existing` - Map from event key to stored event; updated in place.
/// * `today` - Reference date. It is compared to `date` as a plain string, so
///   both are assumed to share a lexicographically sortable format
///   (e.g. `YYYY-MM-DD`) — confirm against callers.
///
/// # Returns
/// A [`ProcessResult`] holding every change detected for this batch.
pub fn process_batch(
    new_events: &[CompanyEvent],
    existing: &mut HashMap<String, CompanyEvent>,
    today: &str,
) -> ProcessResult {
    let mut changes = Vec::new();

    for new in new_events {
        let key = event_key(new);

        if let Some(old) = existing.get(&key) {
            changes.extend(detect_changes(old, new, today));
            existing.insert(key, new.clone());
            continue;
        }

        // Check for time change on same date. No entry is stored under `key`
        // at this point, so any ticker/date match lives under a different key.
        // Fields are compared directly to avoid building a string per entry.
        let stale_key = existing
            .iter()
            .find(|(_, stored)| stored.ticker == new.ticker && stored.date == new.date)
            .map(|(stored_key, _)| stored_key.clone());

        // Removing the stale entry hands back the old event, so no clone of
        // the whole event is needed to read its previous time.
        if let Some(old_event) = stale_key.and_then(|k| existing.remove(&k)) {
            if new.date.as_str() > today {
                changes.push(CompanyEventChange {
                    ticker: new.ticker.clone(),
                    date: new.date.clone(),
                    field_changed: "time".to_string(),
                    old_value: old_event.time,
                    new_value: new.time.clone(),
                    detected_at: Local::now().format("%Y-%m-%d %H:%M:%S").to_string(),
                });
            }
        }

        existing.insert(key, new.clone());
    }

    ProcessResult { changes }
}