// src/corporate/collect_exchanges.rs use crate::util::directories::DataPaths; use crate::util::integrity::{DataStage, StateManager, file_reference}; use crate::util::logger; use crate::scraper::yahoo::ChartData; use serde::{Deserialize, Serialize}; use std::collections::HashMap; use tokio::fs; use tokio::io::AsyncWriteExt; /// Exchange information collected from company data #[derive(Debug, Clone, Serialize, Deserialize)] pub struct ExchangeInfo { #[serde(rename = "exchangeName")] pub exchange_name: String, pub currency: String, #[serde(rename = "currencySymbol")] pub currency_symbol: String, #[serde(rename = "exchangeDataDelayedBy")] pub exchange_data_delayed_by: i64, #[serde(rename = "totalMarketCap")] pub total_market_cap: u64, #[serde(rename = "totalMarketCapUSD")] pub total_market_cap_usd: f64, // NEW: Market cap converted to USD pub companies: Vec, } /// Extract exchange data from company core data #[derive(Debug, Deserialize)] struct CompanyCoreData { modules: Option, } #[derive(Debug, Deserialize)] struct CoreModules { price: Option, } #[derive(Debug, Deserialize)] struct PriceModule { #[serde(rename = "exchangeName")] exchange_name: Option, currency: Option, #[serde(rename = "currencySymbol")] currency_symbol: Option, exchange: Option, #[serde(rename = "exchangeDataDelayedBy")] exchange_data_delayed_by: Option, #[serde(rename = "marketCap")] market_cap: Option, } #[derive(Debug, Deserialize)] struct MarketCapData { raw: Option, } /// Normalize currency code and get conversion factor /// Handles special cases like GBp (pence) and ZAc (cents) fn normalize_currency(currency: &str) -> (&str, f64) { match currency { "GBp" => ("GBP", 100.0), // British Pence -> Pounds (divide by 100) "ZAc" => ("ZAR", 100.0), // South African Cents -> Rand (divide by 100) _ => (currency, 1.0), // No conversion needed } } /// FX rate cache for currency conversion struct FxRateCache { rates: HashMap, } impl FxRateCache { /// Create new FX rate cache by loading all currency charts async fn new(paths: &DataPaths) -> anyhow::Result { let mut rates = HashMap::new(); // USD to USD is always 1.0 rates.insert("USD".to_string(), 1.0); let currency_dir = paths.data_dir().join("economic").join("currency"); if !currency_dir.exists() { logger::log_warn(" FX rates directory not found - will use default rates").await; return Ok(Self { rates }); } let mut entries = fs::read_dir(¤cy_dir).await?; let mut loaded_count = 0; while let Some(entry) = entries.next_entry().await? { let path = entry.path(); if !path.is_dir() { continue; } let currency_code = match path.file_name().and_then(|n| n.to_str()) { Some(code) => code.to_string(), None => continue, }; let chart_path = path.join("chart").join("data.jsonl"); if !chart_path.exists() { continue; } // Load chart and get latest rate match load_latest_fx_rate(&chart_path).await { Ok(rate) => { rates.insert(currency_code.clone(), rate); loaded_count += 1; } Err(e) => { logger::log_warn(&format!( " Failed to load FX rate for {}: {}", currency_code, e )).await; } } } logger::log_info(&format!(" ✓ Loaded {} FX rates", loaded_count)).await; Ok(Self { rates }) } /// Convert amount from given currency to USD fn to_usd(&self, amount: u64, currency: &str) -> f64 { // Normalize currency and get conversion factor // e.g., GBp -> (GBP, 100.0), ZAc -> (ZAR, 100.0) let (normalized_currency, factor) = normalize_currency(currency); // First convert to base currency unit (e.g., pence to pounds) let amount_in_base = amount as f64 / factor; if normalized_currency == "USD" { return amount_in_base; } // Get rate (USD per currency unit) // For USD/EUR = 0.92, this means 1 USD = 0.92 EUR // To convert EUR to USD: EUR_amount / 0.92 match self.rates.get(normalized_currency) { Some(&rate) if rate > 0.0 => { amount_in_base / rate } _ => { // Fallback: use approximate rates for common currencies let fallback_rate = get_fallback_rate(normalized_currency); amount_in_base / fallback_rate } } } /// Get rate for a currency (USD per unit) fn get_rate(&self, currency: &str) -> Option { let (normalized_currency, _) = normalize_currency(currency); self.rates.get(normalized_currency).copied() } } /// Load latest FX rate from chart data async fn load_latest_fx_rate(chart_path: &std::path::Path) -> anyhow::Result { let content = fs::read_to_string(chart_path).await?; for line in content.lines() { if line.trim().is_empty() { continue; } let chart: ChartData = serde_json::from_str(line)?; if chart.quotes.is_empty() { return Err(anyhow::anyhow!("No quotes in chart data")); } // Get most recent quote with a close price let latest_rate = chart.quotes .iter() .rev() .find_map(|q| q.close) .ok_or_else(|| anyhow::anyhow!("No valid close prices"))?; return Ok(latest_rate); } Err(anyhow::anyhow!("No data in chart file")) } /// Fallback rates for common currencies (approximate, as of 2024) /// These are USD per currency unit (same format as our FX data) fn get_fallback_rate(currency: &str) -> f64 { match currency { "USD" => 1.0, "EUR" => 0.92, // 1 USD = 0.92 EUR "GBP" => 0.79, // 1 USD = 0.79 GBP "JPY" => 150.0, // 1 USD = 150 JPY "CNY" | "RMB" => 7.2, "CHF" => 0.88, "AUD" => 1.52, "CAD" => 1.36, "HKD" => 7.8, "SGD" => 1.34, "SEK" => 10.5, "NOK" => 10.8, "DKK" => 6.9, "PLN" => 4.0, "CZK" => 23.0, "TRY" => 32.0, "ZAR" => 18.5, "ILS" => 3.7, "RON" => 4.6, "KWD" => 0.31, "TWD" => 31.5, "ISK" => 138.0, "NZD" => 1.65, "MXN" => 17.0, "BRL" => 5.0, "INR" => 83.0, "KRW" => 1320.0, "THB" => 35.0, "MYR" => 4.6, "IDR" => 15700.0, "PHP" => 56.0, "VND" => 24500.0, _ => { // Default: assume similar to USD 1.0 } } } /// Collect all exchanges from company directories and create yahoo_exchanges.json /// /// # Features /// - Iterates through all company directories /// - Extracts exchange data from core/data.jsonl /// - Groups companies by exchange /// - Sums up market caps for each exchange /// - Converts all market caps to USD using FX rates /// - Saves consolidated mapping to data/yahoo_exchanges.json /// - Handles missing or invalid data gracefully /// - Integrity tracking with content hash validation pub async fn collect_and_save_exchanges(paths: &DataPaths) -> anyhow::Result { let manager = StateManager::new(paths.integrity_dir()).await?; let step_name = "exchange_collection_complete"; let output_path = paths.data_dir().join("yahoo_exchanges.json"); if manager.is_step_valid(step_name).await? { logger::log_info(" Exchange collection already completed and valid").await; // Load and count exchanges if output_path.exists() { let content = fs::read_to_string(&output_path).await?; let exchanges: HashMap = serde_json::from_str(&content)?; logger::log_info(&format!(" ✓ Found {} valid exchanges", exchanges.len())).await; return Ok(exchanges.len()); } } logger::log_info("Collecting exchange information from company directories...").await; let corporate_dir = paths.corporate_dir(); if !corporate_dir.exists() { logger::log_warn(" Corporate directory does not exist").await; return Ok(0); } // Load FX rates for currency conversion logger::log_info("Loading FX rates for currency conversion...").await; let fx_cache = FxRateCache::new(paths).await?; // Map of exchange code -> ExchangeInfo let mut exchanges: HashMap = HashMap::new(); let mut entries = fs::read_dir(&corporate_dir).await?; let mut processed_count = 0; let mut skipped_count = 0; while let Some(entry) = entries.next_entry().await? { let company_path = entry.path(); if !company_path.is_dir() { continue; } let company_name = match company_path.file_name().and_then(|n| n.to_str()) { Some(name) => name.to_string(), None => { skipped_count += 1; continue; } }; // Read core/data.jsonl let core_data_path = company_path.join("core").join("data.jsonl"); if !core_data_path.exists() { skipped_count += 1; continue; } // Parse core data match extract_exchange_info(&core_data_path, &company_name).await { Ok(Some((exchange_code, exchange_name, currency, currency_symbol, delay, market_cap))) => { // Convert market cap to USD let market_cap_usd = fx_cache.to_usd(market_cap, ¤cy); // Add or update exchange entry exchanges .entry(exchange_code.clone()) .and_modify(|info| { // Add company to existing exchange and sum market caps info.companies.push(company_name.clone()); info.total_market_cap = info.total_market_cap.saturating_add(market_cap); info.total_market_cap_usd += market_cap_usd; }) .or_insert_with(|| { // Create new exchange entry ExchangeInfo { exchange_name, currency, currency_symbol, exchange_data_delayed_by: delay, total_market_cap: market_cap, total_market_cap_usd: market_cap_usd, companies: vec![company_name.clone()], } }); processed_count += 1; } Ok(None) => { // No exchange data found skipped_count += 1; } Err(e) => { logger::log_warn(&format!( " Failed to parse exchange data for {}: {}", company_name, e )).await; skipped_count += 1; } } // Progress logging every 100 companies if (processed_count + skipped_count) % 100 == 0 { logger::log_info(&format!( " Progress: {} companies processed, {} skipped", processed_count, skipped_count )).await; } } logger::log_info(&format!( " ✓ Collected data from {} companies ({} skipped)", processed_count, skipped_count )).await; logger::log_info(&format!( " ✓ Found {} unique exchanges", exchanges.len() )).await; // Sort companies within each exchange for consistency for exchange_info in exchanges.values_mut() { exchange_info.companies.sort(); } // Save to yahoo_exchanges.json save_exchanges_json(&output_path, &exchanges).await?; logger::log_info(&format!( " ✓ Saved exchange mapping to {}", output_path.display() )).await; track_exchange_collection_completion(&manager, &output_path, step_name).await?; logger::log_info(" ✓ Exchange collection marked as complete with integrity tracking").await; // Print summary statistics print_exchange_statistics(&exchanges, &fx_cache).await; Ok(exchanges.len()) } /// Track exchange collection completion with content hash verification async fn track_exchange_collection_completion( manager: &StateManager, output_path: &std::path::Path, step_name: &str, ) -> anyhow::Result<()> { // Create content reference for the output file let content_reference = file_reference(output_path); // Track completion with: // - Content reference: The yahoo_exchanges.json file // - Data stage: Data (7-day TTL by default) // - Dependencies: None (this is a collection step, not dependent on other tracked steps) // Note: In practice, it depends on core data, but we track the output file // which will change if core data changes, so explicit dependency not needed manager.update_entry( step_name.to_string(), content_reference, DataStage::Data, None, // Use default TTL (7 days for Data stage) ).await?; Ok(()) } /// Extract exchange information from a company's core data file async fn extract_exchange_info( core_data_path: &std::path::Path, company_name: &str, ) -> anyhow::Result> { let content = fs::read_to_string(core_data_path).await?; // Parse JSONL - should be single line for line in content.lines() { if line.trim().is_empty() { continue; } match serde_json::from_str::(line) { Ok(data) => { // Extract from modules.price let price_module = match data.modules.and_then(|m| m.price) { Some(p) => p, None => return Ok(None), }; // Extract required fields let exchange = match price_module.exchange { Some(e) if !e.is_empty() => e, _ => return Ok(None), }; // Filter out invalid placeholder exchange codes if exchange == "CCC" { return Ok(None); } let exchange_name = price_module.exchange_name.unwrap_or_else(|| exchange.clone()); let currency = price_module.currency.unwrap_or_else(|| "USD".to_string()); let currency_symbol = price_module.currency_symbol.unwrap_or_else(|| "$".to_string()); let delay = price_module.exchange_data_delayed_by.unwrap_or(0); let market_cap = price_module .market_cap .and_then(|mc| mc.raw) .unwrap_or(0); return Ok(Some(( exchange, exchange_name, currency, currency_symbol, delay, market_cap, ))); } Err(e) => { // Try to parse as generic JSON to check if exchange field exists in modules.price if let Ok(json) = serde_json::from_str::(line) { // Try to access modules.price.exchange if let Some(price) = json.get("modules").and_then(|m| m.get("price")) { if let Some(exchange) = price.get("exchange").and_then(|v| v.as_str()) { if !exchange.is_empty() && exchange != "CCC" { let exchange_name = price .get("exchangeName") .and_then(|v| v.as_str()) .unwrap_or(exchange) .to_string(); let currency = price .get("currency") .and_then(|v| v.as_str()) .unwrap_or("USD") .to_string(); let currency_symbol = price .get("currencySymbol") .and_then(|v| v.as_str()) .unwrap_or("$") .to_string(); let delay = price .get("exchangeDataDelayedBy") .and_then(|v| v.as_i64()) .unwrap_or(0); let market_cap = price .get("marketCap") .and_then(|mc| mc.get("raw")) .and_then(|v| v.as_u64()) .unwrap_or(0); return Ok(Some(( exchange.to_string(), exchange_name, currency, currency_symbol, delay, market_cap, ))); } } } } return Err(anyhow::anyhow!( "Failed to parse core data for {}: {}", company_name, e )); } } } Ok(None) } /// Save exchanges map to JSON file with fsync async fn save_exchanges_json( path: &std::path::Path, exchanges: &HashMap, ) -> anyhow::Result<()> { // Create sorted output for consistency let mut sorted_exchanges: Vec<_> = exchanges.iter().collect(); sorted_exchanges.sort_by_key(|(code, _)| code.as_str()); let exchanges_map: HashMap = sorted_exchanges .into_iter() .map(|(k, v)| (k.clone(), v.clone())) .collect(); // Serialize with pretty printing let json_content = serde_json::to_string_pretty(&exchanges_map)?; // Write to temporary file first (atomic write pattern) let tmp_path = path.with_extension("json.tmp"); let mut file = fs::File::create(&tmp_path).await?; file.write_all(json_content.as_bytes()).await?; file.write_all(b"\n").await?; file.flush().await?; file.sync_all().await?; // Atomic rename fs::rename(&tmp_path, path).await?; Ok(()) } /// Format market cap as a human-readable string fn format_market_cap(market_cap: f64) -> String { if market_cap >= 1_000_000_000_000.0 { format!("{:.2}T", market_cap / 1_000_000_000_000.0) } else if market_cap >= 1_000_000_000.0 { format!("{:.2}B", market_cap / 1_000_000_000.0) } else if market_cap >= 1_000_000.0 { format!("{:.2}M", market_cap / 1_000_000.0) } else if market_cap >= 1_000.0 { format!("{:.2}K", market_cap / 1_000.0) } else { format!("{:.2}", market_cap) } } /// Print statistics about collected exchanges async fn print_exchange_statistics(exchanges: &HashMap, fx_cache: &FxRateCache) { logger::log_info("Exchange Statistics (sorted by USD market cap):").await; // Sort by total market cap in USD (descending) let mut exchange_list: Vec<_> = exchanges.iter().collect(); exchange_list.sort_by(|a, b| { b.1.total_market_cap_usd .partial_cmp(&a.1.total_market_cap_usd) .unwrap_or(std::cmp::Ordering::Equal) }); // Print top 20 exchanges by total market cap (USD) logger::log_info(" Top 20 exchanges by total market cap (USD):").await; for (i, (code, info)) in exchange_list.iter().take(20).enumerate() { let (normalized_currency, factor) = normalize_currency(&info.currency); let fx_rate = fx_cache.get_rate(&info.currency); let fx_info = match fx_rate { Some(rate) => { if factor > 1.0 { // Show conversion for pence/cents format!(" (1 {} = {} {}, {} {} = 1 {})", normalized_currency, format!("{:.4}", rate), "USD", factor as i32, info.currency, normalized_currency) } else { format!(" (1 USD = {:.4} {})", rate, info.currency) } } None => format!(" (using fallback rate for {})", info.currency), }; logger::log_info(&format!( " {}. {} ({}) - ${} USD ({}{} {}) - {} companies{}", i + 1, info.exchange_name, code, format_market_cap(info.total_market_cap_usd), info.currency_symbol, format_market_cap(info.total_market_cap as f64), info.currency, info.companies.len(), if info.currency != "USD" { &fx_info } else { "" } )).await; } // Count by currency let mut currency_counts: HashMap = HashMap::new(); let mut currency_market_caps: HashMap = HashMap::new(); for info in exchanges.values() { *currency_counts.entry(info.currency.clone()).or_insert(0) += info.companies.len(); *currency_market_caps.entry(info.currency.clone()).or_insert(0.0) += info.total_market_cap_usd; } let mut currencies: Vec<_> = currency_counts.iter().collect(); currencies.sort_by(|a, b| { currency_market_caps.get(b.0) .unwrap_or(&0.0) .partial_cmp(currency_market_caps.get(a.0).unwrap_or(&0.0)) .unwrap_or(std::cmp::Ordering::Equal) }); logger::log_info(" Market cap by currency (USD equivalent):").await; for (currency, count) in currencies.iter().take(10) { let market_cap_usd = currency_market_caps.get(*currency).unwrap_or(&0.0); let (normalized_currency, factor) = normalize_currency(currency); let fx_rate = fx_cache.get_rate(currency); let fx_info = match fx_rate { Some(rate) => { if factor > 1.0 { format!(" (1 {} = {:.4} USD, {} {} = 1 {})", normalized_currency, rate, factor as i32, currency, normalized_currency) } else { format!(" (1 USD = {:.4} {})", rate, currency) } } None => format!(" (fallback)"), }; logger::log_info(&format!( " {}: {} companies, ${} USD{}", currency, count, format_market_cap(*market_cap_usd), if *currency != "USD" { &fx_info } else { "" } )).await; } // Delay statistics let delayed_exchanges: Vec<_> = exchanges .iter() .filter(|(_, info)| info.exchange_data_delayed_by > 0) .collect(); if !delayed_exchanges.is_empty() { logger::log_info(&format!( " Exchanges with data delay: {} (out of {})", delayed_exchanges.len(), exchanges.len() )).await; } // Total market cap across all exchanges (in USD) let total_market_cap_usd: f64 = exchanges.values() .map(|info| info.total_market_cap_usd) .sum(); logger::log_info(&format!( " Total market cap across all exchanges: ${} USD", format_market_cap(total_market_cap_usd) )).await; } /// Get exchange information for a specific exchange code pub async fn get_exchange_info( paths: &DataPaths, exchange_code: &str, ) -> anyhow::Result> { let exchanges_path = paths.data_dir().join("yahoo_exchanges.json"); if !exchanges_path.exists() { return Ok(None); } let content = fs::read_to_string(&exchanges_path).await?; let exchanges: HashMap = serde_json::from_str(&content)?; Ok(exchanges.get(exchange_code).cloned()) } /// List all available exchanges pub async fn list_all_exchanges(paths: &DataPaths) -> anyhow::Result> { let exchanges_path = paths.data_dir().join("yahoo_exchanges.json"); if !exchanges_path.exists() { return Ok(Vec::new()); } let content = fs::read_to_string(&exchanges_path).await?; let exchanges: HashMap = serde_json::from_str(&content)?; let mut exchange_list: Vec<_> = exchanges.into_iter().collect(); exchange_list.sort_by(|a, b| a.0.cmp(&b.0)); Ok(exchange_list) }