// src/corporate/update.rs - UPDATED WITH DATA INTEGRITY FIXES

use super::{scraper::*, storage::*, helpers::*, types::*, openfigi::*, yahoo::*};
use crate::config::Config;
use crate::corporate::update_parallel::build_companies_jsonl_streaming_parallel;
use crate::util::directories::DataPaths;
use crate::util::logger;
use crate::scraper::webdriver::ChromeDriverPool;

use chrono::Local;
use std::collections::HashMap;
use std::sync::Arc;
use std::sync::atomic::{AtomicBool, Ordering};

/// UPDATED: Main corporate update entry point with shutdown awareness
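///
/// A minimal invocation sketch (illustrative only; the `Config` and
/// `ChromeDriverPool` constructors below are hypothetical and stand in for
/// however those values are actually built elsewhere in the crate):
///
/// ```ignore
/// let config = Config::default();                        // hypothetical constructor
/// let pool = Arc::new(ChromeDriverPool::new(4).await?);  // hypothetical constructor
/// let shutdown_flag = Arc::new(AtomicBool::new(false));
/// run_full_update(&config, &pool, &shutdown_flag).await?;
/// ```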
pub async fn run_full_update(
    _config: &Config,
    pool: &Arc<ChromeDriverPool>,
    shutdown_flag: &Arc<AtomicBool>,
) -> anyhow::Result<()> {
    logger::log_info("=== Corporate Update (STREAMING MODE WITH DATA INTEGRITY) ===").await;

    let paths = DataPaths::new(".")?;

    logger::log_info("Step 1: Downloading GLEIF CSV...").await;
    let gleif_csv_path = match download_isin_lei_csv().await? {
        Some(p) => {
            logger::log_info(&format!(" ✓ GLEIF CSV at: {}", p)).await;
            p
        }
        None => {
            logger::log_warn(" ✗ Could not obtain GLEIF CSV").await;
            return Ok(());
        }
    };

    if shutdown_flag.load(Ordering::SeqCst) {
        logger::log_warn("Shutdown detected after GLEIF download").await;
        return Ok(());
    }

    logger::log_info("Step 2: Loading OpenFIGI metadata...").await;
    load_figi_type_lists().await.ok();
    logger::log_info(" ✓ OpenFIGI metadata loaded").await;

    if shutdown_flag.load(Ordering::SeqCst) {
        logger::log_warn("Shutdown detected after OpenFIGI load").await;
        return Ok(());
    }

    logger::log_info("Step 3: Checking LEI-FIGI mapping status...").await;
    let all_mapped = ensure_all_leis_mapped(&gleif_csv_path, None).await?;

    if !all_mapped {
        logger::log_warn(" ⚠ Some LEIs failed to map - continuing with partial data").await;
    } else {
        logger::log_info(" ✓ All LEIs successfully mapped").await;
    }

    if shutdown_flag.load(Ordering::SeqCst) {
        logger::log_warn("Shutdown detected after LEI-FIGI mapping").await;
        return Ok(());
    }

    logger::log_info("Step 4: Building securities map (streaming)...").await;
    let date_dir = find_most_recent_figi_date_dir(&paths).await?;

    if let Some(date_dir) = date_dir {
        logger::log_info(&format!(" Using FIGI data from: {:?}", date_dir)).await;
        build_securities_from_figi_streaming(&date_dir).await?;
        logger::log_info(" ✓ Securities map updated").await;
    } else {
        logger::log_warn(" ✗ No FIGI data directory found").await;
    }

    if shutdown_flag.load(Ordering::SeqCst) {
        logger::log_warn("Shutdown detected after securities map build").await;
        return Ok(());
    }

    logger::log_info("Step 5: Building companies.jsonl with parallel processing and validation...").await;
    let count = build_companies_jsonl_streaming_parallel(&paths, pool, shutdown_flag).await?;
    logger::log_info(&format!(" ✓ Saved {} companies", count)).await;

    if !shutdown_flag.load(Ordering::SeqCst) {
        logger::log_info("Step 6: Processing events (using index)...").await;
        let _event_index = build_event_index(&paths).await?;
        logger::log_info(" ✓ Event index built").await;
    } else {
        logger::log_warn("Shutdown detected, skipping event index build").await;
    }

    logger::log_info("✓ Corporate update complete").await;
    Ok(())
}

/// UPDATED: Serial version with validation (kept for compatibility/debugging)
///
/// This is the non-parallel version that processes companies sequentially.
/// It applies the same validation and shutdown checks as the parallel version.
///
/// Use this for:
/// - Debugging issues with specific companies
/// - Environments where parallel processing isn't desired
/// - Testing validation logic without concurrency complexity
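///
/// A minimal invocation sketch (illustrative; the `ChromeDriverPool`
/// constructor below is hypothetical):
///
/// ```ignore
/// let paths = DataPaths::new(".")?;
/// let pool = Arc::new(ChromeDriverPool::new(1).await?);  // hypothetical constructor
/// let shutdown_flag = Arc::new(AtomicBool::new(false));
/// let saved = build_companies_jsonl_streaming_serial(&paths, &pool, &shutdown_flag).await?;
/// logger::log_info(&format!("wrote {} companies", saved)).await;
/// ```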
async fn build_companies_jsonl_streaming_serial(
    paths: &DataPaths,
    pool: &Arc<ChromeDriverPool>,
    shutdown_flag: &Arc<AtomicBool>,
) -> anyhow::Result<usize> {
    // Configuration constants
    const CHECKPOINT_INTERVAL: usize = 50;
    const FSYNC_BATCH_SIZE: usize = 10;
    const FSYNC_INTERVAL_SECS: u64 = 10;

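    // Durability knobs (see the constants above): the append-only update log
    // is flushed and fsynced after every FSYNC_BATCH_SIZE (10) writes or after
    // FSYNC_INTERVAL_SECS (10) seconds, whichever comes first, so a crash
    // loses at most that much un-synced work; a full checkpoint of
    // companies.jsonl is rewritten every CHECKPOINT_INTERVAL (50) companies.
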
    let corporate_path = paths.data_dir().join("corporate").join("by_name");
    let securities_path = corporate_path.join("common_stocks.json");

    if !securities_path.exists() {
        logger::log_warn("No common_stocks.json found").await;
        return Ok(0);
    }

    let content = tokio::fs::read_to_string(securities_path).await?;
    let securities: HashMap<String, CompanyInfo> = serde_json::from_str(&content)?;

    let companies_path = paths.data_dir().join("companies.jsonl");
    let log_path = paths.data_dir().join("companies_updates.log");

    if let Some(parent) = companies_path.parent() {
        tokio::fs::create_dir_all(parent).await?;
    }

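    // Crash-recovery model used below: companies.jsonl is the last good
    // checkpoint and companies_updates.log is an append-only log of entries
    // written since that checkpoint. On startup the checkpoint is loaded
    // first, then the log is replayed on top of it (log entries win because
    // they are newer); periodic checkpoints later rewrite companies.jsonl
    // atomically and clear the log.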
    // === RECOVERY PHASE: Load checkpoint + replay log ===
    let mut existing_companies: HashMap<String, CompanyCrossPlatformInfo> = HashMap::new();
    let mut processed_names: std::collections::HashSet<String> = std::collections::HashSet::new();

    if companies_path.exists() {
        logger::log_info("Loading checkpoint from companies.jsonl...").await;
        let existing_content = tokio::fs::read_to_string(&companies_path).await?;

        for line in existing_content.lines() {
            if line.trim().is_empty() {
                continue;
            }

            match serde_json::from_str::<CompanyCrossPlatformInfo>(line) {
                Ok(company) => {
                    processed_names.insert(company.name.clone());
                    existing_companies.insert(company.name.clone(), company);
                }
                Err(e) => {
                    logger::log_warn(&format!("Skipping invalid checkpoint line: {}", e)).await;
                }
            }
        }
        logger::log_info(&format!("Loaded checkpoint with {} companies", existing_companies.len())).await;
    }

    if log_path.exists() {
        logger::log_info("Replaying update log...").await;
        let log_content = tokio::fs::read_to_string(&log_path).await?;
        let mut replayed = 0;

        for line in log_content.lines() {
            if line.trim().is_empty() {
                continue;
            }

            match serde_json::from_str::<CompanyCrossPlatformInfo>(line) {
                Ok(company) => {
                    processed_names.insert(company.name.clone());
                    existing_companies.insert(company.name.clone(), company);
                    replayed += 1;
                }
                Err(e) => {
                    logger::log_warn(&format!("Skipping invalid log line: {}", e)).await;
                }
            }
        }
        if replayed > 0 {
            logger::log_info(&format!("Replayed {} updates from log", replayed)).await;
        }
    }

    // === OPEN LOG FILE ===
    use tokio::fs::OpenOptions;
    use tokio::io::AsyncWriteExt;

    let mut log_file = OpenOptions::new()
        .create(true)
        .append(true)
        .open(&log_path)
        .await?;

    let mut writes_since_fsync = 0;
    let mut last_fsync = std::time::Instant::now();
    let mut updates_since_checkpoint = 0;
    let mut count = 0;
    let mut new_count = 0;
    let mut updated_count = 0;

    logger::log_info(&format!("Processing {} companies sequentially...", securities.len())).await;

    // === PROCESS COMPANIES SEQUENTIALLY ===
    for (name, company_info) in securities.clone() {
        // Check shutdown before each company
        if shutdown_flag.load(Ordering::SeqCst) {
            logger::log_warn(&format!(
                "Shutdown detected at company: {} (progress: {}/{})",
                name, count, securities.len()
            )).await;
            break;
        }

        let existing_entry = existing_companies.get(&name).cloned();
        let is_update = existing_entry.is_some();

        // Process company with validation
        match process_single_company_serial(
            name.clone(),
            company_info,
            existing_entry,
            pool,
            shutdown_flag,
        ).await {
            Ok(Some(company_entry)) => {
                // Write to log
                let line = serde_json::to_string(&company_entry)?;
                log_file.write_all(line.as_bytes()).await?;
                log_file.write_all(b"\n").await?;

                writes_since_fsync += 1;

                // Batched + time-based fsync
                let should_fsync = writes_since_fsync >= FSYNC_BATCH_SIZE
                    || last_fsync.elapsed().as_secs() >= FSYNC_INTERVAL_SECS;

                if should_fsync {
                    log_file.flush().await?;
                    log_file.sync_data().await?;
                    writes_since_fsync = 0;
                    last_fsync = std::time::Instant::now();
                }

                // Update in-memory state
                processed_names.insert(name.clone());
                existing_companies.insert(name.clone(), company_entry);

                count += 1;
                updates_since_checkpoint += 1;

                if is_update {
                    updated_count += 1;
                } else {
                    new_count += 1;
                }

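                // The checkpoint below is made atomic by writing the full
                // snapshot to companies.jsonl.tmp, fsyncing it, and renaming
                // it over companies.jsonl (rename is atomic on POSIX
                // filesystems); only then is the update log deleted and
                // reopened.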
                // Periodic checkpoint
                if updates_since_checkpoint >= CHECKPOINT_INTERVAL {
                    if writes_since_fsync > 0 {
                        log_file.flush().await?;
                        log_file.sync_data().await?;
                        writes_since_fsync = 0;
                        last_fsync = std::time::Instant::now();
                    }

                    logger::log_info(&format!("Creating checkpoint at {} companies...", count)).await;

                    let checkpoint_tmp = companies_path.with_extension("jsonl.tmp");
                    let mut checkpoint_file = tokio::fs::File::create(&checkpoint_tmp).await?;

                    for company in existing_companies.values() {
                        let line = serde_json::to_string(company)?;
                        checkpoint_file.write_all(line.as_bytes()).await?;
                        checkpoint_file.write_all(b"\n").await?;
                    }

                    checkpoint_file.flush().await?;
                    checkpoint_file.sync_all().await?;
                    drop(checkpoint_file);

                    tokio::fs::rename(&checkpoint_tmp, &companies_path).await?;

                    drop(log_file);
                    tokio::fs::remove_file(&log_path).await.ok();
                    log_file = OpenOptions::new()
                        .create(true)
                        .append(true)
                        .open(&log_path)
                        .await?;

                    updates_since_checkpoint = 0;
                    logger::log_info("✓ Checkpoint created and log cleared").await;
                }

                if count % 10 == 0 {
                    logger::log_info(&format!(
                        "Progress: {} companies ({} new, {} updated)",
                        count, new_count, updated_count
                    )).await;
                }
            }
            Ok(None) => {
                // Company had no ISINs or was skipped
                logger::log_info(&format!("Skipped company: {} (no ISINs)", name)).await;
            }
            Err(e) => {
                logger::log_warn(&format!("Error processing company {}: {}", name, e)).await;
            }
        }

        // Time-based fsync
        if writes_since_fsync > 0 && last_fsync.elapsed().as_secs() >= FSYNC_INTERVAL_SECS {
            log_file.flush().await?;
            log_file.sync_data().await?;
            writes_since_fsync = 0;
            last_fsync = std::time::Instant::now();
        }
    }

    // === FSYNC PENDING WRITES ===
    if writes_since_fsync > 0 {
        logger::log_info(&format!("Fsyncing {} pending writes...", writes_since_fsync)).await;
        log_file.flush().await?;
        log_file.sync_data().await?;
        logger::log_info("✓ Pending writes saved").await;
    }

    // === FINAL CHECKPOINT ===
    if !shutdown_flag.load(Ordering::SeqCst) && updates_since_checkpoint > 0 {
        logger::log_info("Creating final checkpoint...").await;

        let checkpoint_tmp = companies_path.with_extension("jsonl.tmp");
        let mut checkpoint_file = tokio::fs::File::create(&checkpoint_tmp).await?;

        for company in existing_companies.values() {
            let line = serde_json::to_string(company)?;
            checkpoint_file.write_all(line.as_bytes()).await?;
            checkpoint_file.write_all(b"\n").await?;
        }

        checkpoint_file.flush().await?;
        checkpoint_file.sync_all().await?;
        drop(checkpoint_file);

        tokio::fs::rename(&checkpoint_tmp, &companies_path).await?;

        drop(log_file);
        tokio::fs::remove_file(&log_path).await.ok();

        logger::log_info("✓ Final checkpoint created").await;
    }

    logger::log_info(&format!(
        "Completed: {} total companies ({} new, {} updated)",
        count, new_count, updated_count
    )).await;

    Ok(count)
}

/// UPDATED: Process a single company serially with validation
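///
/// Merges the FIGI-derived ticker lists into the company's existing
/// `isin_tickers_map`, then performs a Yahoo lookup for every ISIN that does
/// not yet carry a `YAHOO:`-prefixed entry. Illustrative entries (hypothetical
/// values) after processing:
///
/// ```text
/// "US0000000000" -> ["ABC", "YAHOO:ABC"]    // lookup succeeded
/// "XS0000000000" -> ["YAHOO:NO_RESULTS"]    // Yahoo returned nothing
/// ```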
async fn process_single_company_serial(
    name: String,
    company_info: CompanyInfo,
    existing_entry: Option<CompanyCrossPlatformInfo>,
    pool: &Arc<ChromeDriverPool>,
    shutdown_flag: &Arc<AtomicBool>,
) -> anyhow::Result<Option<CompanyCrossPlatformInfo>> {
    // Check shutdown at start
    if shutdown_flag.load(Ordering::SeqCst) {
        return Ok(None);
    }

    let mut isin_tickers_map: HashMap<String, Vec<String>> =
        existing_entry
            .as_ref()
            .map(|e| e.isin_tickers_map.clone())
            .unwrap_or_default();

    let mut sector = existing_entry.as_ref().and_then(|e| e.sector.clone());
    let mut exchange = existing_entry.as_ref().and_then(|e| e.exchange.clone());

    // Collect unique ISIN-ticker pairs
    let mut unique_isin_ticker_pairs: HashMap<String, Vec<String>> = HashMap::new();

    for figi_infos in company_info.securities.values() {
        for figi_info in figi_infos {
            if !figi_info.isin.is_empty() {
                let tickers = unique_isin_ticker_pairs
                    .entry(figi_info.isin.clone())
                    .or_insert_with(Vec::new);

                if !figi_info.ticker.is_empty() && !tickers.contains(&figi_info.ticker) {
                    tickers.push(figi_info.ticker.clone());
                }
            }
        }
    }

    // Process each ISIN with validation
    for (isin, figi_tickers) in unique_isin_ticker_pairs {
        // Check shutdown before each ISIN
        if shutdown_flag.load(Ordering::SeqCst) {
            return Ok(None);
        }

        let tickers = isin_tickers_map
            .entry(isin.clone())
            .or_insert_with(Vec::new);

        for figi_ticker in figi_tickers {
            if !tickers.contains(&figi_ticker) {
                tickers.push(figi_ticker);
            }
        }

        let has_yahoo_ticker = tickers.iter().any(|t| t.starts_with("YAHOO:"));

        if !has_yahoo_ticker {
            logger::log_info(&format!("Fetching Yahoo details for {} (ISIN: {})", name, isin)).await;

            // Use validated scraping with retry
            match scrape_with_retry_serial(pool, &isin, 3, shutdown_flag).await {
                Ok(Some(details)) => {
                    logger::log_info(&format!(
                        "✓ Found Yahoo ticker {} for ISIN {} (company: {})",
                        details.ticker, isin, name
                    )).await;

                    tickers.push(format!("YAHOO:{}", details.ticker));

                    if sector.is_none() && details.sector.is_some() {
                        sector = details.sector.clone();
                    }

                    if exchange.is_none() && details.exchange.is_some() {
                        exchange = details.exchange.clone();
                    }
                },
                Ok(None) => {
                    logger::log_warn(&format!("◯ No search results for ISIN {} (company: {})", isin, name)).await;
                    tickers.push("YAHOO:NO_RESULTS".to_string());
                },
                Err(e) => {
                    if shutdown_flag.load(Ordering::SeqCst) {
                        return Ok(None);
                    }
                    logger::log_warn(&format!(
                        "✗ Yahoo lookup error for ISIN {} (company: {}): {}",
                        isin, name, e
                    )).await;
                }
            }
        }
    }

    // Final shutdown check
    if shutdown_flag.load(Ordering::SeqCst) {
        return Ok(None);
    }

    if !isin_tickers_map.is_empty() {
        Ok(Some(CompanyCrossPlatformInfo {
            name,
            isin_tickers_map,
            sector,
            exchange,
        }))
    } else {
        Ok(None)
    }
}

/// UPDATED: Scrape with retry for serial processing
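///
/// Uses exponential backoff with jitter: before attempt `n + 1` the delay is
/// `1000 * 2^n` ms plus up to 500 ms of random jitter, so with
/// `max_retries = 3` the waits are roughly 1.0-1.5 s, 2.0-2.5 s, and
/// 4.0-4.5 s before the last error is returned.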
async fn scrape_with_retry_serial(
    pool: &Arc<ChromeDriverPool>,
    isin: &str,
    max_retries: u32,
    shutdown_flag: &Arc<AtomicBool>,
) -> anyhow::Result<Option<YahooCompanyDetails>> {
    let mut retries = 0;

    loop {
        if shutdown_flag.load(Ordering::SeqCst) {
            return Err(anyhow::anyhow!("Aborted due to shutdown"));
        }

        match scrape_company_details_by_isin(pool, isin, shutdown_flag).await {
            Ok(result) => return Ok(result),
            Err(e) => {
                if retries >= max_retries {
                    return Err(e);
                }

                let backoff_ms = 1000 * 2u64.pow(retries);
                let jitter_ms = random_range(0, 500);
                let total_delay = backoff_ms + jitter_ms;

                logger::log_warn(&format!(
                    "Retry {}/{} for ISIN {} after {}ms: {}",
                    retries + 1, max_retries, isin, total_delay, e
                )).await;

                tokio::time::sleep(tokio::time::Duration::from_millis(total_delay)).await;
                retries += 1;
            }
        }
    }
}

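/// Returns the most recent date-stamped (YYYYMMDD) subdirectory of the
/// GLEIF/OpenFIGI map cache, or `None` if the cache directory is missing or
/// contains no such subdirectories.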
async fn find_most_recent_figi_date_dir(paths: &DataPaths) -> anyhow::Result<Option<std::path::PathBuf>> {
    let map_cache_dir = paths.cache_gleif_openfigi_map_dir();

    if !map_cache_dir.exists() {
        return Ok(None);
    }

    let mut entries = tokio::fs::read_dir(&map_cache_dir).await?;
    let mut dates = Vec::new();

    while let Some(entry) = entries.next_entry().await? {
        let path = entry.path();
        if path.is_dir() {
            if let Some(name) = path.file_name().and_then(|n| n.to_str()) {
                if name.len() == 8 && name.chars().all(|c| c.is_numeric()) {
                    dates.push((name.to_string(), path));
                }
            }
        }
    }

    if dates.is_empty() {
        return Ok(None);
    }

    dates.sort_by(|a, b| b.0.cmp(&a.0));
    Ok(Some(dates[0].1.clone()))
}

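/// Result of a `process_batch` run: the field-level changes detected while
/// merging a batch of newly scraped events into the existing event map.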
pub struct ProcessResult {
    pub changes: Vec<CompanyEventChange>,
}

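/// Merges `new_events` into `existing` (keyed by `event_key`) and records what
/// changed.
///
/// If an event's key is already present, field-level diffs are collected via
/// `detect_changes`. If the key is new but an existing event shares the same
/// ticker and date under a different key, the stale entry is removed and, for
/// events dated after `today`, a single "time" change is recorded (e.g. an
/// upcoming event whose time-of-day field changed between scrapes and was
/// therefore re-keyed).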
pub fn process_batch(
    new_events: &[CompanyEvent],
    existing: &mut HashMap<String, CompanyEvent>,
    today: &str,
) -> ProcessResult {
    let mut changes = Vec::new();

    for new in new_events {
        let key = event_key(new);

        if let Some(old) = existing.get(&key) {
            changes.extend(detect_changes(old, new, today));
            existing.insert(key, new.clone());
            continue;
        }

        let date_key = format!("{}|{}", new.ticker, new.date);
        let mut found_old = None;
        for (k, e) in existing.iter() {
            if format!("{}|{}", e.ticker, e.date) == date_key && k != &key {
                found_old = Some((k.clone(), e.clone()));
                break;
            }
        }

        if let Some((old_key, old_event)) = found_old {
            if new.date.as_str() > today {
                changes.push(CompanyEventChange {
                    ticker: new.ticker.clone(),
                    date: new.date.clone(),
                    field_changed: "time".to_string(),
                    old_value: old_event.time.clone(),
                    new_value: new.time.clone(),
                    detected_at: Local::now().format("%Y-%m-%d %H:%M:%S").to_string(),
                });
            }
            existing.remove(&old_key);
        }

        existing.insert(key, new.clone());
    }

    ProcessResult { changes }
}