// src/corporate/update_parallel.rs
// PARALLELIZED VERSION of build_companies_jsonl_streaming
//
// Key improvements:
// - Processes multiple companies concurrently using the ChromeDriverPool
// - Maintains data safety with serialized log writes via channel
// - Respects pool size limits via semaphore
// - All fsync and checkpoint logic preserved

use super::{types::*, yahoo::*};
use crate::util::directories::DataPaths;
use crate::util::logger;
use crate::scraper::webdriver::ChromeDriverPool;

use rand::Rng;
use tokio::sync::mpsc;
use tokio::io::AsyncWriteExt;
use tokio::fs::OpenOptions;
use tokio::time::sleep;
use std::collections::HashMap;
use std::sync::Arc;
use std::sync::atomic::{AtomicBool, Ordering};
use std::time::Duration;
use futures::stream::{FuturesUnordered, StreamExt};
use anyhow::{anyhow, Context, Result};

/// Represents a write command to be serialized through the log writer
enum LogCommand {
    /// Append one company record to the JSONL update log
    Write(CompanyCrossPlatformInfo),
    /// Fold the current state into companies.jsonl via an atomic checkpoint, then stop
    Checkpoint,
    /// Fsync pending writes and stop without checkpointing
    Shutdown,
}

/// Result from processing a single company
struct CompanyProcessResult {
    company: CompanyCrossPlatformInfo,
    /// True when the company already existed before this run (informational;
    /// the writer task recomputes this from its own in-memory map)
    is_update: bool,
}

/// Abort-safe incremental JSONL persistence with atomic checkpoints (PARALLELIZED)
///
/// Implements the data_updating_rule.md specification with concurrent processing:
/// - Append-only JSONL log for all updates
/// - Batched fsync for performance (configurable batch size)
/// - Time-based fsync for safety (max 10 seconds without fsync)
/// - Atomic checkpoints via temp file + rename
/// - Crash recovery by loading checkpoint + replaying log
/// - Partial lines automatically ignored by the .lines() iterator
/// - PARALLEL processing of companies using ChromeDriverPool
/// - Serialized log writes for data safety
///
/// # Parallelization Strategy
///
/// - Multiple companies processed concurrently (limited by pool size)
/// - Yahoo lookups for different companies run in parallel; the ISINs within a
///   single company are processed sequentially
/// - Log writes are serialized through a channel
/// - The pool's semaphore naturally limits concurrency
/// - All fsync and checkpoint logic preserved
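///
/// # Example
///
/// A minimal usage sketch (illustrative only; constructing the `ChromeDriverPool`
/// is project-specific and assumed to have happened elsewhere):
///
/// ```ignore
/// let paths = DataPaths::new(".")?;
/// let shutdown_flag = Arc::new(AtomicBool::new(false));
/// // `pool: Arc<ChromeDriverPool>` built by the caller
/// let count = build_companies_jsonl_streaming_parallel(&paths, &pool, &shutdown_flag).await?;
/// println!("Processed {} companies", count);
/// ```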
pub async fn build_companies_jsonl_streaming_parallel(
    paths: &DataPaths,
    pool: &Arc<ChromeDriverPool>,
    shutdown_flag: &Arc<AtomicBool>,
) -> anyhow::Result<usize> {
    // Configuration constants
    const CHECKPOINT_INTERVAL: usize = 50;
    const FSYNC_BATCH_SIZE: usize = 10;
    const FSYNC_INTERVAL_SECS: u64 = 10;
    const CONCURRENCY_LIMIT: usize = 100; // Max companies processing at once

    let path = DataPaths::new(".")?;
    let corporate_path = path.data_dir().join("corporate").join("by_name");
    let securities_path = corporate_path.join("common_stocks.json");

    if !securities_path.exists() {
        logger::log_warn("No common_stocks.json found").await;
        return Ok(0);
    }

    let content = tokio::fs::read_to_string(securities_path).await?;
    let securities: HashMap<String, CompanyInfo> = serde_json::from_str(&content)?;

    let companies_path = paths.data_dir().join("companies.jsonl");
    let log_path = paths.data_dir().join("companies_updates.log");

    if let Some(parent) = companies_path.parent() {
        tokio::fs::create_dir_all(parent).await?;
    }

    // === RECOVERY PHASE: Load checkpoint + replay log ===
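    // Each line of companies.jsonl and of the update log is one serialized
    // CompanyCrossPlatformInfo. Illustrative shape only (actual field names
    // depend on the struct's serde attributes):
    // {"name":"Acme Corp","isin_tickers_map":{"US0000000001":["ACME","YAHOO:ACME"]},"sector":"Technology","exchange":"NASDAQ"}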
    let mut existing_companies: HashMap<String, CompanyCrossPlatformInfo> = HashMap::new();
    let mut processed_names: std::collections::HashSet<String> = std::collections::HashSet::new();

    if companies_path.exists() {
        logger::log_info("Loading checkpoint from companies.jsonl...").await;
        let existing_content = tokio::fs::read_to_string(&companies_path).await?;

        for line in existing_content.lines() {
            if line.trim().is_empty() {
                continue;
            }

            match serde_json::from_str::<CompanyCrossPlatformInfo>(line) {
                Ok(company) => {
                    processed_names.insert(company.name.clone());
                    existing_companies.insert(company.name.clone(), company);
                }
                Err(e) => {
                    logger::log_warn(&format!("Skipping invalid checkpoint line: {}", e)).await;
                }
            }
        }
        logger::log_info(&format!("Loaded checkpoint with {} companies", existing_companies.len())).await;
    }

    if log_path.exists() {
        logger::log_info("Replaying update log...").await;
        let log_content = tokio::fs::read_to_string(&log_path).await?;
        let mut replayed = 0;

        for line in log_content.lines() {
            if line.trim().is_empty() {
                continue;
            }

            match serde_json::from_str::<CompanyCrossPlatformInfo>(line) {
                Ok(company) => {
                    processed_names.insert(company.name.clone());
                    existing_companies.insert(company.name.clone(), company);
                    replayed += 1;
                }
                Err(e) => {
                    logger::log_warn(&format!("Skipping invalid log line: {}", e)).await;
                }
            }
        }
        if replayed > 0 {
            logger::log_info(&format!("Replayed {} updates from log", replayed)).await;
        }
    }

    // === SETUP LOG WRITER TASK ===
    // This task serializes all log writes to maintain data safety
    let (write_tx, mut write_rx) = mpsc::channel::<LogCommand>(1000);

    let log_file_init = OpenOptions::new()
        .create(true)
        .append(true)
        .open(&log_path)
        .await?;

    let companies_path_clone = companies_path.clone();
    let log_path_clone = log_path.clone();
    let existing_companies_writer = Arc::new(tokio::sync::Mutex::new(existing_companies.clone()));

    let writer_task = tokio::spawn(async move {
        let mut log_file = log_file_init; // Move into the task
        let mut writes_since_fsync = 0;
        let mut last_fsync = std::time::Instant::now();
        let mut updates_since_checkpoint = 0;
        let mut count = 0;
        let mut new_count = 0;
        let mut updated_count = 0;

        while let Some(cmd) = write_rx.recv().await {
            match cmd {
                LogCommand::Write(company) => {
                    // Write to log
                    let line = serde_json::to_string(&company).unwrap();
                    if let Err(e) = log_file.write_all(line.as_bytes()).await {
                        logger::log_error(&format!("Failed to write to log: {}", e)).await;
                        break;
                    }
                    if let Err(e) = log_file.write_all(b"\n").await {
                        logger::log_error(&format!("Failed to write newline: {}", e)).await;
                        break;
                    }

                    writes_since_fsync += 1;
                    updates_since_checkpoint += 1;
                    count += 1;
                    // Update in-memory state
                    let mut existing_companies = existing_companies_writer.lock().await;
                    let is_update = existing_companies.contains_key(&company.name);
                    existing_companies.insert(company.name.clone(), company);
                    drop(existing_companies);

                    if is_update {
                        updated_count += 1;
                    } else {
                        new_count += 1;
                    }

                    // Batched + time-based fsync
                    let should_fsync = writes_since_fsync >= FSYNC_BATCH_SIZE
                        || last_fsync.elapsed().as_secs() >= FSYNC_INTERVAL_SECS;

                    if should_fsync {
                        if let Err(e) = log_file.flush().await {
                            logger::log_error(&format!("Failed to flush: {}", e)).await;
                            break;
                        }
                        if let Err(e) = log_file.sync_data().await {
                            logger::log_error(&format!("Failed to fsync: {}", e)).await;
                            break;
                        }
                        writes_since_fsync = 0;
                        last_fsync = std::time::Instant::now();
                    }

                    // Periodic checkpoint
                    if updates_since_checkpoint >= CHECKPOINT_INTERVAL {
                        // Fsync pending writes before checkpoint
                        if writes_since_fsync > 0 {
                            let _ = log_file.flush().await;
                            let _ = log_file.sync_data().await;
                            writes_since_fsync = 0;
                            last_fsync = std::time::Instant::now();
                        }

                        logger::log_info(&format!("Creating checkpoint at {} companies...", count)).await;

                        let checkpoint_tmp = companies_path_clone.with_extension("jsonl.tmp");
                        let mut checkpoint_file = match tokio::fs::File::create(&checkpoint_tmp).await {
                            Ok(f) => f,
                            Err(e) => {
                                logger::log_error(&format!("Failed to create checkpoint: {}", e)).await;
                                break;
                            }
                        };

                        let existing_companies = existing_companies_writer.lock().await;
                        for company in existing_companies.values() {
                            let line = serde_json::to_string(company).unwrap();
                            let _ = checkpoint_file.write_all(line.as_bytes()).await;
                            let _ = checkpoint_file.write_all(b"\n").await;
                        }
                        drop(existing_companies);

                        let _ = checkpoint_file.flush().await;
                        let _ = checkpoint_file.sync_all().await;
                        drop(checkpoint_file);

                        let _ = tokio::fs::rename(&checkpoint_tmp, &companies_path_clone).await;

                        // Clear log and reopen
                        drop(log_file);
                        let _ = tokio::fs::remove_file(&log_path_clone).await;

                        // Reopen log file
                        match OpenOptions::new()
                            .create(true)
                            .append(true)
                            .open(&log_path_clone)
                            .await {
                            Ok(new_file) => {
                                log_file = new_file;
                                updates_since_checkpoint = 0;
                                logger::log_info("✓ Checkpoint created and log cleared").await;
                            }
                            Err(e) => {
                                logger::log_error(&format!("Failed to reopen log: {}", e)).await;
                                break;
                            }
                        }
                    }

                    if count % 10 == 0 {
                        logger::log_info(&format!("Progress: {} companies ({} new, {} updated)", count, new_count, updated_count)).await;
                    }
                },
                LogCommand::Checkpoint => {
                    // Force checkpoint - this is the final checkpoint before shutdown
                    if writes_since_fsync > 0 {
                        let _ = log_file.flush().await;
                        let _ = log_file.sync_data().await;
                    }

                    logger::log_info("Creating final checkpoint...").await;
                    let checkpoint_tmp = companies_path_clone.with_extension("jsonl.tmp");
                    if let Ok(mut checkpoint_file) = tokio::fs::File::create(&checkpoint_tmp).await {
                        let existing_companies = existing_companies_writer.lock().await;
                        for company in existing_companies.values() {
                            let line = serde_json::to_string(company).unwrap();
                            let _ = checkpoint_file.write_all(line.as_bytes()).await;
                            let _ = checkpoint_file.write_all(b"\n").await;
                        }
                        drop(existing_companies);

                        let _ = checkpoint_file.flush().await;
                        let _ = checkpoint_file.sync_all().await;
                        drop(checkpoint_file);
                        let _ = tokio::fs::rename(&checkpoint_tmp, &companies_path_clone).await;

                        // Clean up log file after final checkpoint
                        drop(log_file);
                        let _ = tokio::fs::remove_file(&log_path_clone).await;

                        logger::log_info("✓ Final checkpoint created").await;
                    }
                    // After final checkpoint, exit the loop
                    break;
                },
                LogCommand::Shutdown => {
                    // Fsync any pending writes before exit
                    if writes_since_fsync > 0 {
                        logger::log_info(&format!("Fsyncing {} pending writes...", writes_since_fsync)).await;
                        let _ = log_file.flush().await;
                        let _ = log_file.sync_data().await;
                    }
                    break;
                }
            }
        }

        (count, new_count, updated_count)
    });

    // === PARALLEL COMPANY PROCESSING ===
    logger::log_info(&format!("Processing companies in parallel (max {} concurrent, pool size: {})",
        CONCURRENCY_LIMIT, pool.get_number_of_instances())).await;

    let pool = pool.clone();
    let shutdown_flag = shutdown_flag.clone();

    let mut processing_tasks = FuturesUnordered::new();
    let mut pending_companies = Vec::new();

    // Collect companies to process
    for (name, company_info) in securities.iter() {
        if processed_names.contains(name) {
            continue;
        }
        pending_companies.push((name.clone(), company_info.clone()));
    }

    logger::log_info(&format!("Found {} companies to process", pending_companies.len())).await;

    // Process companies in chunks to limit memory usage
    let chunk_size = CONCURRENCY_LIMIT;
    let mut processed = 0;
    // Set when the writer task goes away so no new work is launched
    let mut writer_failed = false;

    for chunk in pending_companies.chunks(chunk_size) {
        if shutdown_flag.load(Ordering::SeqCst) || writer_failed {
            break;
        }

        // Launch tasks for this chunk
        for (name, company_info) in chunk {
            let name = name.clone();
            let company_info = company_info.clone();
            let pool = pool.clone();
            let shutdown_flag = shutdown_flag.clone();
            let existing_entry = existing_companies.get(&name).cloned();

            let task = tokio::spawn(async move {
                process_single_company(
                    name,
                    company_info,
                    existing_entry,
                    &pool,
                    &shutdown_flag
                ).await
            });

            processing_tasks.push(task);
        }

        // Wait for chunk to complete
        while let Some(result) = processing_tasks.next().await {
            match result {
                Ok(Ok(Some(company_result))) => {
                    // Send to writer
                    if write_tx.send(LogCommand::Write(company_result.company)).await.is_err() {
                        logger::log_error("Writer task died, stopping processing").await;
                        writer_failed = true;
                        break;
                    }
                    processed += 1;
                }
                Ok(Ok(None)) => {
                    // Company had no ISINs or was skipped
                    processed += 1;
                }
                Ok(Err(e)) => {
                    logger::log_warn(&format!("Company processing error: {}", e)).await;
                    processed += 1;
                }
                Err(e) => {
                    logger::log_error(&format!("Task panic: {}", e)).await;
                    processed += 1;
                }
            }

            if shutdown_flag.load(Ordering::SeqCst) {
                break;
            }
        }

        if shutdown_flag.load(Ordering::SeqCst) {
            break;
        }
    }

    // Signal the writer to finish. On a clean run, request a final checkpoint so the
    // log is folded into companies.jsonl; on shutdown (or writer failure), just let the
    // writer fsync pending writes and exit, and the log will be replayed on the next run.
    if shutdown_flag.load(Ordering::SeqCst) || writer_failed {
        let _ = write_tx.send(LogCommand::Shutdown).await;
    } else {
        let _ = write_tx.send(LogCommand::Checkpoint).await;
    }
    drop(write_tx);

    // Wait for writer to finish
    let (final_count, final_new, final_updated) = writer_task.await
        .unwrap_or((0, 0, 0));

    logger::log_info(&format!(
        "Completed: {} total companies ({} new, {} updated)",
        final_count, final_new, final_updated
    )).await;

    Ok(final_count)
}

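/// Retry wrapper around `scrape_company_details_by_isin`.
///
/// Retries up to `max_retries` times with exponential backoff (1s, 2s, 4s, ...)
/// plus up to 500ms of random jitter between attempts; the last error is returned
/// once the retry budget is exhausted.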
async fn scrape_with_retry(
    pool: &Arc<ChromeDriverPool>,
    isin: &str,
    max_retries: u32,
) -> Result<Option<YahooCompanyDetails>> {
    let mut retries = 0;

    loop {
        match scrape_company_details_by_isin(pool, isin).await {
            Ok(result) => return Ok(result),
            Err(e) => {
                if retries >= max_retries {
                    return Err(e);
                }

                let backoff_ms = 1000 * 2u64.pow(retries); // 1s, 2s, 4s, 8s
                let jitter_ms = rand::rng().random_range(0..500); // +0-500ms jitter
                let total_delay = backoff_ms + jitter_ms;

                logger::log_warn(&format!(
                    "Retry {}/{} for ISIN {} after {}ms: {}",
                    retries + 1, max_retries, isin, total_delay, e
                )).await;

                sleep(Duration::from_millis(total_delay)).await;
                retries += 1;
            }
        }
    }
}

/// Process a single company: fetch Yahoo data for its ISINs
async fn process_single_company(
    name: String,
    company_info: CompanyInfo,
    existing_entry: Option<CompanyCrossPlatformInfo>,
    pool: &Arc<ChromeDriverPool>,
    shutdown_flag: &Arc<AtomicBool>,
) -> anyhow::Result<Option<CompanyProcessResult>> {
    let is_update = existing_entry.is_some();

    let mut isin_tickers_map: HashMap<String, Vec<String>> =
        existing_entry
            .as_ref()
            .map(|e| e.isin_tickers_map.clone())
            .unwrap_or_default();

    let mut sector = existing_entry.as_ref().and_then(|e| e.sector.clone());
    let mut exchange = existing_entry.as_ref().and_then(|e| e.exchange.clone());

    // Collect unique ISIN-ticker pairs
    let mut unique_isin_ticker_pairs: HashMap<String, Vec<String>> = HashMap::new();

    for figi_infos in company_info.securities.values() {
        for figi_info in figi_infos {
            if !figi_info.isin.is_empty() {
                let tickers = unique_isin_ticker_pairs
                    .entry(figi_info.isin.clone())
                    .or_insert_with(Vec::new);

                if !figi_info.ticker.is_empty() && !tickers.contains(&figi_info.ticker) {
                    tickers.push(figi_info.ticker.clone());
                }
            }
        }
    }

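    // Ticker lists mix FIGI tickers with Yahoo tickers prefixed "YAHOO:";
    // "YAHOO:NO_RESULTS" marks ISINs whose Yahoo lookup returned nothing so the
    // lookup is not repeated.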
    // Process each ISIN (these Yahoo lookups will happen in parallel across companies)
    for (isin, figi_tickers) in unique_isin_ticker_pairs {
        if shutdown_flag.load(Ordering::SeqCst) {
            break;
        }

        let tickers = isin_tickers_map
            .entry(isin.clone())
            .or_insert_with(Vec::new);

        for figi_ticker in figi_tickers {
            if !tickers.contains(&figi_ticker) {
                tickers.push(figi_ticker);
            }
        }

        let has_yahoo_ticker = tickers.iter().any(|t| t.starts_with("YAHOO:"));

        if !has_yahoo_ticker && !shutdown_flag.load(Ordering::SeqCst) {
            logger::log_info(&format!("Fetching Yahoo details for {} (ISIN: {})", name, isin)).await;
            match scrape_with_retry(pool, &isin, 3).await {
                Ok(Some(details)) => {
                    logger::log_info(&format!("✓ Found Yahoo ticker {} for ISIN {}", details.ticker, isin)).await;

                    tickers.push(format!("YAHOO:{}", details.ticker));

                    if sector.is_none() && details.sector.is_some() {
                        sector = details.sector.clone();
                        logger::log_info(&format!("  Sector: {}", details.sector.as_ref().unwrap())).await;
                    }

                    if exchange.is_none() && details.exchange.is_some() {
                        exchange = details.exchange.clone();
                        logger::log_info(&format!("  Exchange: {}", details.exchange.as_ref().unwrap())).await;
                    }
                },
                Ok(None) => {
                    logger::log_warn(&format!("◯ No search results for ISIN {}", isin)).await;
                    tickers.push("YAHOO:NO_RESULTS".to_string());
                },
                Err(e) => {
                    if shutdown_flag.load(Ordering::SeqCst) {
                        break;
                    }
                    logger::log_warn(&format!("✗ Yahoo lookup error for ISIN {}: {}", isin, e)).await;
                }
            }
        }
    }

    if shutdown_flag.load(Ordering::SeqCst) {
        return Ok(None);
    }

    if !isin_tickers_map.is_empty() {
        let company_entry = CompanyCrossPlatformInfo {
            name: name.clone(),
            isin_tickers_map,
            sector,
            exchange,
        };

        Ok(Some(CompanyProcessResult {
            company: company_entry,
            is_update,
        }))
    } else {
        Ok(None)
    }
}