removed serial data scraping for Yahoo tickers
@@ -93,423 +93,6 @@ pub async fn run_full_update(
     Ok(())
 }
 
-/// UPDATED: Serial version with validation (kept for compatibility/debugging)
-///
-/// This is the non-parallel version that processes companies sequentially.
-/// Updated with the same validation and shutdown checks as the parallel version.
-///
-/// Use this for:
-/// - Debugging issues with specific companies
-/// - Environments where parallel processing isn't desired
-/// - Testing validation logic without concurrency complexity
-async fn build_companies_jsonl_streaming_serial(
-    paths: &DataPaths,
-    pool: &Arc<ChromeDriverPool>,
-    shutdown_flag: &Arc<AtomicBool>,
-) -> anyhow::Result<usize> {
-    // Configuration constants
-    const CHECKPOINT_INTERVAL: usize = 50;
-    const FSYNC_BATCH_SIZE: usize = 10;
-    const FSYNC_INTERVAL_SECS: u64 = 10;
-
-    let path = DataPaths::new(".")?;
-    let corporate_path = path.data_dir().join("corporate").join("by_name");
-    let securities_path = corporate_path.join("common_stocks.json");
-
-    if !securities_path.exists() {
-        logger::log_warn("No common_stocks.json found").await;
-        return Ok(0);
-    }
-
-    let content = tokio::fs::read_to_string(securities_path).await?;
-    let securities: HashMap<String, CompanyInfo> = serde_json::from_str(&content)?;
-
-    let companies_path = paths.data_dir().join("companies.jsonl");
-    let log_path = paths.data_dir().join("companies_updates.log");
-
-    if let Some(parent) = companies_path.parent() {
-        tokio::fs::create_dir_all(parent).await?;
-    }
-
-    // === RECOVERY PHASE: Load checkpoint + replay log ===
-    let mut existing_companies: HashMap<String, CompanyCrossPlatformInfo> = HashMap::new();
-    let mut processed_names: std::collections::HashSet<String> = std::collections::HashSet::new();
-
-    if companies_path.exists() {
-        logger::log_info("Loading checkpoint from companies.jsonl...").await;
-        let existing_content = tokio::fs::read_to_string(&companies_path).await?;
-
-        for line in existing_content.lines() {
-            if line.trim().is_empty() {
-                continue;
-            }
-
-            match serde_json::from_str::<CompanyCrossPlatformInfo>(line) {
-                Ok(company) => {
-                    processed_names.insert(company.name.clone());
-                    existing_companies.insert(company.name.clone(), company);
-                }
-                Err(e) => {
-                    logger::log_warn(&format!("Skipping invalid checkpoint line: {}", e)).await;
-                }
-            }
-        }
-        logger::log_info(&format!("Loaded checkpoint with {} companies", existing_companies.len())).await;
-    }
-
-    if log_path.exists() {
-        logger::log_info("Replaying update log...").await;
-        let log_content = tokio::fs::read_to_string(&log_path).await?;
-        let mut replayed = 0;
-
-        for line in log_content.lines() {
-            if line.trim().is_empty() {
-                continue;
-            }
-
-            match serde_json::from_str::<CompanyCrossPlatformInfo>(line) {
-                Ok(company) => {
-                    processed_names.insert(company.name.clone());
-                    existing_companies.insert(company.name.clone(), company);
-                    replayed += 1;
-                }
-                Err(e) => {
-                    logger::log_warn(&format!("Skipping invalid log line: {}", e)).await;
-                }
-            }
-        }
-        if replayed > 0 {
-            logger::log_info(&format!("Replayed {} updates from log", replayed)).await;
-        }
-    }
-
-    // === OPEN LOG FILE ===
-    use tokio::fs::OpenOptions;
-    use tokio::io::AsyncWriteExt;
-
-    let mut log_file = OpenOptions::new()
-        .create(true)
-        .append(true)
-        .open(&log_path)
-        .await?;
-
-    let mut writes_since_fsync = 0;
-    let mut last_fsync = std::time::Instant::now();
-    let mut updates_since_checkpoint = 0;
-    let mut count = 0;
-    let mut new_count = 0;
-    let mut updated_count = 0;
-
-    logger::log_info(&format!("Processing {} companies sequentially...", securities.len())).await;
-
-    // === PROCESS COMPANIES SEQUENTIALLY ===
-    for (name, company_info) in securities.clone() {
-        // Check shutdown before each company
-        if shutdown_flag.load(Ordering::SeqCst) {
-            logger::log_warn(&format!(
-                "Shutdown detected at company: {} (progress: {}/{})",
-                name, count, securities.len()
-            )).await;
-            break;
-        }
-
-        let existing_entry = existing_companies.get(&name).cloned();
-        let is_update = existing_entry.is_some();
-
-        // Process company with validation
-        match process_single_company_serial(
-            name.clone(),
-            company_info,
-            existing_entry,
-            pool,
-            shutdown_flag,
-        ).await {
-            Ok(Some(company_entry)) => {
-                // Write to log
-                let line = serde_json::to_string(&company_entry)?;
-                log_file.write_all(line.as_bytes()).await?;
-                log_file.write_all(b"\n").await?;
-
-                writes_since_fsync += 1;
-
-                // Batched + time-based fsync
-                let should_fsync = writes_since_fsync >= FSYNC_BATCH_SIZE
-                    || last_fsync.elapsed().as_secs() >= FSYNC_INTERVAL_SECS;
-
-                if should_fsync {
-                    log_file.flush().await?;
-                    log_file.sync_data().await?;
-                    writes_since_fsync = 0;
-                    last_fsync = std::time::Instant::now();
-                }
-
-                // Update in-memory state
-                processed_names.insert(name.clone());
-                existing_companies.insert(name.clone(), company_entry);
-
-                count += 1;
-                updates_since_checkpoint += 1;
-
-                if is_update {
-                    updated_count += 1;
-                } else {
-                    new_count += 1;
-                }
-
-                // Periodic checkpoint
-                if updates_since_checkpoint >= CHECKPOINT_INTERVAL {
-                    if writes_since_fsync > 0 {
-                        log_file.flush().await?;
-                        log_file.sync_data().await?;
-                        writes_since_fsync = 0;
-                        last_fsync = std::time::Instant::now();
-                    }
-
-                    logger::log_info(&format!("Creating checkpoint at {} companies...", count)).await;
-
-                    let checkpoint_tmp = companies_path.with_extension("jsonl.tmp");
-                    let mut checkpoint_file = tokio::fs::File::create(&checkpoint_tmp).await?;
-
-                    for company in existing_companies.values() {
-                        let line = serde_json::to_string(company)?;
-                        checkpoint_file.write_all(line.as_bytes()).await?;
-                        checkpoint_file.write_all(b"\n").await?;
-                    }
-
-                    checkpoint_file.flush().await?;
-                    checkpoint_file.sync_all().await?;
-                    drop(checkpoint_file);
-
-                    tokio::fs::rename(&checkpoint_tmp, &companies_path).await?;
-
-                    drop(log_file);
-                    tokio::fs::remove_file(&log_path).await.ok();
-                    log_file = OpenOptions::new()
-                        .create(true)
-                        .append(true)
-                        .open(&log_path)
-                        .await?;
-
-                    updates_since_checkpoint = 0;
-                    logger::log_info("✓ Checkpoint created and log cleared").await;
-                }
-
-                if count % 10 == 0 {
-                    logger::log_info(&format!(
-                        "Progress: {} companies ({} new, {} updated)",
-                        count, new_count, updated_count
-                    )).await;
-                }
-            }
-            Ok(None) => {
-                // Company had no ISINs or was skipped
-                logger::log_info(&format!("Skipped company: {} (no ISINs)", name)).await;
-            }
-            Err(e) => {
-                logger::log_warn(&format!("Error processing company {}: {}", name, e)).await;
-            }
-        }
-
-        // Time-based fsync
-        if writes_since_fsync > 0 && last_fsync.elapsed().as_secs() >= FSYNC_INTERVAL_SECS {
-            log_file.flush().await?;
-            log_file.sync_data().await?;
-            writes_since_fsync = 0;
-            last_fsync = std::time::Instant::now();
-        }
-    }
-
-    // === FSYNC PENDING WRITES ===
-    if writes_since_fsync > 0 {
-        logger::log_info(&format!("Fsyncing {} pending writes...", writes_since_fsync)).await;
-        log_file.flush().await?;
-        log_file.sync_data().await?;
-        logger::log_info("✓ Pending writes saved").await;
-    }
-
-    // === FINAL CHECKPOINT ===
-    if !shutdown_flag.load(Ordering::SeqCst) && updates_since_checkpoint > 0 {
-        logger::log_info("Creating final checkpoint...").await;
-
-        let checkpoint_tmp = companies_path.with_extension("jsonl.tmp");
-        let mut checkpoint_file = tokio::fs::File::create(&checkpoint_tmp).await?;
-
-        for company in existing_companies.values() {
-            let line = serde_json::to_string(company)?;
-            checkpoint_file.write_all(line.as_bytes()).await?;
-            checkpoint_file.write_all(b"\n").await?;
-        }
-
-        checkpoint_file.flush().await?;
-        checkpoint_file.sync_all().await?;
-        drop(checkpoint_file);
-
-        tokio::fs::rename(&checkpoint_tmp, &companies_path).await?;
-
-        drop(log_file);
-        tokio::fs::remove_file(&log_path).await.ok();
-
-        logger::log_info("✓ Final checkpoint created").await;
-    }
-
-    logger::log_info(&format!(
-        "Completed: {} total companies ({} new, {} updated)",
-        count, new_count, updated_count
-    )).await;
-
-    Ok(count)
-}
-
-/// UPDATED: Process single company serially with validation
-async fn process_single_company_serial(
-    name: String,
-    company_info: CompanyInfo,
-    existing_entry: Option<CompanyCrossPlatformInfo>,
-    pool: &Arc<ChromeDriverPool>,
-    shutdown_flag: &Arc<AtomicBool>,
-) -> anyhow::Result<Option<CompanyCrossPlatformInfo>> {
-    // Check shutdown at start
-    if shutdown_flag.load(Ordering::SeqCst) {
-        return Ok(None);
-    }
-
-    let mut isin_tickers_map: HashMap<String, Vec<String>> =
-        existing_entry
-            .as_ref()
-            .map(|e| e.isin_tickers_map.clone())
-            .unwrap_or_default();
-
-    let mut sector = existing_entry.as_ref().and_then(|e| e.sector.clone());
-    let mut exchange = existing_entry.as_ref().and_then(|e| e.exchange.clone());
-
-    // Collect unique ISIN-ticker pairs
-    let mut unique_isin_ticker_pairs: HashMap<String, Vec<String>> = HashMap::new();
-
-    for figi_infos in company_info.securities.values() {
-        for figi_info in figi_infos {
-            if !figi_info.isin.is_empty() {
-                let tickers = unique_isin_ticker_pairs
-                    .entry(figi_info.isin.clone())
-                    .or_insert_with(Vec::new);
-
-                if !figi_info.ticker.is_empty() && !tickers.contains(&figi_info.ticker) {
-                    tickers.push(figi_info.ticker.clone());
-                }
-            }
-        }
-    }
-
-    // Process each ISIN with validation
-    for (isin, figi_tickers) in unique_isin_ticker_pairs {
-        // Check shutdown before each ISIN
-        if shutdown_flag.load(Ordering::SeqCst) {
-            return Ok(None);
-        }
-
-        let tickers = isin_tickers_map
-            .entry(isin.clone())
-            .or_insert_with(Vec::new);
-
-        for figi_ticker in figi_tickers {
-            if !tickers.contains(&figi_ticker) {
-                tickers.push(figi_ticker);
-            }
-        }
-
-        let has_yahoo_ticker = tickers.iter().any(|t| t.starts_with("YAHOO:"));
-
-        if !has_yahoo_ticker {
-            logger::log_info(&format!("Fetching Yahoo details for {} (ISIN: {})", name, isin)).await;
-
-            // Use validated scraping with retry
-            match scrape_with_retry_serial(pool, &isin, 3, shutdown_flag).await {
-                Ok(Some(details)) => {
-                    logger::log_info(&format!(
-                        "✓ Found Yahoo ticker {} for ISIN {} (company: {})",
-                        details.ticker, isin, name
-                    )).await;
-
-                    tickers.push(format!("YAHOO:{}", details.ticker));
-
-                    if sector.is_none() && details.sector.is_some() {
-                        sector = details.sector.clone();
-                    }
-
-                    if exchange.is_none() && details.exchange.is_some() {
-                        exchange = details.exchange.clone();
-                    }
-                },
-                Ok(None) => {
-                    logger::log_warn(&format!("◯ No search results for ISIN {} (company: {})", isin, name)).await;
-                    tickers.push("YAHOO:NO_RESULTS".to_string());
-                },
-                Err(e) => {
-                    if shutdown_flag.load(Ordering::SeqCst) {
-                        return Ok(None);
-                    }
-                    logger::log_warn(&format!(
-                        "✗ Yahoo lookup error for ISIN {} (company: {}): {}",
-                        isin, name, e
-                    )).await;
-                }
-            }
-        }
-    }
-
-    // Final shutdown check
-    if shutdown_flag.load(Ordering::SeqCst) {
-        return Ok(None);
-    }
-
-    if !isin_tickers_map.is_empty() {
-        Ok(Some(CompanyCrossPlatformInfo {
-            name,
-            isin_tickers_map,
-            sector,
-            exchange,
-        }))
-    } else {
-        Ok(None)
-    }
-}
-
-/// UPDATED: Scrape with retry for serial processing
-async fn scrape_with_retry_serial(
-    pool: &Arc<ChromeDriverPool>,
-    isin: &str,
-    max_retries: u32,
-    shutdown_flag: &Arc<AtomicBool>,
-) -> anyhow::Result<Option<YahooCompanyDetails>> {
-    let mut retries = 0;
-
-    loop {
-        if shutdown_flag.load(Ordering::SeqCst) {
-            return Err(anyhow::anyhow!("Aborted due to shutdown"));
-        }
-
-        match scrape_company_details_by_isin(pool, isin, shutdown_flag).await {
-            Ok(result) => return Ok(result),
-            Err(e) => {
-                if retries >= max_retries {
-                    return Err(e);
-                }
-
-                let backoff_ms = 1000 * 2u64.pow(retries);
-                let jitter_ms = random_range(0, 500);
-                let total_delay = backoff_ms + jitter_ms;
-
-                logger::log_warn(&format!(
-                    "Retry {}/{} for ISIN {} after {}ms: {}",
-                    retries + 1, max_retries, isin, total_delay, e
-                )).await;
-
-                tokio::time::sleep(tokio::time::Duration::from_millis(total_delay)).await;
-                retries += 1;
-            }
-        }
-    }
-}
 
 async fn find_most_recent_figi_date_dir(paths: &DataPaths) -> anyhow::Result<Option<std::path::PathBuf>> {
     let map_cache_dir = paths.cache_gleif_openfigi_map_dir();
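
Note: the serial path removed above persisted progress as a write-ahead log (companies_updates.log, one JSON record per line, fsynced in batches) that is periodically compacted into a checkpoint (companies.jsonl) via a temp file plus atomic rename, after which the log is cleared. A minimal synchronous sketch of that recovery pattern, using std::fs and tab-separated key-value records in place of the crate's tokio/serde_json types; recover and checkpoint_and_clear_log are hypothetical names, not functions from this repo:

use std::collections::HashMap;
use std::fs::{self, File, OpenOptions};
use std::io::{BufWriter, Write};
use std::path::Path;

fn recover(checkpoint: &Path, log: &Path) -> std::io::Result<HashMap<String, String>> {
    let mut state = HashMap::new();
    // 1. Load the last durable checkpoint, skipping lines that don't parse.
    if checkpoint.exists() {
        for line in fs::read_to_string(checkpoint)?.lines() {
            if let Some((k, v)) = line.split_once('\t') {
                state.insert(k.to_string(), v.to_string());
            }
        }
    }
    // 2. Replay the append-only log written after that checkpoint; last write wins.
    if log.exists() {
        for line in fs::read_to_string(log)?.lines() {
            if let Some((k, v)) = line.split_once('\t') {
                state.insert(k.to_string(), v.to_string());
            }
        }
    }
    Ok(state)
}

fn checkpoint_and_clear_log(
    state: &HashMap<String, String>,
    checkpoint: &Path,
    log: &Path,
) -> std::io::Result<()> {
    // Write to a temp file, fsync, then atomically rename over the checkpoint,
    // so a crash never leaves a half-written checkpoint behind.
    let tmp = checkpoint.with_extension("tmp");
    let mut w = BufWriter::new(File::create(&tmp)?);
    for (k, v) in state {
        writeln!(w, "{}\t{}", k, v)?;
    }
    w.flush()?;
    w.get_ref().sync_all()?;
    fs::rename(&tmp, checkpoint)?;
    // Only after the superseding checkpoint is durable is it safe to clear the log.
    fs::remove_file(log).ok();
    OpenOptions::new().create(true).append(true).open(log)?;
    Ok(())
}

The ordering is what makes a crash at any point recoverable: the rename is atomic, and the log is removed only after the checkpoint that supersedes it has been synced.
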
@@ -12,7 +12,6 @@ use crate::util::directories::DataPaths;
 use crate::util::logger;
 use crate::scraper::webdriver::ChromeDriverPool;
 
-use rand::Rng;
 use tokio::sync::mpsc;
 use tokio::io::AsyncWriteExt;
 use tokio::fs::OpenOptions;
@@ -22,7 +21,7 @@ use std::sync::Arc;
 use std::sync::atomic::{AtomicBool, Ordering};
 use std::time::Duration;
 use futures::stream::{FuturesUnordered, StreamExt};
-use anyhow::{anyhow, Context, Result};
+use anyhow::{anyhow, Result};
 
 /// Represents a write command to be serialized through the log writer
 enum LogCommand {
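
The batched fsync policy from the removed writer loop can be isolated for clarity: sync when either FSYNC_BATCH_SIZE writes have accumulated or FSYNC_INTERVAL_SECS seconds have passed since the last sync, whichever comes first. FsyncPolicy is a hypothetical wrapper; only the two constants mirror the removed code:

use std::time::{Duration, Instant};

/// Hypothetical wrapper around the removed loop's fsync bookkeeping.
struct FsyncPolicy {
    writes_since_fsync: usize,
    last_fsync: Instant,
}

impl FsyncPolicy {
    const FSYNC_BATCH_SIZE: usize = 10; // mirrors the removed constant
    const FSYNC_INTERVAL_SECS: u64 = 10; // mirrors the removed constant

    fn new() -> Self {
        Self { writes_since_fsync: 0, last_fsync: Instant::now() }
    }

    /// Record one appended log line; true means the caller should fsync now.
    fn record_write(&mut self) -> bool {
        self.writes_since_fsync += 1;
        self.writes_since_fsync >= Self::FSYNC_BATCH_SIZE
            || self.last_fsync.elapsed() >= Duration::from_secs(Self::FSYNC_INTERVAL_SECS)
    }

    /// Reset the counters after a successful flush + sync_data.
    fn synced(&mut self) {
        self.writes_since_fsync = 0;
        self.last_fsync = Instant::now();
    }
}

This bounds the window of lost records after a crash to roughly ten writes or ten seconds while amortizing the cost of each fsync.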
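
Finally, the retry schedule in the removed scrape_with_retry_serial was exponential backoff doubling from one second, plus up to 500 ms of uniform jitter. A sketch, assuming the rand crate (consistent with the removed use rand::Rng import) in place of the crate's random_range helper:

use rand::Rng;

/// Delay before retry attempt `retries` (0-based): 1s, 2s, 4s, ... plus
/// up to 500 ms of jitter so concurrent clients don't retry in lockstep.
fn retry_delay_ms(retries: u32) -> u64 {
    let backoff_ms = 1000 * 2u64.pow(retries);
    let jitter_ms = rand::thread_rng().gen_range(0..500u64);
    backoff_ms + jitter_ms
}

fn main() {
    for r in 0..3 {
        println!("attempt {} waits ~{} ms", r + 1, retry_delay_ms(r));
    }
}

With max_retries = 3, the worst case waits roughly 1 + 2 + 4 seconds (plus jitter) before the error is propagated to the caller.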