Compare commits

..

33 Commits

SHA1 Message Date
eff1412c0f removed claudes md 2026-01-15 00:23:29 +01:00
75ab1969c7 added cross compatiblity between shutdown flag and state entries 2026-01-15 00:22:55 +01:00
f4b20f824d removed crossplatformcompany from types 2026-01-14 14:49:00 +01:00
93fbefc9d4 removed id creation on scrape 2026-01-14 14:28:16 +01:00
4ea0c78d3d added ids for companies 2026-01-12 23:03:01 +01:00
1d025a04ce updated securities directory 2026-01-12 22:23:34 +01:00
98e1bca12f moved helper functions into helpers.rs 2026-01-12 22:06:13 +01:00
29d8f1d89e moved structs to types.rs 2026-01-12 18:50:44 +01:00
c0c9bc0ed9 added bond extraction from figi 2026-01-12 15:58:06 +01:00
659757482d öi 2026-01-12 01:01:19 +01:00
bd74f36f4c added integrity dir for set data collection; one state.jsonl 2026-01-11 16:57:36 +01:00
e6f8393660 merged enriching functions into one module 2026-01-11 14:24:18 +01:00
aff340ee2f migrated checkpoint handling in integrity.rs to ssot principle 2026-01-11 13:05:31 +01:00
0487c2ec49 changed file names for openfigi 2026-01-11 12:21:10 +01:00
04f4b0d0c4 added integrity check to openfigi functions 2026-01-11 00:06:25 +01:00
6f05dc8c99 added integrity check to forex and exchange collection functiosn 2026-01-10 19:46:21 +01:00
ac1345798d added integrity check to cleanse functions 2026-01-10 18:42:39 +01:00
766eb803f1 added integrity check to enrichment functions 2026-01-10 17:40:16 +01:00
151c96e35f working code :) 2026-01-10 15:11:06 +01:00
ae1876b014 cleaned up main 2026-01-10 00:30:59 +01:00
c86d828940 cleaned up main 2026-01-10 00:30:42 +01:00
c6d301d434 added helper functions to reduce bloat 2026-01-09 21:24:18 +01:00
ba841248f0 cleaned up update.rs eco and corp 2026-01-09 19:52:26 +01:00
8dd75f7bdf added yahoo exchange extraction 2026-01-09 19:09:42 +01:00
ea128f6187 added options chart enrichment 2026-01-08 11:35:25 +01:00
1720716144 added event enrichment 2026-01-08 00:35:10 +01:00
f9ce5bad99 fixed yahoo api calls for cleansing low profile data 2026-01-06 00:15:57 +01:00
fc25f32cbc fixed yahoo api calls for cleansing low profile data 2026-01-06 00:15:46 +01:00
3d16475b79 readded yahoo 2026-01-05 17:00:42 +01:00
86944a9c58 cleaned yahoo hits 2025-12-24 00:00:21 +01:00
f9f09d0291 added working hard reset 2025-12-23 15:07:40 +01:00
fb0876309f added hard reset for navigation timeout after 3 hours 2025-12-22 00:31:28 +01:00
c01b47000f removed serial data scraping for yahoo tickers 2025-12-19 16:58:22 +01:00
51 changed files with 11569 additions and 3861 deletions

View File

@@ -14,8 +14,8 @@ CORPORATE_START_DATE=2010-01-01
 # How far into the future we scrape economic events (in months)
 ECONOMIC_LOOKAHEAD_MONTHS=3
-# Maximum number of parallel scraping tasks (default: 10)
-MAX_PARALLEL_TASKS=10
+# Maximum number of parallel scraping tasks (default: 4)
+MAX_PARALLEL_INSTANCES=10
 # ===== VPN ROTATION (ProtonVPN Integration) =====
 # Enable automatic VPN rotation between sessions?
@@ -38,3 +38,5 @@ TASKS_PER_VPN_SESSION=50
 MAX_REQUESTS_PER_SESSION=25
 MIN_REQUEST_INTERVAL_MS=300
 MAX_RETRY_ATTEMPTS=3
+PROXY_INSTANCES_PER_CERTIFICATE=2

1
.gitignore vendored
View File

@@ -35,6 +35,7 @@ target/
 **/*.log
 **/*.ovpn
 **/*.tmp
+**/*.txt
 #/economic_events*
 #/economic_event_changes*

59
Cargo.lock generated
View File

@@ -2465,9 +2465,9 @@ checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d"
 [[package]]
 name = "rustix"
-version = "1.1.2"
+version = "1.1.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cd15f8a2c5551a84d56efdc1cd049089e409ac19a3072d5037a17fd70719ff3e"
+checksum = "146c9e247ccc180c1f61615433868c99f3de3ae256a30a43b49f67c2d9171f34"
 dependencies = [
  "bitflags",
  "errno",
@@ -2743,6 +2743,15 @@ dependencies = [
  "serde_core",
 ]
+[[package]]
+name = "serde_spanned"
+version = "1.0.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f8bbf91e5a4d6315eee45e704372590b30e260ee83af6639d64557f51b067776"
+dependencies = [
+ "serde_core",
+]
 [[package]]
 name = "serde_urlencoded"
 version = "0.7.1"
@@ -2974,9 +2983,9 @@ checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369"
 [[package]]
 name = "tempfile"
-version = "3.23.0"
+version = "3.24.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2d31c77bdf42a745371d260a26ca7163f1e0924b64afa0b688e61b5a9fa02f16"
+checksum = "655da9c7eb6305c55742045d5a8d2037996d61d8de95806335c7c86ce0f82e9c"
 dependencies = [
  "fastrand",
  "getrandom 0.3.4",
@@ -3213,10 +3222,25 @@ dependencies = [
 ]
 [[package]]
-name = "toml_datetime"
-version = "0.7.3"
+name = "toml"
+version = "0.9.11+spec-1.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f2cdb639ebbc97961c51720f858597f7f24c4fc295327923af55b74c3c724533"
+checksum = "f3afc9a848309fe1aaffaed6e1546a7a14de1f935dc9d89d32afd9a44bab7c46"
+dependencies = [
+ "indexmap",
+ "serde_core",
+ "serde_spanned",
+ "toml_datetime",
+ "toml_parser",
+ "toml_writer",
+ "winnow",
+]
+[[package]]
+name = "toml_datetime"
+version = "0.7.5+spec-1.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "92e1cfed4a3038bc5a127e35a2d360f145e1f4b971b551a2ba5fd7aedf7e1347"
 dependencies = [
  "serde_core",
 ]
@@ -3235,13 +3259,19 @@ dependencies = [
 [[package]]
 name = "toml_parser"
-version = "1.0.4"
+version = "1.0.6+spec-1.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c0cbe268d35bdb4bb5a56a2de88d0ad0eb70af5384a99d648cd4b3d04039800e"
+checksum = "a3198b4b0a8e11f09dd03e133c0280504d0801269e9afa46362ffde1cbeebf44"
 dependencies = [
  "winnow",
 ]
+[[package]]
+name = "toml_writer"
+version = "1.0.6+spec-1.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ab16f14aed21ee8bfd8ec22513f7287cd4a91aa92e44edfe2c17ddd004e92607"
 [[package]]
 name = "tower"
 version = "0.5.2"
@@ -3454,6 +3484,12 @@ dependencies = [
  "serde",
 ]
+[[package]]
+name = "urlencoding"
+version = "2.1.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "daf8dba3b7eb870caf1ddeed7bc9d2a049f3cfdfae7cb521b087cc33ae4c49da"
 [[package]]
 name = "utf-8"
 version = "0.7.6"
@@ -3472,6 +3508,7 @@ version = "1.18.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2f87b8aa10b915a06587d0dec516c282ff295b475d94abf425d62b57710070a2"
 dependencies = [
+ "getrandom 0.3.4",
  "js-sys",
  "wasm-bindgen",
 ]
@@ -3626,11 +3663,15 @@ dependencies = [
  "scraper",
  "serde",
  "serde_json",
+ "sha2",
  "tokio",
  "tokio-tungstenite 0.21.0",
+ "toml",
  "tracing",
  "tracing-subscriber",
  "url",
+ "urlencoding",
+ "uuid",
  "walkdir",
  "yfinance-rs",
  "zip",

View File

@@ -17,11 +17,12 @@ categories = ["finance", "data-structures", "asynchronous"]
 tokio = { version = "1.38", features = ["full"] }
 # Web scraping & HTTP
-reqwest = { version = "0.12", features = ["json", "gzip", "brotli", "deflate", "blocking"] }
+reqwest = { version = "0.12", features = ["json", "gzip", "brotli", "deflate", "blocking", "socks", "cookies"] }
 scraper = "0.19" # HTML parsing for Yahoo earnings pages
 fantoccini = { version = "0.20", features = ["rustls-tls"] } # Headless Chrome for finanzen.net
 yfinance-rs = "0.7.2"
 url = "2.5.7"
+urlencoding = "2.1"
 # Serialization
 serde = { version = "1.0", features = ["derive"] }
@@ -39,6 +40,7 @@ rand = "0.9.2"
 # Environment handling
 dotenvy = "0.15"
+toml = "0.9.8"
 # Date & time
 chrono = { version = "0.4", features = ["serde"] }
@@ -58,3 +60,10 @@ rayon = "1.10" # optional: for parallel price downloads
 # Web server for dashboard
 axum = { version = "0.7", features = ["ws"] }
 tokio-tungstenite = "0.21" # For WebSocket support
+# tests
+#tempfile = "3.24.0"
+# data integrity
+sha2 = "0.10.9"
+uuid = { version = "1.0", features = ["v4", "v7"] }
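
For reference, a minimal sketch (not part of this diff) of how the newly added `sha2` and `uuid` crates are typically used for the "data integrity" work mentioned above; the helper names are illustrative only.

```rust
use sha2::{Digest, Sha256};
use uuid::Uuid;

/// Hex-encoded SHA-256 over raw file bytes (illustrative helper, not from the repo).
fn content_hash(bytes: &[u8]) -> String {
    let digest = Sha256::digest(bytes);
    digest.iter().map(|b| format!("{:02x}", b)).collect()
}

/// Random identifier for a state entry; the manifest also enables the v7
/// feature for time-ordered UUIDs.
fn new_entry_id() -> Uuid {
    Uuid::new_v4()
}
```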

View File

@@ -249,3 +249,7 @@ Der Scraper unterstützt 52 Länder und Regionen (siehe `countries.json`), darun
 https://chromedriver.storage.googleapis.com/index.html
 https://googlechromelabs.github.io/chrome-for-testing/
+## Graphviz.org Download
+https://graphviz.org/download/

View File

@@ -0,0 +1,28 @@
digraph Dependencies {
rankdir=LR;
node [shape=box];
"yahoo_options_enrichment_complete" [label="yahoo_options_enrichment_complete
Options data enriched for all companies"];
"yahoo_events_enrichment_complete" [label="yahoo_events_enrichment_complete
Corporate events enriched for all companies"];
"yahoo_companies_cleansed_no_data" [label="yahoo_companies_cleansed_no_data
Companies cleansed of data with no Yahoo results"];
"yahoo_chart_enrichment_complete" [label="yahoo_chart_enrichment_complete
Chart data enriched for all companies"];
"enrichment_group" [label="enrichment_group
Yahoo exchanges collected and validated"];
"yahoo_companies_cleansed_low_profile" [label="yahoo_companies_cleansed_low_profile
Companies cleansed of low profile (insufficient market cap/price data)"];
"lei_figi_mapping_complete" [label="lei_figi_mapping_complete
LEI-to-FIGI mappings from OpenFIGI API"];
"securities_data_complete" [label="securities_data_complete
Securities data built from FIGI mappings"];
"yahoo_options_enrichment_complete" -> "yahoo_companies_cleansed_low_profile" [label="via group enrichment_group"];
"yahoo_events_enrichment_complete" -> "yahoo_companies_cleansed_low_profile" [label="via group enrichment_group"];
"yahoo_companies_cleansed_no_data" -> "securities_data_complete";
"yahoo_chart_enrichment_complete" -> "yahoo_companies_cleansed_low_profile" [label="via group enrichment_group"];
"yahoo_companies_cleansed_low_profile" -> "yahoo_companies_cleansed_no_data";
"securities_data_complete" -> "lei_figi_mapping_complete";
}

View File

@@ -0,0 +1,61 @@
# checkpoint_dependencies.toml - Complete configuration
# ============================================================================
# COLLECTION STAGE (No dependencies)
# ============================================================================
[checkpoints.lei_figi_mapping_complete]
description = "LEI-to-FIGI mappings from OpenFIGI API"
depends_on = []
[checkpoints.securities_data_complete]
description = "Securities data built from FIGI mappings"
depends_on = ["lei_figi_mapping_complete"]
# ============================================================================
# CLEANSING STAGE (Depends on collection)
# ============================================================================
[checkpoints.yahoo_companies_cleansed_no_data]
description = "Companies cleansed of data with no Yahoo results"
depends_on = ["securities_data_complete"]
[checkpoints.yahoo_companies_cleansed_low_profile]
description = "Companies cleansed of low profile (insufficient market cap/price data)"
depends_on = ["yahoo_companies_cleansed_no_data"]
# ============================================================================
# ENRICHMENT GROUP (All depend on cleansed companies)
# ============================================================================
[groups.enrichment_group]
description = "Yahoo Finance enrichment functions"
members = [
"yahoo_events_enrichment_complete",
"yahoo_options_enrichment_complete",
"yahoo_chart_enrichment_complete"
]
depends_on = ["yahoo_companies_cleansed_low_profile"]
[checkpoints.yahoo_events_enrichment_complete]
description = "Corporate events enriched for all companies"
depends_on = []
group = "enrichment_group"
[checkpoints.yahoo_options_enrichment_complete]
description = "Options data enriched for all companies"
depends_on = []
group = "enrichment_group"
[checkpoints.yahoo_chart_enrichment_complete]
description = "Chart data enriched for all companies"
depends_on = []
group = "enrichment_group"
# ============================================================================
# SECURITIES PROCESSING (Depends on LEI mapping)
# ============================================================================
[checkpoints.enrichment_group]
description = "Yahoo exchanges collected and validated"
depends_on = []
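
A hedged deserialization sketch for the layout above, using the `toml` crate added in Cargo.toml; the struct names are illustrative and not taken from the repository, only the TOML keys (`checkpoints`, `groups`, `description`, `depends_on`, `members`, `group`) come from the file itself.

```rust
use serde::Deserialize;
use std::collections::HashMap;

#[derive(Debug, Deserialize)]
struct CheckpointConfig {
    #[serde(default)]
    checkpoints: HashMap<String, Checkpoint>,
    #[serde(default)]
    groups: HashMap<String, Group>,
}

#[derive(Debug, Deserialize)]
struct Checkpoint {
    description: String,
    #[serde(default)]
    depends_on: Vec<String>,
    group: Option<String>,
}

#[derive(Debug, Deserialize)]
struct Group {
    description: String,
    members: Vec<String>,
    #[serde(default)]
    depends_on: Vec<String>,
}

// Parse the file content into the config structure above.
fn load_config(text: &str) -> Result<CheckpointConfig, toml::de::Error> {
    toml::from_str(text)
}
```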

View File

@@ -27,6 +27,26 @@ pub struct Config {
 #[serde(default = "default_max_retry_attempts")]
 pub max_retry_attempts: u32,
+#[serde(default = "default_proxy_instances_per_certificate")]
+pub proxy_instances_per_certificate: Option<usize>,
+}
+impl Default for Config {
+fn default() -> Self {
+Self {
+economic_start_date: "2007-02-13".to_string(),
+corporate_start_date: "2010-01-01".to_string(),
+economic_lookahead_months: 3,
+max_parallel_instances: default_max_parallel_instances(),
+max_tasks_per_instance: 0,
+max_requests_per_session: default_max_requests_per_session(),
+min_request_interval_ms: default_min_request_interval_ms(),
+max_retry_attempts: default_max_retry_attempts(),
+enable_vpn_rotation: false,
+proxy_instances_per_certificate: default_proxy_instances_per_certificate(),
+}
+}
 }
 fn default_enable_vpn_rotation() -> bool {
@@ -47,24 +67,10 @@ fn default_min_request_interval_ms() -> u64 {
 fn default_max_retry_attempts() -> u32 { 3 }
-impl Default for Config {
-fn default() -> Self {
-Self {
-economic_start_date: "2007-02-13".to_string(),
-corporate_start_date: "2010-01-01".to_string(),
-economic_lookahead_months: 3,
-max_parallel_instances: default_max_parallel_instances(),
-max_tasks_per_instance: 0,
-max_requests_per_session: default_max_requests_per_session(),
-min_request_interval_ms: default_min_request_interval_ms(),
-max_retry_attempts: default_max_retry_attempts(),
-enable_vpn_rotation: false,
-}
-}
+fn default_proxy_instances_per_certificate() -> Option<usize> {
+Some(1)
 }
 impl Config {
 /// Loads configuration from environment variables using dotenvy.
 pub fn load() -> Result<Self> {
@@ -112,6 +118,11 @@ impl Config {
 .parse()
 .context("Failed to parse MAX_RETRY_ATTEMPTS as u32")?;
+let proxy_instances_per_certificate: Option<usize> = match dotenvy::var("PROXY_INSTANCES_PER_CERTIFICATE") {
+Ok(val) => Some(val.parse().context("Failed to parse PROXY_INSTANCES_PER_CERTIFICATE as usize")?),
+Err(_) => Some(1),
+};
 Ok(Self {
 economic_start_date,
 corporate_start_date,
@@ -122,6 +133,7 @@ impl Config {
 max_requests_per_session,
 min_request_interval_ms,
 max_retry_attempts,
+proxy_instances_per_certificate,
 })
 }
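
A small usage sketch (assumed calling context, not from the diff): reading the new setting after `Config::load()`; the module path for `Config` depends on the crate layout and is omitted here.

```rust
use anyhow::Result;

// Load the config and read the new proxy setting, defaulting to 1 per certificate.
fn proxy_instances_per_certificate() -> Result<usize> {
    let cfg = Config::load()?;
    Ok(cfg.proxy_instances_per_certificate.unwrap_or(1))
}
```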

View File

@@ -1,195 +0,0 @@
// src/corporate/aggregation.rs
use super::types::CompanyPrice;
use super::storage::*;
use crate::util::directories::DataPaths;
use tokio::fs;
use std::collections::HashMap;
#[derive(Debug)]
struct DayData {
sources: Vec<(CompanyPrice, String)>, // (price, source_ticker)
total_volume: u64,
vwap: f64,
open: f64,
high: f64,
low: f64,
close: f64,
}
/// Aggregate price data from multiple exchanges, converting all to USD
pub async fn aggregate_best_price_data(paths: &DataPaths, lei: &str) -> anyhow::Result<()> {
let company_dir = get_company_dir(paths, lei);
for timeframe in ["daily", "5min"].iter() {
let source_dir = company_dir.join(timeframe);
if !source_dir.exists() {
continue;
}
let mut all_prices: Vec<(CompanyPrice, String)> = Vec::new();
let mut by_date_time: HashMap<String, DayData> = HashMap::new();
// Load all sources with their ticker names
let mut entries = tokio::fs::read_dir(&source_dir).await?;
let mut source_count = 0;
let mut sources_used = std::collections::HashSet::new();
while let Some(entry) = entries.next_entry().await? {
let source_dir_path = entry.path();
if !source_dir_path.is_dir() { continue; }
let source_ticker = source_dir_path
.file_name()
.and_then(|n| n.to_str())
.unwrap_or("unknown")
.to_string();
let prices_path = source_dir_path.join("prices.json");
if !prices_path.exists() { continue; }
let content = tokio::fs::read_to_string(&prices_path).await?;
let mut prices: Vec<CompanyPrice> = serde_json::from_str(&content)?;
if !prices.is_empty() {
sources_used.insert(source_ticker.clone());
source_count += 1;
}
for price in prices {
all_prices.push((price, source_ticker.clone()));
}
}
if all_prices.is_empty() {
continue;
}
println!(" Aggregating from {} exchanges: {}",
sources_used.len(),
sources_used.iter()
.map(|s| s.as_str())
.collect::<Vec<_>>()
.join(", ")
);
// Group by date + time (for 5min) or just date
for (p, source) in all_prices {
let key = if timeframe == &"5min" && !p.time.is_empty() {
format!("{}_{}", p.date, p.time)
} else {
p.date.clone()
};
// Convert to USD immediately
let usd_rate = super::fx::get_usd_rate(&p.currency).await.unwrap_or(1.0);
let mut p_usd = p.clone();
p_usd.open *= usd_rate;
p_usd.high *= usd_rate;
p_usd.low *= usd_rate;
p_usd.close *= usd_rate;
p_usd.adj_close *= usd_rate;
p_usd.currency = "USD".to_string();
let entry = by_date_time.entry(key.clone()).or_insert(DayData {
sources: vec![],
total_volume: 0,
vwap: 0.0,
open: p_usd.open,
high: p_usd.high,
low: p_usd.low,
close: p_usd.close,
});
let volume = p.volume.max(1); // avoid div0
let vwap_contrib = p_usd.close * volume as f64;
entry.sources.push((p_usd.clone(), source));
entry.total_volume += volume;
entry.vwap += vwap_contrib;
// Use first open, last close, max high, min low
if entry.sources.len() == 1 {
entry.open = p_usd.open;
}
entry.close = p_usd.close;
entry.high = entry.high.max(p_usd.high);
entry.low = entry.low.min(p_usd.low);
}
// Finalize aggregated data
let mut aggregated: Vec<CompanyPrice> = Vec::new();
for (key, data) in by_date_time {
let vwap = data.vwap / data.total_volume as f64;
let (date, time) = if key.contains('_') {
let parts: Vec<&str> = key.split('_').collect();
(parts[0].to_string(), parts[1].to_string())
} else {
(key, "".to_string())
};
// Track which exchange contributed most volume
let best_source = data.sources.iter()
.max_by_key(|(p, _)| p.volume)
.map(|(_, src)| src.clone())
.unwrap_or_else(|| "unknown".to_string());
aggregated.push(CompanyPrice {
ticker: format!("{lei}@agg"), // Mark as aggregated
date,
time,
open: data.open,
high: data.high,
low: data.low,
close: data.close,
adj_close: vwap,
volume: data.total_volume,
currency: "USD".to_string(),
});
}
aggregated.sort_by_key(|p| (p.date.clone(), p.time.clone()));
// Save aggregated result
let agg_dir = company_dir.join("aggregated").join(timeframe);
fs::create_dir_all(&agg_dir).await?;
let path = agg_dir.join("prices.json");
fs::write(&path, serde_json::to_string_pretty(&aggregated)?).await?;
// Save aggregation metadata
let meta = AggregationMetadata {
lei: lei.to_string(), // ← CHANGE THIS
timeframe: timeframe.to_string(),
sources: sources_used.into_iter().collect(),
total_bars: aggregated.len(),
date_range: (
aggregated.first().map(|p| p.date.clone()).unwrap_or_default(),
aggregated.last().map(|p| p.date.clone()).unwrap_or_default(),
),
aggregated_at: chrono::Local::now().format("%Y-%m-%d %H:%M:%S").to_string(),
};
let meta_path = agg_dir.join("metadata.json");
fs::write(&meta_path, serde_json::to_string_pretty(&meta)?).await?;
println!("{} {} bars from {} sources (USD)",
aggregated.len(),
timeframe,
source_count
);
}
Ok(())
}
#[derive(Debug, serde::Serialize, serde::Deserialize)]
struct AggregationMetadata {
lei: String,
timeframe: String,
sources: Vec<String>,
total_bars: usize,
date_range: (String, String),
aggregated_at: String,
}

View File

@@ -1,346 +0,0 @@
// src/corporate/atomic_writer.rs
//
// Atomic JSONL writer that prevents partial/corrupted results from being written
use anyhow::Result;
use serde::Serialize;
use std::collections::HashMap;
use std::path::PathBuf;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Arc;
use tokio::fs::{File, OpenOptions};
use tokio::io::AsyncWriteExt;
use tokio::sync::mpsc;
/// Command to write or validate data
#[derive(Debug)]
pub enum WriteCommand<T> {
/// Stage a result for writing (held in memory until committed)
Stage { id: String, data: T },
/// Commit staged result to disk (atomic write)
Commit { id: String },
/// Rollback staged result (discard without writing)
Rollback { id: String },
/// Commit all pending staged results and flush
CommitAll,
/// Shutdown writer gracefully (only commits valid staged results)
Shutdown,
}
/// Result of a write operation
#[derive(Debug)]
pub struct WriteResult {
pub id: String,
pub success: bool,
pub error: Option<String>,
}
/// Atomic writer that prevents partial results from being written
pub struct AtomicJsonlWriter<T> {
file: File,
staged: HashMap<String, T>,
committed_count: usize,
rollback_count: usize,
}
impl<T: Serialize + Clone> AtomicJsonlWriter<T> {
pub async fn new(path: PathBuf) -> Result<Self> {
// Ensure parent directory exists
if let Some(parent) = path.parent() {
tokio::fs::create_dir_all(parent).await?;
}
let file = OpenOptions::new()
.create(true)
.append(true)
.open(&path)
.await?;
crate::util::logger::log_info(&format!(
"Atomic writer initialized: {:?}",
path
)).await;
Ok(Self {
file,
staged: HashMap::new(),
committed_count: 0,
rollback_count: 0,
})
}
/// Stage data for writing (held in memory, not yet written)
pub async fn stage(&mut self, id: String, data: T) {
crate::util::logger::log_info(&format!(
"Staging result for: {} (total staged: {})",
id,
self.staged.len() + 1
)).await;
self.staged.insert(id, data);
}
/// Commit a staged result to disk (atomic write)
pub async fn commit(&mut self, id: &str) -> Result<()> {
if let Some(data) = self.staged.remove(id) {
// Serialize to JSON
let json_line = serde_json::to_string(&data)?;
// Write atomically (single syscall)
self.file.write_all(json_line.as_bytes()).await?;
self.file.write_all(b"\n").await?;
self.file.flush().await?;
self.committed_count += 1;
crate::util::logger::log_info(&format!(
"✓ Committed result for: {} (total committed: {})",
id, self.committed_count
)).await;
Ok(())
} else {
Err(anyhow::anyhow!("No staged result found for id: {}", id))
}
}
/// Rollback a staged result (discard without writing)
pub async fn rollback(&mut self, id: &str) {
if self.staged.remove(id).is_some() {
self.rollback_count += 1;
crate::util::logger::log_warn(&format!(
"⚠ Rolled back result for: {} (total rollbacks: {})",
id, self.rollback_count
)).await;
}
}
/// Commit all staged results
pub async fn commit_all(&mut self) -> Result<usize> {
let ids: Vec<String> = self.staged.keys().cloned().collect();
let mut committed = 0;
for id in ids {
if let Ok(()) = self.commit(&id).await {
committed += 1;
}
}
Ok(committed)
}
/// Rollback all staged results (discard everything)
pub async fn rollback_all(&mut self) -> usize {
let count = self.staged.len();
self.staged.clear();
self.rollback_count += count;
crate::util::logger::log_warn(&format!(
"⚠ Rolled back all {} staged results",
count
)).await;
count
}
/// Get statistics
pub fn stats(&self) -> WriterStats {
WriterStats {
staged_count: self.staged.len(),
committed_count: self.committed_count,
rollback_count: self.rollback_count,
}
}
}
#[derive(Debug, Clone)]
pub struct WriterStats {
pub staged_count: usize,
pub committed_count: usize,
pub rollback_count: usize,
}
/// Managed writer service that runs in its own task
pub struct AtomicWriterService<T> {
rx: mpsc::UnboundedReceiver<WriteCommand<T>>,
writer: AtomicJsonlWriter<T>,
shutdown_flag: Arc<AtomicBool>,
}
impl<T: Serialize + Clone> AtomicWriterService<T> {
pub async fn new(
path: PathBuf,
rx: mpsc::UnboundedReceiver<WriteCommand<T>>,
shutdown_flag: Arc<AtomicBool>,
) -> Result<Self> {
let writer = AtomicJsonlWriter::new(path).await?;
Ok(Self {
rx,
writer,
shutdown_flag,
})
}
/// Main service loop
pub async fn run(mut self) {
crate::util::logger::log_info("Atomic writer service started").await;
while let Some(cmd) = self.rx.recv().await {
// Check for shutdown flag
if self.shutdown_flag.load(Ordering::SeqCst) {
crate::util::logger::log_warn(
"Shutdown detected - processing only Commit/Rollback commands"
).await;
// Only process commit/rollback commands during shutdown
match cmd {
WriteCommand::Commit { id } => {
if let Err(e) = self.writer.commit(&id).await {
crate::util::logger::log_error(&format!(
"Failed to commit {}: {}",
id, e
)).await;
}
}
WriteCommand::Rollback { id } => {
self.writer.rollback(&id).await;
}
WriteCommand::CommitAll => {
match self.writer.commit_all().await {
Ok(count) => {
crate::util::logger::log_info(&format!(
"Committed {} results during shutdown",
count
)).await;
}
Err(e) => {
crate::util::logger::log_error(&format!(
"Failed to commit all: {}",
e
)).await;
}
}
}
WriteCommand::Shutdown => break,
_ => {
// Ignore Stage commands during shutdown
crate::util::logger::log_warn(
"Ignoring new Stage command during shutdown"
).await;
}
}
continue;
}
// Normal operation
match cmd {
WriteCommand::Stage { id, data } => {
self.writer.stage(id, data).await;
}
WriteCommand::Commit { id } => {
if let Err(e) = self.writer.commit(&id).await {
crate::util::logger::log_error(&format!(
"Failed to commit {}: {}",
id, e
)).await;
}
}
WriteCommand::Rollback { id } => {
self.writer.rollback(&id).await;
}
WriteCommand::CommitAll => {
match self.writer.commit_all().await {
Ok(count) => {
crate::util::logger::log_info(&format!(
"Committed all {} staged results",
count
)).await;
}
Err(e) => {
crate::util::logger::log_error(&format!(
"Failed to commit all: {}",
e
)).await;
}
}
}
WriteCommand::Shutdown => break,
}
}
// Final shutdown - rollback any remaining staged items
let stats = self.writer.stats();
if stats.staged_count > 0 {
crate::util::logger::log_warn(&format!(
"⚠ Shutdown with {} uncommitted results - rolling back",
stats.staged_count
)).await;
self.writer.rollback_all().await;
}
crate::util::logger::log_info(&format!(
"Atomic writer service stopped. Final stats: {} committed, {} rolled back",
stats.committed_count,
stats.rollback_count
)).await;
}
}
/// Handle for sending write commands
#[derive(Clone)]
pub struct AtomicWriterHandle<T> {
tx: mpsc::UnboundedSender<WriteCommand<T>>,
}
impl<T> AtomicWriterHandle<T> {
pub fn new(tx: mpsc::UnboundedSender<WriteCommand<T>>) -> Self {
Self { tx }
}
/// Stage data for writing (does not write immediately)
pub fn stage(&self, id: String, data: T) {
let _ = self.tx.send(WriteCommand::Stage { id, data });
}
/// Commit staged data to disk
pub fn commit(&self, id: String) {
let _ = self.tx.send(WriteCommand::Commit { id });
}
/// Rollback staged data (discard)
pub fn rollback(&self, id: String) {
let _ = self.tx.send(WriteCommand::Rollback { id });
}
/// Commit all staged data
pub fn commit_all(&self) {
let _ = self.tx.send(WriteCommand::CommitAll);
}
/// Shutdown writer gracefully
pub fn shutdown(&self) {
let _ = self.tx.send(WriteCommand::Shutdown);
}
}
/// Create atomic writer service
pub async fn create_atomic_writer<T: Serialize + Clone + Send + 'static>(
path: PathBuf,
shutdown_flag: Arc<AtomicBool>,
) -> Result<(AtomicWriterHandle<T>, tokio::task::JoinHandle<()>)> {
let (tx, rx) = mpsc::unbounded_channel();
let service = AtomicWriterService::new(path, rx, shutdown_flag).await?;
let handle = tokio::spawn(async move {
service.run().await;
});
Ok((AtomicWriterHandle::new(tx), handle))
}

View File

@@ -0,0 +1,273 @@
// src/corporate/bond_processing.rs
// Bond-specific processing logic for corporate and government bonds
use super::types::*;
/// Parse bond details from ticker and security description
///
/// Examples:
/// - "WTFC 4.3 01/12/26 0003" -> coupon: 4.3, maturity: 2026-01-12
/// - "SLOVAK 1.5225 05/10/28 4Y" -> coupon: 1.5225, maturity: 2028-05-10
/// - "SEK Float 06/30/34" -> floating rate, maturity: 2034-06-30
/// - "GGB 0 10/15/42" -> zero coupon, maturity: 2042-10-15
pub fn parse_bond_details(ticker: &str, security_description: &str) -> BondDetails {
let mut details = BondDetails {
coupon_rate: None,
maturity_date: None,
is_floating: false,
is_zero_coupon: false,
tenor_years: None,
series_identifier: None,
};
// Check for floating rate - look for "Float", " F ", "V0" patterns
if ticker.contains("Float") || ticker.contains(" F ") || ticker.contains(" V0 ")
|| security_description.contains("Float") {
details.is_floating = true;
}
// Parse coupon rate if not floating
if !details.is_floating {
if let Some(coupon) = extract_coupon_rate(ticker, security_description) {
details.coupon_rate = Some(coupon);
details.is_zero_coupon = coupon == 0.0;
}
}
// Parse maturity date
if let Some(maturity) = extract_maturity_date(ticker, security_description) {
details.maturity_date = Some(maturity.clone());
// Calculate tenor (simplified - just extract year)
if let Some(year_str) = maturity.split('-').next() {
if let Ok(mat_year) = year_str.parse::<i32>() {
use chrono::Datelike;
let current_year = chrono::Utc::now().year(); // derive from the clock instead of hardcoding
let years_to_maturity = (mat_year - current_year).max(0) as u32;
details.tenor_years = Some(years_to_maturity);
}
}
}
// Extract series identifier
details.series_identifier = extract_series_identifier(ticker);
details
}
/// Extract coupon rate from ticker/description
/// Handles: "4.3", "1.5225", "12 1/2" (fractional), "0"
fn extract_coupon_rate(ticker: &str, description: &str) -> Option<f64> {
let text = format!("{} {}", ticker, description);
// Pattern 1: Fractional rates like "12 1/2" -> 12.5
if let Some(frac_result) = parse_fractional_coupon(&text) {
return Some(frac_result);
}
// Pattern 2: Decimal rates like "4.3" or "1.5225"
// Look for number followed by space and date pattern
let parts: Vec<&str> = text.split_whitespace().collect();
for i in 0..parts.len() {
if let Ok(rate) = parts[i].parse::<f64>() {
// Sanity check: coupon rates are typically 0-20%
if rate >= 0.0 && rate <= 20.0 {
// Make sure it's before a date-like pattern
if i + 1 < parts.len() {
let next = parts[i + 1];
if next.contains('/') || next.len() >= 8 {
return Some(rate);
}
}
}
}
}
None
}
/// Parse fractional coupon like "12 1/2" -> 12.5
fn parse_fractional_coupon(text: &str) -> Option<f64> {
let parts: Vec<&str> = text.split_whitespace().collect();
for i in 0..parts.len().saturating_sub(1) {
// Check if current part is a number
if let Ok(whole) = parts[i].parse::<f64>() {
// Check if next part is a fraction like "1/2"
if let Some(slash_pos) = parts[i + 1].find('/') {
let frac_str = parts[i + 1];
let num_str = &frac_str[..slash_pos];
let den_str = &frac_str[slash_pos + 1..];
if let (Ok(num), Ok(den)) = (num_str.parse::<f64>(), den_str.parse::<f64>()) {
if den != 0.0 {
return Some(whole + num / den);
}
}
}
}
}
None
}
/// Extract maturity date from ticker/description
/// Handles: "01/12/26", "05/10/28", "06/30/2034"
fn extract_maturity_date(ticker: &str, description: &str) -> Option<String> {
let text = format!("{} {}", ticker, description);
// Look for MM/DD/YY or MM/DD/YYYY patterns
let parts: Vec<&str> = text.split_whitespace().collect();
for part in parts {
if let Some(date) = parse_date_pattern(part) {
return Some(date);
}
}
None
}
/// Parse various date formats to YYYY-MM-DD
fn parse_date_pattern(s: &str) -> Option<String> {
let slash_count = s.matches('/').count();
if slash_count != 2 {
return None;
}
let parts: Vec<&str> = s.split('/').collect();
if parts.len() != 3 {
return None;
}
let month = parts[0];
let day = parts[1];
let year_part = parts[2];
// Parse year - could be 2 or 4 digits
let year = if year_part.len() == 2 {
if let Ok(yy) = year_part.parse::<u32>() {
// Assume 20xx for values <= 50, 19xx for > 50
if yy <= 50 {
format!("{}", 2000 + yy)
} else {
format!("{}", 1900 + yy)
}
} else {
return None;
}
} else if year_part.len() == 4 {
year_part.to_string()
} else {
return None;
};
// Validate month and day
if let (Ok(m), Ok(d)) = (month.parse::<u32>(), day.parse::<u32>()) {
if m >= 1 && m <= 12 && d >= 1 && d <= 31 {
return Some(format!("{}-{:02}-{:02}", year, m, d));
}
}
None
}
/// Extract series identifier (tokens after the date)
/// Examples: "0003", "4Y", "144A", "REGS", "MTN", "PSI", "CD"
fn extract_series_identifier(ticker: &str) -> Option<String> {
let parts: Vec<&str> = ticker.split_whitespace().collect();
// Look for date pattern, then take what comes after
for i in 0..parts.len() {
if parts[i].contains('/') && parts[i].matches('/').count() == 2 {
// Found date, check if there's something after
if i + 1 < parts.len() {
return Some(parts[i + 1].to_string());
}
}
}
None
}
/// Classify government issuer type
pub fn classify_government_issuer(name: &str) -> String {
let name_lower = name.to_lowercase();
// Sovereign nations
if name_lower.contains("republic")
|| name_lower.contains("kingdom")
|| name_lower.contains("federal republic")
|| name_lower.ends_with(" govt")
|| name_lower.ends_with(" government")
|| name_lower.contains("hellenic") // Greece
|| name_lower.contains("slovak") {
return "sovereign".to_string();
}
// Municipalities (Norwegian communes, cities, etc.)
if name_lower.contains("kommune")
|| name_lower.contains("municipality")
|| name_lower.contains("city of")
|| name_lower.contains("town of")
|| name_lower.contains("county council") {
return "municipal".to_string();
}
// States/Provinces/Regions
if name_lower.contains("state of")
|| name_lower.contains("province")
|| name_lower.contains("region")
|| name_lower.contains("county") {
return "state".to_string();
}
// Government agencies/entities
if name_lower.contains("export credit")
|| name_lower.contains("development bank")
|| name_lower.contains("housing")
|| name_lower.contains("akademiska")
|| name_lower.contains("byggdastofnun") {
return "agency".to_string();
}
"other".to_string()
}
/// Classify government bond type based on security_type
///
/// Maps OpenFIGI security types to simplified bond categories for government bonds
///
/// # Examples
/// - "DOMESTIC" -> "domestic"
/// - "GLOBAL" -> "global"
/// - "EURO NON-DOLLAR" -> "euro"
/// - "DOMESTIC MTN" -> "mtn"
pub fn classify_government_bond_type(security_type: &str) -> String {
let security_type_upper = security_type.to_uppercase();
if security_type_upper.contains("GLOBAL") {
return "global".to_string();
}
if security_type_upper.contains("EURO") {
if security_type_upper.contains("NON-DOLLAR") || !security_type_upper.contains("DOLLAR") {
return "euro".to_string();
}
return "eurodollar".to_string();
}
if security_type_upper.contains("YANKEE") {
return "yankee".to_string();
}
if security_type_upper.contains("MTN") {
return "mtn".to_string();
}
if security_type_upper.contains("DOMESTIC") {
return "domestic".to_string();
}
"other".to_string()
}
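
An illustrative test sketch (not part of the diff) that exercises the examples from the doc comments above; the expected values follow directly from those comments, and the `BondDetails` field types are assumed from how the struct is initialized here.

```rust
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn parses_fixed_coupon_bond() {
        let d = parse_bond_details("WTFC 4.3 01/12/26 0003", "");
        assert_eq!(d.coupon_rate, Some(4.3));
        assert_eq!(d.maturity_date.as_deref(), Some("2026-01-12"));
        assert_eq!(d.series_identifier.as_deref(), Some("0003"));
        assert!(!d.is_floating);
    }

    #[test]
    fn parses_floating_and_zero_coupon() {
        let float = parse_bond_details("SEK Float 06/30/34", "");
        assert!(float.is_floating);
        assert_eq!(float.maturity_date.as_deref(), Some("2034-06-30"));

        let zero = parse_bond_details("GGB 0 10/15/42", "");
        assert_eq!(zero.coupon_rate, Some(0.0));
        assert!(zero.is_zero_coupon);
    }
}
```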

View File

@@ -0,0 +1,215 @@
// src/corporate/checkpoint_helpers.rs
//! Shared helpers for checkpoint-based recovery and logging
//!
//! This module extracts common patterns used across multiple update modules
//! to reduce code duplication and improve maintainability.
use super::types::CompanyData;
use crate::util::logger;
use std::collections::HashMap;
use std::path::Path;
use tokio::fs::File;
use tokio::io::AsyncWriteExt;
use anyhow::Result;
/// Load companies from checkpoint and replay log for recovery
///
/// This function implements the checkpoint + write-ahead log pattern:
/// 1. Loads the main checkpoint file
/// 2. Replays any pending updates from the log file
/// 3. Returns the merged state
pub async fn load_checkpoint_with_log<P1, P2>(
checkpoint_path: P1,
log_path: P2,
checkpoint_desc: &str,
) -> Result<HashMap<String, CompanyData>>
where
P1: AsRef<Path>,
P2: AsRef<Path>,
{
let checkpoint_path = checkpoint_path.as_ref();
let log_path = log_path.as_ref();
let mut companies: HashMap<String, CompanyData> = HashMap::new();
// Load checkpoint if it exists
if checkpoint_path.exists() {
logger::log_info(&format!("Loading checkpoint from {}...", checkpoint_desc)).await;
let content = tokio::fs::read_to_string(checkpoint_path).await?;
for line in content.lines() {
if line.trim().is_empty() || !line.ends_with('}') {
continue; // Skip incomplete lines
}
match serde_json::from_str::<CompanyData>(line) {
Ok(company) => {
companies.insert(company.name.clone(), company);
}
Err(e) => {
logger::log_warn(&format!("Skipping invalid checkpoint line: {}", e)).await;
}
}
}
logger::log_info(&format!("Loaded checkpoint with {} companies", companies.len())).await;
}
// Replay log if it exists
if log_path.exists() {
logger::log_info("Replaying update log...").await;
let log_content = tokio::fs::read_to_string(log_path).await?;
let mut replayed = 0;
for line in log_content.lines() {
if line.trim().is_empty() || !line.ends_with('}') {
continue; // Skip incomplete lines
}
match serde_json::from_str::<CompanyData>(line) {
Ok(company) => {
companies.insert(company.name.clone(), company);
replayed += 1;
}
Err(e) => {
logger::log_warn(&format!("Skipping invalid log line: {}", e)).await;
}
}
}
if replayed > 0 {
logger::log_info(&format!("Replayed {} updates from log", replayed)).await;
}
}
Ok(companies)
}
/// Consolidate log into checkpoint and clear log
///
/// Atomically writes all companies to a new checkpoint file and removes the log.
/// Uses atomic rename to ensure crash safety.
pub async fn consolidate_checkpoint<P1, P2>(
checkpoint_path: P1,
log_path: P2,
companies: &HashMap<String, CompanyData>,
) -> Result<()>
where
P1: AsRef<Path>,
P2: AsRef<Path>,
{
let checkpoint_path = checkpoint_path.as_ref();
let log_path = log_path.as_ref();
logger::log_info("Consolidating update log into checkpoint...").await;
let temp_checkpoint = checkpoint_path.with_extension("tmp");
let mut temp_file = File::create(&temp_checkpoint).await?;
for company in companies.values() {
let json_line = serde_json::to_string(company)?;
temp_file.write_all(json_line.as_bytes()).await?;
temp_file.write_all(b"\n").await?;
}
temp_file.flush().await?;
temp_file.sync_data().await?;
drop(temp_file);
tokio::fs::rename(&temp_checkpoint, checkpoint_path).await?;
// Remove log after successful consolidation
if log_path.exists() {
tokio::fs::remove_file(log_path).await.ok();
}
logger::log_info(&format!("✓ Consolidated {} companies", companies.len())).await;
Ok(())
}
/// Check if log file has content
pub async fn log_has_content<P: AsRef<Path>>(log_path: P) -> bool {
if let Ok(metadata) = tokio::fs::metadata(log_path.as_ref()).await {
metadata.len() > 0
} else {
false
}
}
/// Load enrichment progress from log file
///
/// Used by enrichment functions to track which companies have already been processed.
/// Parses log entries with format: {"company_name": "...", "status": "enriched", ...}
pub async fn load_enrichment_progress<P>(
log_path: P,
) -> Result<std::collections::HashSet<String>>
where
P: AsRef<Path>,
{
let mut enriched_companies = std::collections::HashSet::new();
if !log_path.as_ref().exists() {
return Ok(enriched_companies);
}
logger::log_info("Loading enrichment progress from log...").await;
let log_content = tokio::fs::read_to_string(log_path.as_ref()).await?;
for line in log_content.lines() {
if line.trim().is_empty() || !line.ends_with('}') {
continue; // Skip incomplete lines
}
match serde_json::from_str::<serde_json::Value>(line) {
Ok(entry) => {
if let Some(name) = entry.get("company_name").and_then(|v| v.as_str()) {
if entry.get("status").and_then(|v| v.as_str()) == Some("enriched") {
enriched_companies.insert(name.to_string());
}
}
}
Err(e) => {
logger::log_warn(&format!("Skipping invalid log line: {}", e)).await;
}
}
}
logger::log_info(&format!(
"Loaded {} enriched companies from log",
enriched_companies.len()
)).await;
Ok(enriched_companies)
}
/// Count enriched companies by checking for data files
///
/// Walks through the corporate directory and counts companies that have
/// a data file in the specified subdirectory (e.g., "events", "options", "chart").
pub async fn count_enriched_companies(
paths: &crate::util::directories::DataPaths,
data_type: &str,
) -> Result<usize> {
let corporate_dir = paths.corporate_dir();
if !corporate_dir.exists() {
return Ok(0);
}
let mut count = 0;
let mut entries = tokio::fs::read_dir(&corporate_dir).await?;
while let Some(entry) = entries.next_entry().await? {
let path = entry.path();
if path.is_dir() {
let data_dir = path.join(data_type);
let data_file = data_dir.join("data.jsonl");
if data_file.exists() {
count += 1;
}
}
}
Ok(count)
}
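
A hypothetical call sequence showing the checkpoint + write-ahead-log cycle these helpers implement: recover state, apply updates via the log, then consolidate. The file names are placeholders; the function signatures match the helpers defined above.

```rust
async fn example_cycle(paths: &crate::util::directories::DataPaths) -> anyhow::Result<()> {
    let checkpoint = paths.corporate_dir().join("companies.checkpoint.jsonl");
    let log = paths.corporate_dir().join("companies.log.jsonl");

    // 1. Recover: load the checkpoint, then replay any pending log entries.
    let companies = load_checkpoint_with_log(&checkpoint, &log, "companies").await?;

    // 2. ... append new updates to the log as work completes (omitted) ...

    // 3. Periodically fold the log back into the checkpoint (atomic rename).
    if log_has_content(&log).await {
        consolidate_checkpoint(&checkpoint, &log, &companies).await?;
    }
    Ok(())
}
```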

View File

@@ -0,0 +1,720 @@
// src/corporate/collect_exchanges.rs
use crate::util::directories::DataPaths;
use crate::util::integrity::{DataStage, StateEntry, StateManager, file_reference};
use crate::util::logger;
use crate::corporate::types::*;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use tokio::fs;
use tokio::io::AsyncWriteExt;
/// Exchange information collected from company data
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ExchangeInfo {
#[serde(rename = "exchangeName")]
pub exchange_name: String,
pub currency: String,
#[serde(rename = "currencySymbol")]
pub currency_symbol: String,
#[serde(rename = "exchangeDataDelayedBy")]
pub exchange_data_delayed_by: i64,
#[serde(rename = "totalMarketCap")]
pub total_market_cap: u64,
#[serde(rename = "totalMarketCapUSD")]
pub total_market_cap_usd: f64, // NEW: Market cap converted to USD
pub companies: Vec<String>,
}
/// Extract exchange data from company core data
#[derive(Debug, Deserialize)]
struct CompanyCoreData {
modules: Option<CoreModules>,
}
#[derive(Debug, Deserialize)]
struct CoreModules {
price: Option<PriceModule>,
}
#[derive(Debug, Deserialize)]
struct PriceModule {
#[serde(rename = "exchangeName")]
exchange_name: Option<String>,
currency: Option<String>,
#[serde(rename = "currencySymbol")]
currency_symbol: Option<String>,
exchange: Option<String>,
#[serde(rename = "exchangeDataDelayedBy")]
exchange_data_delayed_by: Option<i64>,
#[serde(rename = "marketCap")]
market_cap: Option<MarketCapData>,
}
#[derive(Debug, Deserialize)]
struct MarketCapData {
raw: Option<u64>,
}
/// Normalize currency code and get conversion factor
/// Handles special cases like GBp (pence) and ZAc (cents)
fn normalize_currency(currency: &str) -> (&str, f64) {
match currency {
"GBp" => ("GBP", 100.0), // British Pence -> Pounds (divide by 100)
"ZAc" => ("ZAR", 100.0), // South African Cents -> Rand (divide by 100)
_ => (currency, 1.0), // No conversion needed
}
}
/// FX rate cache for currency conversion
struct FxRateCache {
rates: HashMap<String, f64>,
}
impl FxRateCache {
/// Create new FX rate cache by loading all currency charts
async fn new(paths: &DataPaths) -> anyhow::Result<Self> {
let mut rates = HashMap::new();
// USD to USD is always 1.0
rates.insert("USD".to_string(), 1.0);
let currency_dir = paths.data_dir().join("economic").join("currency");
if !currency_dir.exists() {
logger::log_warn(" FX rates directory not found - will use default rates").await;
return Ok(Self { rates });
}
let mut entries = fs::read_dir(&currency_dir).await?;
let mut loaded_count = 0;
while let Some(entry) = entries.next_entry().await? {
let path = entry.path();
if !path.is_dir() {
continue;
}
let currency_code = match path.file_name().and_then(|n| n.to_str()) {
Some(code) => code.to_string(),
None => continue,
};
let chart_path = path.join("chart").join("data.jsonl");
if !chart_path.exists() {
continue;
}
// Load chart and get latest rate
match load_latest_fx_rate(&chart_path).await {
Ok(rate) => {
rates.insert(currency_code.clone(), rate);
loaded_count += 1;
}
Err(e) => {
logger::log_warn(&format!(
" Failed to load FX rate for {}: {}",
currency_code, e
)).await;
}
}
}
logger::log_info(&format!(" ✓ Loaded {} FX rates", loaded_count)).await;
Ok(Self { rates })
}
/// Convert amount from given currency to USD
fn to_usd(&self, amount: u64, currency: &str) -> f64 {
// Normalize currency and get conversion factor
// e.g., GBp -> (GBP, 100.0), ZAc -> (ZAR, 100.0)
let (normalized_currency, factor) = normalize_currency(currency);
// First convert to base currency unit (e.g., pence to pounds)
let amount_in_base = amount as f64 / factor;
if normalized_currency == "USD" {
return amount_in_base;
}
// Get rate (currency units per USD)
// e.g. a stored rate of 0.92 for EUR means 1 USD = 0.92 EUR,
// so to convert EUR to USD: EUR_amount / 0.92
match self.rates.get(normalized_currency) {
Some(&rate) if rate > 0.0 => {
amount_in_base / rate
}
_ => {
// Fallback: use approximate rates for common currencies
let fallback_rate = get_fallback_rate(normalized_currency);
amount_in_base / fallback_rate
}
}
}
/// Get rate for a currency (currency units per USD)
fn get_rate(&self, currency: &str) -> Option<f64> {
let (normalized_currency, _) = normalize_currency(currency);
self.rates.get(normalized_currency).copied()
}
}
/// Load latest FX rate from chart data
async fn load_latest_fx_rate(chart_path: &std::path::Path) -> anyhow::Result<f64> {
let content = fs::read_to_string(chart_path).await?;
for line in content.lines() {
if line.trim().is_empty() {
continue;
}
let chart: ChartData = serde_json::from_str(line)?;
if chart.quotes.is_empty() {
return Err(anyhow::anyhow!("No quotes in chart data"));
}
// Get most recent quote with a close price
let latest_rate = chart.quotes
.iter()
.rev()
.find_map(|q| q.close)
.ok_or_else(|| anyhow::anyhow!("No valid close prices"))?;
return Ok(latest_rate);
}
Err(anyhow::anyhow!("No data in chart file"))
}
/// Fallback rates for common currencies (approximate, as of 2024)
/// These are currency units per USD (same format as our FX data)
fn get_fallback_rate(currency: &str) -> f64 {
match currency {
"USD" => 1.0,
"EUR" => 0.92, // 1 USD = 0.92 EUR
"GBP" => 0.79, // 1 USD = 0.79 GBP
"JPY" => 150.0, // 1 USD = 150 JPY
"CNY" | "RMB" => 7.2,
"CHF" => 0.88,
"AUD" => 1.52,
"CAD" => 1.36,
"HKD" => 7.8,
"SGD" => 1.34,
"SEK" => 10.5,
"NOK" => 10.8,
"DKK" => 6.9,
"PLN" => 4.0,
"CZK" => 23.0,
"TRY" => 32.0,
"ZAR" => 18.5,
"ILS" => 3.7,
"RON" => 4.6,
"KWD" => 0.31,
"TWD" => 31.5,
"ISK" => 138.0,
"NZD" => 1.65,
"MXN" => 17.0,
"BRL" => 5.0,
"INR" => 83.0,
"KRW" => 1320.0,
"THB" => 35.0,
"MYR" => 4.6,
"IDR" => 15700.0,
"PHP" => 56.0,
"VND" => 24500.0,
_ => {
// Default: assume similar to USD
1.0
}
}
}
/// Collect all exchanges from company directories and create yahoo_exchanges.json
///
/// # Features
/// - Iterates through all company directories
/// - Extracts exchange data from core/data.jsonl
/// - Groups companies by exchange
/// - Sums up market caps for each exchange
/// - Converts all market caps to USD using FX rates
/// - Saves consolidated mapping to data/yahoo_exchanges.json
/// - Handles missing or invalid data gracefully
/// - Integrity tracking with content hash validation
pub async fn collect_and_save_exchanges(paths: &DataPaths) -> anyhow::Result<usize> {
let output_path = paths.data_dir().join("yahoo_exchanges.json");
let manager = StateManager::new(paths.integrity_dir()).await?;
let step_name = "exchange_collection_complete";
if manager.is_step_valid(step_name).await? {
logger::log_info(" Exchange collection already completed and valid").await;
// Load and count exchanges
if output_path.exists() {
let content = fs::read_to_string(&output_path).await?;
let exchanges: HashMap<String, ExchangeInfo> = serde_json::from_str(&content)?;
logger::log_info(&format!(" ✓ Found {} valid exchanges", exchanges.len())).await;
return Ok(exchanges.len());
}
}
let entry = create_exchange_collection_state_entry(&manager, &output_path, step_name).await?;
logger::log_info("Collecting exchange information from company directories...").await;
let corporate_dir = paths.corporate_dir();
if !corporate_dir.exists() {
logger::log_warn(" Corporate directory does not exist").await;
return Ok(0);
}
// Load FX rates for currency conversion
logger::log_info("Loading FX rates for currency conversion...").await;
let fx_cache = FxRateCache::new(paths).await?;
// Map of exchange code -> ExchangeInfo
let mut exchanges: HashMap<String, ExchangeInfo> = HashMap::new();
let mut entries = fs::read_dir(&corporate_dir).await?;
let mut processed_count = 0;
let mut skipped_count = 0;
while let Some(entry) = entries.next_entry().await? {
let company_path = entry.path();
if !company_path.is_dir() {
continue;
}
let company_name = match company_path.file_name().and_then(|n| n.to_str()) {
Some(name) => name.to_string(),
None => {
skipped_count += 1;
continue;
}
};
// Read core/data.jsonl
let core_data_path = company_path.join("core").join("data.jsonl");
if !core_data_path.exists() {
skipped_count += 1;
continue;
}
// Parse core data
match extract_exchange_info(&core_data_path, &company_name).await {
Ok(Some((exchange_code, exchange_name, currency, currency_symbol, delay, market_cap))) => {
// Convert market cap to USD
let market_cap_usd = fx_cache.to_usd(market_cap, &currency);
// Add or update exchange entry
exchanges
.entry(exchange_code.clone())
.and_modify(|info| {
// Add company to existing exchange and sum market caps
info.companies.push(company_name.clone());
info.total_market_cap = info.total_market_cap.saturating_add(market_cap);
info.total_market_cap_usd += market_cap_usd;
})
.or_insert_with(|| {
// Create new exchange entry
ExchangeInfo {
exchange_name,
currency,
currency_symbol,
exchange_data_delayed_by: delay,
total_market_cap: market_cap,
total_market_cap_usd: market_cap_usd,
companies: vec![company_name.clone()],
}
});
processed_count += 1;
}
Ok(None) => {
// No exchange data found
skipped_count += 1;
}
Err(e) => {
logger::log_warn(&format!(
" Failed to parse exchange data for {}: {}",
company_name, e
)).await;
skipped_count += 1;
}
}
// Progress logging every 100 companies
if (processed_count + skipped_count) % 100 == 0 {
logger::log_info(&format!(
" Progress: {} companies processed, {} skipped",
processed_count, skipped_count
)).await;
}
}
logger::log_info(&format!(
" ✓ Collected data from {} companies ({} skipped)",
processed_count, skipped_count
)).await;
logger::log_info(&format!(
" ✓ Found {} unique exchanges",
exchanges.len()
)).await;
// Sort companies within each exchange for consistency
for exchange_info in exchanges.values_mut() {
exchange_info.companies.sort();
}
// Save to yahoo_exchanges.json
save_exchanges_json(&output_path, &exchanges).await?;
logger::log_info(&format!(
" ✓ Saved exchange mapping to {}",
output_path.display()
)).await;
manager.mark_valid(entry).await?;
logger::log_info(" ✓ Exchange collection marked as complete with integrity tracking").await;
// Print summary statistics
print_exchange_statistics(&exchanges, &fx_cache).await;
Ok(exchanges.len())
}
/// Track exchange collection completion with content hash verification
async fn create_exchange_collection_state_entry(
manager: &StateManager,
output_path: &std::path::Path,
step_name: &str,
) -> anyhow::Result<StateEntry> {
// Create content reference for the output file
let content_reference = file_reference(output_path);
// Track completion with:
// - Content reference: The yahoo_exchanges.json file
// - Data stage: Data (7-day TTL by default)
// - Dependencies: None (this is a collection step, not dependent on other tracked steps)
// Note: In practice, it depends on core data, but we track the output file
// which will change if core data changes, so explicit dependency not needed
Ok(manager.create_entry(
step_name.to_string(),
content_reference,
DataStage::Data,
).await?)
}
/// Extract exchange information from a company's core data file
async fn extract_exchange_info(
core_data_path: &std::path::Path,
company_name: &str,
) -> anyhow::Result<Option<(String, String, String, String, i64, u64)>> {
let content = fs::read_to_string(core_data_path).await?;
// Parse JSONL - should be single line
for line in content.lines() {
if line.trim().is_empty() {
continue;
}
match serde_json::from_str::<CompanyCoreData>(line) {
Ok(data) => {
// Extract from modules.price
let price_module = match data.modules.and_then(|m| m.price) {
Some(p) => p,
None => return Ok(None),
};
// Extract required fields
let exchange = match price_module.exchange {
Some(e) if !e.is_empty() => e,
_ => return Ok(None),
};
// Filter out invalid placeholder exchange codes
if exchange == "CCC" {
return Ok(None);
}
let exchange_name = price_module.exchange_name.unwrap_or_else(|| exchange.clone());
let currency = price_module.currency.unwrap_or_else(|| "USD".to_string());
let currency_symbol = price_module.currency_symbol.unwrap_or_else(|| "$".to_string());
let delay = price_module.exchange_data_delayed_by.unwrap_or(0);
let market_cap = price_module
.market_cap
.and_then(|mc| mc.raw)
.unwrap_or(0);
return Ok(Some((
exchange,
exchange_name,
currency,
currency_symbol,
delay,
market_cap,
)));
}
Err(e) => {
// Try to parse as generic JSON to check if exchange field exists in modules.price
if let Ok(json) = serde_json::from_str::<serde_json::Value>(line) {
// Try to access modules.price.exchange
if let Some(price) = json.get("modules").and_then(|m| m.get("price")) {
if let Some(exchange) = price.get("exchange").and_then(|v| v.as_str()) {
if !exchange.is_empty() && exchange != "CCC" {
let exchange_name = price
.get("exchangeName")
.and_then(|v| v.as_str())
.unwrap_or(exchange)
.to_string();
let currency = price
.get("currency")
.and_then(|v| v.as_str())
.unwrap_or("USD")
.to_string();
let currency_symbol = price
.get("currencySymbol")
.and_then(|v| v.as_str())
.unwrap_or("$")
.to_string();
let delay = price
.get("exchangeDataDelayedBy")
.and_then(|v| v.as_i64())
.unwrap_or(0);
let market_cap = price
.get("marketCap")
.and_then(|mc| mc.get("raw"))
.and_then(|v| v.as_u64())
.unwrap_or(0);
return Ok(Some((
exchange.to_string(),
exchange_name,
currency,
currency_symbol,
delay,
market_cap,
)));
}
}
}
}
return Err(anyhow::anyhow!(
"Failed to parse core data for {}: {}",
company_name,
e
));
}
}
}
Ok(None)
}
/// Save exchanges map to JSON file with fsync
async fn save_exchanges_json(
path: &std::path::Path,
exchanges: &HashMap<String, ExchangeInfo>,
) -> anyhow::Result<()> {
// Create sorted output for consistency
let mut sorted_exchanges: Vec<_> = exchanges.iter().collect();
sorted_exchanges.sort_by_key(|(code, _)| code.as_str());
let exchanges_map: HashMap<String, ExchangeInfo> = sorted_exchanges
.into_iter()
.map(|(k, v)| (k.clone(), v.clone()))
.collect();
// Serialize with pretty printing
let json_content = serde_json::to_string_pretty(&exchanges_map)?;
// Write to temporary file first (atomic write pattern)
let tmp_path = path.with_extension("json.tmp");
let mut file = fs::File::create(&tmp_path).await?;
file.write_all(json_content.as_bytes()).await?;
file.write_all(b"\n").await?;
file.flush().await?;
file.sync_all().await?;
// Atomic rename
fs::rename(&tmp_path, path).await?;
Ok(())
}
/// Format market cap as a human-readable string
fn format_market_cap(market_cap: f64) -> String {
if market_cap >= 1_000_000_000_000.0 {
format!("{:.2}T", market_cap / 1_000_000_000_000.0)
} else if market_cap >= 1_000_000_000.0 {
format!("{:.2}B", market_cap / 1_000_000_000.0)
} else if market_cap >= 1_000_000.0 {
format!("{:.2}M", market_cap / 1_000_000.0)
} else if market_cap >= 1_000.0 {
format!("{:.2}K", market_cap / 1_000.0)
} else {
format!("{:.2}", market_cap)
}
}
/// Print statistics about collected exchanges
async fn print_exchange_statistics(exchanges: &HashMap<String, ExchangeInfo>, fx_cache: &FxRateCache) {
logger::log_info("Exchange Statistics (sorted by USD market cap):").await;
// Sort by total market cap in USD (descending)
let mut exchange_list: Vec<_> = exchanges.iter().collect();
exchange_list.sort_by(|a, b| {
b.1.total_market_cap_usd
.partial_cmp(&a.1.total_market_cap_usd)
.unwrap_or(std::cmp::Ordering::Equal)
});
// Print top 20 exchanges by total market cap (USD)
logger::log_info(" Top 20 exchanges by total market cap (USD):").await;
for (i, (code, info)) in exchange_list.iter().take(20).enumerate() {
let (normalized_currency, factor) = normalize_currency(&info.currency);
let fx_rate = fx_cache.get_rate(&info.currency);
let fx_info = match fx_rate {
Some(rate) => {
if factor > 1.0 {
// Show conversion for pence/cents
format!(" (1 {} = {} {}, {} {} = 1 {})",
normalized_currency,
format!("{:.4}", rate),
"USD",
factor as i32,
info.currency,
normalized_currency)
} else {
format!(" (1 USD = {:.4} {})", rate, info.currency)
}
}
None => format!(" (using fallback rate for {})", info.currency),
};
logger::log_info(&format!(
" {}. {} ({}) - ${} USD ({}{} {}) - {} companies{}",
i + 1,
info.exchange_name,
code,
format_market_cap(info.total_market_cap_usd),
info.currency_symbol,
format_market_cap(info.total_market_cap as f64),
info.currency,
info.companies.len(),
if info.currency != "USD" { &fx_info } else { "" }
)).await;
}
// Count by currency
let mut currency_counts: HashMap<String, usize> = HashMap::new();
let mut currency_market_caps: HashMap<String, f64> = HashMap::new();
for info in exchanges.values() {
*currency_counts.entry(info.currency.clone()).or_insert(0) += info.companies.len();
*currency_market_caps.entry(info.currency.clone()).or_insert(0.0) += info.total_market_cap_usd;
}
let mut currencies: Vec<_> = currency_counts.iter().collect();
currencies.sort_by(|a, b| {
currency_market_caps.get(b.0)
.unwrap_or(&0.0)
.partial_cmp(currency_market_caps.get(a.0).unwrap_or(&0.0))
.unwrap_or(std::cmp::Ordering::Equal)
});
logger::log_info(" Market cap by currency (USD equivalent):").await;
for (currency, count) in currencies.iter().take(10) {
let market_cap_usd = currency_market_caps.get(*currency).unwrap_or(&0.0);
let (normalized_currency, factor) = normalize_currency(currency);
let fx_rate = fx_cache.get_rate(currency);
let fx_info = match fx_rate {
Some(rate) => {
if factor > 1.0 {
format!(" (1 {} = {:.4} USD, {} {} = 1 {})",
normalized_currency, rate, factor as i32, currency, normalized_currency)
} else {
format!(" (1 USD = {:.4} {})", rate, currency)
}
}
None => format!(" (fallback)"),
};
logger::log_info(&format!(
" {}: {} companies, ${} USD{}",
currency,
count,
format_market_cap(*market_cap_usd),
if *currency != "USD" { &fx_info } else { "" }
)).await;
}
// Delay statistics
let delayed_exchanges: Vec<_> = exchanges
.iter()
.filter(|(_, info)| info.exchange_data_delayed_by > 0)
.collect();
if !delayed_exchanges.is_empty() {
logger::log_info(&format!(
" Exchanges with data delay: {} (out of {})",
delayed_exchanges.len(),
exchanges.len()
)).await;
}
// Total market cap across all exchanges (in USD)
let total_market_cap_usd: f64 = exchanges.values()
.map(|info| info.total_market_cap_usd)
.sum();
logger::log_info(&format!(
" Total market cap across all exchanges: ${} USD",
format_market_cap(total_market_cap_usd)
)).await;
}
/// Get exchange information for a specific exchange code
pub async fn get_exchange_info(
paths: &DataPaths,
exchange_code: &str,
) -> anyhow::Result<Option<ExchangeInfo>> {
let exchanges_path = paths.data_dir().join("yahoo_exchanges.json");
if !exchanges_path.exists() {
return Ok(None);
}
let content = fs::read_to_string(&exchanges_path).await?;
let exchanges: HashMap<String, ExchangeInfo> = serde_json::from_str(&content)?;
Ok(exchanges.get(exchange_code).cloned())
}
/// List all available exchanges
pub async fn list_all_exchanges(paths: &DataPaths) -> anyhow::Result<Vec<(String, ExchangeInfo)>> {
let exchanges_path = paths.data_dir().join("yahoo_exchanges.json");
if !exchanges_path.exists() {
return Ok(Vec::new());
}
let content = fs::read_to_string(&exchanges_path).await?;
let exchanges: HashMap<String, ExchangeInfo> = serde_json::from_str(&content)?;
let mut exchange_list: Vec<_> = exchanges.into_iter().collect();
exchange_list.sort_by(|a, b| a.0.cmp(&b.0));
Ok(exchange_list)
}
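For illustration only (not part of this diff): a sketch of driving the two accessors above once yahoo_exchanges.json has been written by the collector; "NMS" is only an example exchange code.
let paths = DataPaths::new(".")?;
if let Some(info) = get_exchange_info(&paths, "NMS").await? {
    println!("{} trades in {}", info.exchange_name, info.currency);
}
for (code, info) in list_all_exchanges(&paths).await? {
    println!("{}: {} companies", code, info.companies.len());
}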

View File

@@ -1,51 +0,0 @@
// src/corporate/fx.rs
use std::collections::HashMap;
use reqwest;
use serde_json::Value;
use tokio::fs;
use std::path::Path;
static FX_CACHE_PATH: &str = "fx_rates.json";
pub async fn get_usd_rate(currency: &str) -> anyhow::Result<f64> {
if currency == "USD" {
return Ok(1.0);
}
let mut cache: HashMap<String, (f64, String)> = if Path::new(FX_CACHE_PATH).exists() {
let content = fs::read_to_string(FX_CACHE_PATH).await?;
serde_json::from_str(&content).unwrap_or_default()
} else {
HashMap::new()
};
let today = chrono::Local::now().format("%Y-%m-%d").to_string();
if let Some((rate, date)) = cache.get(currency) {
if date == &today {
return Ok(*rate);
}
}
let symbol = format!("{}USD=X", currency);
let url = format!("https://query1.finance.yahoo.com/v8/finance/chart/{}?range=1d&interval=1d", symbol);
let json: Value = reqwest::Client::new()
.get(&url)
.header("User-Agent", "Mozilla/5.0")
.send()
.await?
.json()
.await?;
let close = json["chart"]["result"][0]["meta"]["regularMarketPrice"]
.as_f64()
.or_else(|| json["chart"]["result"][0]["indicators"]["quote"][0]["close"][0].as_f64())
.unwrap_or(1.0);
let rate = if currency == "JPY" || currency == "KRW" { close } else { 1.0 / close }; // inverse pairs
cache.insert(currency.to_string(), (rate, today.clone()));
let _ = fs::write(FX_CACHE_PATH, serde_json::to_string_pretty(&cache)?).await;
Ok(rate)
}

View File

@@ -1,22 +1,25 @@
// src/corporate/helpers.rs // src/corporate/helpers.rs
use super::types::*; use super::types::*;
use crate::util::directories::DataPaths;
use chrono::{Local, NaiveDate}; use chrono::{Local, NaiveDate};
use std::collections::{HashMap, HashSet};
use rand::rngs::StdRng; use rand::rngs::StdRng;
use rand::prelude::{Rng, SeedableRng, IndexedRandom}; use rand::prelude::{Rng, SeedableRng, IndexedRandom};
use tokio::fs;
use anyhow::{anyhow};
pub fn event_key(e: &CompanyEvent) -> String { pub fn event_key(e: &CompanyEventData) -> String {
format!("{}|{}|{}", e.ticker, e.date, e.time) format!("{}|{}|{}", e.ticker, e.date, e.time)
} }
pub fn detect_changes(old: &CompanyEvent, new: &CompanyEvent, today: &str) -> Vec<CompanyEventChange> { pub fn detect_changes(old: &CompanyEventData, new: &CompanyEventData, today: &str) -> Vec<CompanyEventChangeData> {
let mut changes = Vec::new(); let mut changes = Vec::new();
let ts = Local::now().format("%Y-%m-%d %H:%M:%S").to_string(); let ts = Local::now().format("%Y-%m-%d %H:%M:%S").to_string();
if new.date.as_str() <= today { return changes; } if new.date.as_str() <= today { return changes; }
if old.time != new.time { if old.time != new.time {
changes.push(CompanyEventChange { changes.push(CompanyEventChangeData {
ticker: new.ticker.clone(), ticker: new.ticker.clone(),
date: new.date.clone(), date: new.date.clone(),
field_changed: "time".to_string(), field_changed: "time".to_string(),
@@ -27,7 +30,7 @@ pub fn detect_changes(old: &CompanyEvent, new: &CompanyEvent, today: &str) -> Ve
} }
if old.eps_forecast != new.eps_forecast { if old.eps_forecast != new.eps_forecast {
changes.push(CompanyEventChange { changes.push(CompanyEventChangeData {
ticker: new.ticker.clone(), ticker: new.ticker.clone(),
date: new.date.clone(), date: new.date.clone(),
field_changed: "eps_forecast".to_string(), field_changed: "eps_forecast".to_string(),
@@ -38,7 +41,7 @@ pub fn detect_changes(old: &CompanyEvent, new: &CompanyEvent, today: &str) -> Ve
} }
if old.eps_actual != new.eps_actual { if old.eps_actual != new.eps_actual {
changes.push(CompanyEventChange { changes.push(CompanyEventChangeData {
ticker: new.ticker.clone(), ticker: new.ticker.clone(),
date: new.date.clone(), date: new.date.clone(),
field_changed: "eps_actual".to_string(), field_changed: "eps_actual".to_string(),
@@ -53,14 +56,6 @@ pub fn detect_changes(old: &CompanyEvent, new: &CompanyEvent, today: &str) -> Ve
changes changes
} }
pub fn price_key(p: &CompanyPrice) -> String {
if p.time.is_empty() {
format!("{}|{}", p.ticker, p.date)
} else {
format!("{}|{}|{}", p.ticker, p.date, p.time)
}
}
pub fn parse_float(s: &str) -> Option<f64> { pub fn parse_float(s: &str) -> Option<f64> {
s.replace("--", "").replace(",", "").parse::<f64>().ok() s.replace("--", "").replace(",", "").parse::<f64>().ok()
} }
@@ -74,7 +69,7 @@ pub fn parse_yahoo_date(s: &str) -> anyhow::Result<NaiveDate> {
/// Send-safe random range /// Send-safe random range
pub fn random_range(min: u64, max: u64) -> u64 { pub fn random_range(min: u64, max: u64) -> u64 {
let mut rng = StdRng::from_rng(&mut rand::rng()); let mut rng = StdRng::from_rng(&mut rand::rng());
rng.gen_range(min..max) rng.random_range(min..max)
} }
/// Send-safe random choice /// Send-safe random choice
@@ -82,3 +77,108 @@ pub fn choose_random<T: Clone>(items: &[T]) -> T {
let mut rng = StdRng::from_rng(&mut rand::rng()); let mut rng = StdRng::from_rng(&mut rand::rng());
items.choose(&mut rng).unwrap().clone() items.choose(&mut rng).unwrap().clone()
} }
/// Extract first valid Yahoo ticker from company
pub fn extract_first_yahoo_ticker(company: &CompanyData) -> Option<String> {
if let Some(isin_tickers_map) = &company.isin_tickers_map {
for tickers in isin_tickers_map.values() {
for ticker in tickers {
if ticker.starts_with("YAHOO:")
&& ticker != "YAHOO:NO_RESULTS"
&& ticker != "YAHOO:ERROR"
{
return Some(ticker.trim_start_matches("YAHOO:").to_string());
}
}
}
}
None
}
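For illustration only (not part of this diff), a sketch assuming the CompanyData fields shown in types.rs: the NO_RESULTS sentinel is skipped and the first real Yahoo ticker wins.
let mut map = HashMap::new();
map.insert(
    "US0378331005".to_string(),
    vec!["YAHOO:NO_RESULTS".to_string(), "YAHOO:AAPL".to_string()],
);
let company = CompanyData {
    name: "APPLE INC".to_string(),
    primary_isin: "US0378331005".to_string(),
    securities: HashMap::new(),
    yahoo_company_data: None,
    isin_tickers_map: Some(map),
};
assert_eq!(extract_first_yahoo_ticker(&company), Some("AAPL".to_string()));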
/// Sanitize company name for file system use
pub fn sanitize_company_name(name: &str) -> String {
name.replace("/", "_")
.replace("\\", "_")
.replace(":", "_")
.replace("*", "_")
.replace("?", "_")
.replace("\"", "_")
.replace("<", "_")
.replace(">", "_")
.replace("|", "_")
}
/// Load companies from JSONL file
pub async fn load_companies_from_jsonl(
path: &std::path::Path
) -> anyhow::Result<Vec<CompanyData>> {
let content = tokio::fs::read_to_string(path).await?;
let mut companies = Vec::new();
for line in content.lines() {
if line.trim().is_empty() {
continue;
}
if let Ok(company) = serde_json::from_str::<CompanyData>(line) {
companies.push(company);
}
}
Ok(companies)
}
pub async fn find_most_recent_figi_date_dir(paths: &DataPaths) -> anyhow::Result<Option<std::path::PathBuf>> {
let map_cache_dir = paths.cache_gleif_openfigi_map_dir();
if !map_cache_dir.exists() {
return Ok(None);
}
let mut entries = tokio::fs::read_dir(&map_cache_dir).await?;
let mut dates = Vec::new();
while let Some(entry) = entries.next_entry().await? {
let path = entry.path();
if path.is_dir() {
if let Some(name) = path.file_name().and_then(|n| n.to_str()) {
if name.len() == 8 && name.chars().all(|c| c.is_numeric()) {
dates.push((name.to_string(), path));
}
}
}
}
if dates.is_empty() {
return Ok(None);
}
dates.sort_by(|a, b| b.0.cmp(&a.0));
Ok(Some(dates[0].1.clone()))
}
pub async fn determine_gleif_date(
gleif_date: Option<&str>,
paths: &DataPaths,
) -> anyhow::Result<String> {
if let Some(d) = gleif_date {
return Ok(d.to_string());
}
let gleif_dir = paths.cache_gleif_dir();
let mut entries = fs::read_dir(gleif_dir).await?;
let mut dates = Vec::new();
while let Some(entry) = entries.next_entry().await? {
let path = entry.path();
if path.is_dir() {
if let Some(name) = path.file_name().and_then(|n| n.to_str()) {
if name.len() == 8 && name.chars().all(|c| c.is_numeric()) {
dates.push(name.to_string());
}
}
}
}
dates.sort();
dates.last().cloned().ok_or_else(|| anyhow!("No GLEIF date found"))
}
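For illustration only (not part of this diff): an explicit date short-circuits the directory scan, otherwise the lexicographically latest YYYYMMDD directory under cache_gleif_dir() is returned.
let paths = DataPaths::new(".")?;
// Explicit date wins over whatever is on disk:
assert_eq!(determine_gleif_date(Some("20260101"), &paths).await?, "20260101");
// With e.g. 20251201/ and 20260110/ present, passing None would resolve to "20260110".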

View File

@@ -2,14 +2,20 @@
pub mod types; pub mod types;
pub mod scraper; pub mod scraper;
pub mod storage; pub mod storage;
pub mod update;
pub mod helpers; pub mod helpers;
pub mod aggregation; pub mod update_openfigi;
pub mod fx; pub mod yahoo_company_extraction;
pub mod openfigi;
pub mod yahoo;
pub mod update_parallel;
pub mod page_validation; pub mod page_validation;
pub mod atomic_writer; pub mod checkpoint_helpers;
// Corporate update modules
pub mod update;
pub mod update_companies;
pub mod update_companies_cleanse;
pub mod update_companies_enrich;
pub mod collect_exchanges;
pub mod bond_processing;
pub mod option_processing;
pub use update::run_full_update; pub use update::run_full_update;

File diff suppressed because it is too large

View File

@@ -0,0 +1,54 @@
/// Parse strike price from option ticker (e.g., "AAPL 150 CALL" -> 150.0)
pub fn parse_strike_from_ticker(ticker: &str) -> Option<f64> {
let parts: Vec<&str> = ticker.split_whitespace().collect();
for (i, part) in parts.iter().enumerate() {
if let Ok(strike) = part.parse::<f64>() {
// Check if the next word is CALL/PUT to confirm this number is the strike
if i + 1 < parts.len() && (parts[i + 1].to_uppercase() == "CALL" || parts[i + 1].to_uppercase() == "PUT") {
return Some(strike);
}
}
}
None
}
/// Parse expiration date from option ticker (e.g., "AAPL 150 CALL 01/17/25" -> timestamp)
pub fn parse_expiration_from_ticker(ticker: &str) -> Option<i64> {
let parts: Vec<&str> = ticker.split_whitespace().collect();
for part in parts {
// Look for date pattern MM/DD/YY
if part.contains('/') && part.len() >= 8 {
if let Ok(date) = chrono::NaiveDate::parse_from_str(part, "%m/%d/%y") {
return Some(date.and_hms_opt(16, 0, 0)?.and_utc().timestamp());
}
}
}
None
}
/// Parse option name to extract underlying company, issuer, and option type
///
/// Examples:
/// - "December 25 Calls on ALPHA GA" -> ("ALPHA GA", None, "call")
/// - "January 26 Puts on TESLA INC" -> ("TESLA INC", None, "put")
pub fn parse_option_name(name: &str) -> (String, Option<String>, String) {
let name_upper = name.to_uppercase();
// Detect option type
let option_type = if name_upper.contains("CALL") {
"call".to_string()
} else if name_upper.contains("PUT") {
"put".to_string()
} else {
"unknown".to_string()
};
// Try to extract underlying after "on"
if let Some(pos) = name_upper.find(" ON ") {
let underlying = name[pos + 4..].trim().to_string();
return (underlying, None, option_type);
}
// Fallback: return entire name
(name.to_string(), None, option_type)
}
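For illustration only (not part of this diff), expected behaviour of the three parsers above:
assert_eq!(parse_strike_from_ticker("AAPL 150 CALL"), Some(150.0));
assert_eq!(parse_strike_from_ticker("AAPL CALL"), None);
assert!(parse_expiration_from_ticker("AAPL 150 CALL 01/17/25").is_some()); // 2025-01-17 16:00 UTC
let (underlying, issuer, opt_type) = parse_option_name("December 25 Calls on ALPHA GA");
assert_eq!(underlying, "ALPHA GA");
assert_eq!(issuer, None);
assert_eq!(opt_type, "call");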

View File

@@ -1,179 +1,13 @@
// src/corporate/scraper.rs // src/corporate/scraper.rs
use super::{types::*}; use crate::{util::directories::DataPaths, util::logger};
//use crate::corporate::openfigi::OpenFigiClient;
use crate::{scraper::webdriver::*, util::directories::DataPaths, util::logger};
use fantoccini::{Client}; use fantoccini::{Client};
use scraper::{Html, Selector}; use scraper::{Html, Selector};
use chrono::{DateTime, Duration, NaiveDate, Utc};
use tokio::{time::{Duration as TokioDuration, sleep}};
use reqwest::Client as HttpClient;
use serde_json::{json, Value};
use zip::ZipArchive; use zip::ZipArchive;
use std::{collections::HashMap}; use std::{collections::HashMap};
use std::io::{Read}; use std::io::{Read};
const USER_AGENT: &str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"; const USER_AGENT: &str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36";
fn parse_price(v: Option<&Value>) -> f64 {
v.and_then(|x| x.as_str())
.and_then(|s| s.replace('$', "").replace(',', "").parse::<f64>().ok())
.or_else(|| v.and_then(|x| x.as_f64()))
.unwrap_or(0.0)
}
fn parse_volume(v: Option<&Value>) -> u64 {
v.and_then(|x| x.as_str())
.and_then(|s| s.replace(',', "").parse::<u64>().ok())
.or_else(|| v.and_then(|x| x.as_u64()))
.unwrap_or(0)
}
pub async fn fetch_daily_price_history(
ticker: &str,
start_str: &str,
end_str: &str,
) -> anyhow::Result<Vec<CompanyPrice>> {
let start = NaiveDate::parse_from_str(start_str, "%Y-%m-%d")?;
let end = NaiveDate::parse_from_str(end_str, "%Y-%m-%d")? + Duration::days(1);
let mut all_prices = Vec::new();
let mut current = start;
while current < end {
let chunk_end = current + Duration::days(730);
let actual_end = chunk_end.min(end);
let period1 = current.and_hms_opt(0, 0, 0).unwrap().and_utc().timestamp();
let period2 = actual_end.and_hms_opt(0, 0, 0).unwrap().and_utc().timestamp();
println!(" Fetching {ticker} {}{}", current, actual_end - Duration::days(1));
let url = format!(
"https://query1.finance.yahoo.com/v8/finance/chart/{ticker}?period1={period1}&period2={period2}&interval=1d&includeAdjustedClose=true"
);
let json: Value = HttpClient::new()
.get(&url)
.header("User-Agent", USER_AGENT)
.send()
.await?
.json()
.await?;
let result = &json["chart"]["result"][0];
let timestamps = result["timestamp"].as_array().ok_or_else(|| anyhow::anyhow!("No timestamps"))?;
let quote = &result["indicators"]["quote"][0];
let meta = &result["meta"];
let currency = meta["currency"].as_str().unwrap_or("USD").to_string();
let opens = quote["open"].as_array();
let highs = quote["high"].as_array();
let lows = quote["low"].as_array();
let closes = quote["close"].as_array();
let adj_closes = result["indicators"]["adjclose"][0]["adjclose"].as_array()
.or_else(|| closes);
let volumes = quote["volume"].as_array();
for (i, ts_val) in timestamps.iter().enumerate() {
let ts = ts_val.as_i64().unwrap_or(0);
let dt: DateTime<Utc> = DateTime::from_timestamp(ts, 0).unwrap_or_default();
let date_str = dt.format("%Y-%m-%d").to_string();
if date_str < start_str.to_string() || date_str > end_str.to_string() {
continue;
}
let open = parse_price(opens.and_then(|a| a.get(i)));
let high = parse_price(highs.and_then(|a| a.get(i)));
let low = parse_price(lows.and_then(|a| a.get(i)));
let close = parse_price(closes.and_then(|a| a.get(i)));
let adj_close = parse_price(adj_closes.and_then(|a| a.get(i)));
let volume = parse_volume(volumes.and_then(|a| a.get(i)));
all_prices.push(CompanyPrice {
ticker: ticker.to_string(),
date: date_str,
time: "".to_string(),
open,
high,
low,
close,
adj_close,
volume,
currency: currency.clone(),
});
}
sleep(TokioDuration::from_millis(200)).await;
current = actual_end;
}
all_prices.sort_by_key(|p| (p.date.clone(), p.time.clone()));
all_prices.dedup_by(|a, b| a.date == b.date && a.time == b.time);
println!(" Got {} daily bars for {ticker}", all_prices.len());
Ok(all_prices)
}
pub async fn fetch_price_history_5min(
ticker: &str,
_start: &str,
_end: &str,
) -> anyhow::Result<Vec<CompanyPrice>> {
let now = Utc::now().timestamp();
let period1 = now - 5184000;
let period2 = now;
let url = format!(
"https://query1.finance.yahoo.com/v8/finance/chart/{ticker}?period1={period1}&period2={period2}&interval=5m&includeAdjustedClose=true"
);
let json: Value = HttpClient::new()
.get(&url)
.header("User-Agent", USER_AGENT)
.send()
.await?
.json()
.await?;
let result = &json["chart"]["result"][0];
let timestamps = result["timestamp"].as_array().ok_or_else(|| anyhow::anyhow!("No timestamps"))?;
let quote = &result["indicators"]["quote"][0];
let meta = &result["meta"];
let currency = meta["currency"].as_str().unwrap_or("USD").to_string();
let mut prices = Vec::new();
for (i, ts_val) in timestamps.iter().enumerate() {
let ts = ts_val.as_i64().unwrap_or(0);
let dt: DateTime<Utc> = DateTime::from_timestamp(ts, 0).unwrap_or_default();
let date_str = dt.format("%Y-%m-%d").to_string();
let time_str = dt.format("%H:%M:%S").to_string();
let open = parse_price(quote["open"].as_array().and_then(|a| a.get(i)));
let high = parse_price(quote["high"].as_array().and_then(|a| a.get(i)));
let low = parse_price(quote["low"].as_array().and_then(|a| a.get(i)));
let close = parse_price(quote["close"].as_array().and_then(|a| a.get(i)));
let volume = parse_volume(quote["volume"].as_array().and_then(|a| a.get(i)));
prices.push(CompanyPrice {
ticker: ticker.to_string(),
date: date_str,
time: time_str,
open,
high,
low,
close,
adj_close: close,
volume,
currency: currency.clone(),
});
}
prices.sort_by_key(|p| (p.date.clone(), p.time.clone()));
Ok(prices)
}
/// Fetch the URL of the latest ISIN↔LEI mapping CSV from GLEIF /// Fetch the URL of the latest ISIN↔LEI mapping CSV from GLEIF
/// Overengineered; we could just use the static URL, but this shows how to scrape if needed /// Overengineered; we could just use the static URL, but this shows how to scrape if needed
pub async fn _fetch_latest_gleif_isin_lei_mapping_url(client: &Client) -> anyhow::Result<String> { pub async fn _fetch_latest_gleif_isin_lei_mapping_url(client: &Client) -> anyhow::Result<String> {

View File

@@ -1,15 +1,11 @@
// src/corporate/storage.rs // src/corporate/storage.rs
use super::{types::*, helpers::*};
use crate::util::directories::DataPaths; use crate::util::directories::DataPaths;
use crate::util::logger; use crate::util::logger;
use tokio::fs;
use tokio::io::AsyncWriteExt; use tokio::io::AsyncWriteExt;
use chrono::{Datelike, NaiveDate};
use std::collections::HashMap; use std::collections::HashMap;
use std::path::{PathBuf, Path}; use std::path::{PathBuf, Path};
const BATCH_SIZE: usize = 500;
/// Lightweight index entry - only metadata, no full event data /// Lightweight index entry - only metadata, no full event data
#[derive(Debug, Clone)] #[derive(Debug, Clone)]
@@ -20,258 +16,6 @@ pub struct EventIndex {
pub file_path: PathBuf, pub file_path: PathBuf,
} }
/// Build index of all events without loading them into memory
pub async fn build_event_index(paths: &DataPaths) -> anyhow::Result<Vec<EventIndex>> {
let dir = paths.corporate_events_dir();
if !dir.exists() {
logger::log_info("Corporate Storage: No events directory found").await;
return Ok(Vec::new());
}
let mut index = Vec::new();
let mut entries = fs::read_dir(dir).await?;
while let Some(entry) = entries.next_entry().await? {
let path = entry.path();
if path.extension().and_then(|s| s.to_str()) == Some("json") {
let name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
if name.starts_with("events_") && name.len() == 17 {
let content = fs::read_to_string(&path).await?;
let events: Vec<CompanyEvent> = serde_json::from_str(&content)?;
for event in events {
index.push(EventIndex {
key: event_key(&event),
ticker: event.ticker.clone(),
date: event.date.clone(),
file_path: path.clone(),
});
}
}
}
}
logger::log_info(&format!("Corporate Storage: Built index with {} entries", index.len())).await;
Ok(index)
}
/// Load specific event by key (only loads its file)
pub async fn lookup_event_by_key(
key: &str,
index: &[EventIndex]
) -> anyhow::Result<Option<CompanyEvent>> {
let entry = index.iter().find(|e| e.key == key);
if let Some(entry) = entry {
let content = fs::read_to_string(&entry.file_path).await?;
let events: Vec<CompanyEvent> = serde_json::from_str(&content)?;
Ok(events.into_iter().find(|e| event_key(e) == key))
} else {
Ok(None)
}
}
/// Stream events file by file with callback
pub async fn stream_events_with_callback<F>(
paths: &DataPaths,
mut callback: F
) -> anyhow::Result<usize>
where
F: FnMut(CompanyEvent) -> anyhow::Result<()>,
{
let dir = paths.corporate_events_dir();
if !dir.exists() {
return Ok(0);
}
let mut total = 0;
let mut entries = fs::read_dir(dir).await?;
while let Some(entry) = entries.next_entry().await? {
let path = entry.path();
if path.extension().and_then(|s| s.to_str()) == Some("json") {
let name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
if name.starts_with("events_") {
let content = fs::read_to_string(&path).await?;
let events: Vec<CompanyEvent> = serde_json::from_str(&content)?;
for event in events {
callback(event)?;
total += 1;
}
tokio::task::yield_now().await;
}
}
}
logger::log_info(&format!("Corporate Storage: Streamed {} events", total)).await;
Ok(total)
}
/// Save events organized by month (accepts Vec, not HashMap)
pub async fn save_optimized_events(
paths: &DataPaths,
events: Vec<CompanyEvent>
) -> anyhow::Result<()> {
let dir = paths.corporate_events_dir();
fs::create_dir_all(dir).await?;
logger::log_info("Corporate Storage: Removing old event files...").await;
let mut removed_count = 0;
let mut entries = fs::read_dir(dir).await?;
while let Some(entry) = entries.next_entry().await? {
let path = entry.path();
let name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
if name.starts_with("events_") && path.extension().map(|e| e == "json").unwrap_or(false) {
fs::remove_file(&path).await?;
removed_count += 1;
}
}
logger::log_info(&format!("Corporate Storage: Removed {} old files", removed_count)).await;
let total_events = events.len();
let mut sorted = events;
sorted.sort_by(|a, b| {
a.ticker.cmp(&b.ticker).then(a.date.cmp(&b.date))
});
let mut by_month: HashMap<String, Vec<CompanyEvent>> = HashMap::new();
for chunk in sorted.chunks(BATCH_SIZE) {
for e in chunk {
if let Ok(d) = NaiveDate::parse_from_str(&e.date, "%Y-%m-%d") {
let key = format!("{}-{:02}", d.year(), d.month());
by_month.entry(key).or_default().push(e.clone());
}
}
tokio::task::yield_now().await;
}
for (month, list) in by_month {
let path = dir.join(format!("events_{}.json", month));
fs::write(&path, serde_json::to_string_pretty(&list)?).await?;
logger::log_info(&format!("Saved {} events for month {}", list.len(), month)).await;
}
logger::log_info(&format!("Saved {} total events", total_events)).await;
Ok(())
}
pub async fn save_changes(
paths: &DataPaths,
changes: &[CompanyEventChange]
) -> anyhow::Result<()> {
if changes.is_empty() {
logger::log_info("Corporate Storage: No changes to save").await;
return Ok(());
}
let dir = paths.corporate_changes_dir();
fs::create_dir_all(dir).await?;
let mut by_month: HashMap<String, Vec<CompanyEventChange>> = HashMap::new();
for c in changes {
if let Ok(d) = NaiveDate::parse_from_str(&c.date, "%Y-%m-%d") {
let key = format!("{}-{:02}", d.year(), d.month());
by_month.entry(key).or_default().push(c.clone());
}
}
for (month, list) in by_month {
let path = dir.join(format!("changes_{}.json", month));
let mut all = if path.exists() {
let s = fs::read_to_string(&path).await?;
serde_json::from_str(&s).unwrap_or_default()
} else {
vec![]
};
all.extend(list.clone());
fs::write(&path, serde_json::to_string_pretty(&all)?).await?;
}
Ok(())
}
pub async fn save_prices_for_ticker(
paths: &DataPaths,
ticker: &str,
timeframe: &str,
mut prices: Vec<CompanyPrice>
) -> anyhow::Result<()> {
let base_dir = paths.corporate_prices_dir();
let company_dir = base_dir.join(ticker.replace(".", "_"));
let timeframe_dir = company_dir.join(timeframe);
fs::create_dir_all(&timeframe_dir).await?;
let path = timeframe_dir.join("prices.json");
prices.sort_by_key(|p| (p.date.clone(), p.time.clone()));
fs::write(&path, serde_json::to_string_pretty(&prices)?).await?;
Ok(())
}
pub fn get_company_dir(paths: &DataPaths, lei: &str) -> PathBuf {
paths.corporate_prices_dir().join(lei)
}
pub async fn ensure_company_dirs(paths: &DataPaths, isin: &str) -> anyhow::Result<()> {
let base = get_company_dir(paths, isin);
let paths_to_create = [
base.clone(),
base.join("5min"),
base.join("daily"),
base.join("aggregated").join("5min"),
base.join("aggregated").join("daily"),
];
for p in paths_to_create {
fs::create_dir_all(&p).await?;
}
Ok(())
}
pub async fn save_available_exchanges(
paths: &DataPaths,
isin: &str,
exchanges: Vec<AvailableExchange>
) -> anyhow::Result<()> {
let dir = get_company_dir(paths, isin);
fs::create_dir_all(&dir).await?;
let path = dir.join("available_exchanges.json");
fs::write(&path, serde_json::to_string_pretty(&exchanges)?).await?;
Ok(())
}
pub async fn load_available_exchanges(
paths: &DataPaths,
lei: &str
) -> anyhow::Result<Vec<AvailableExchange>> {
let path = get_company_dir(paths, lei).join("available_exchanges.json");
if path.exists() {
let content = fs::read_to_string(&path).await?;
Ok(serde_json::from_str(&content)?)
} else {
Ok(vec![])
}
}
pub async fn save_prices_by_source(
paths: &DataPaths,
lei: &str,
source_ticker: &str,
timeframe: &str,
prices: Vec<CompanyPrice>,
) -> anyhow::Result<()> {
let source_safe = source_ticker.replace(".", "_").replace("/", "_");
let dir = get_company_dir(paths, lei).join(timeframe).join(&source_safe);
fs::create_dir_all(&dir).await?;
let path = dir.join("prices.json");
let mut prices = prices;
prices.sort_by_key(|p| (p.date.clone(), p.time.clone()));
fs::write(&path, serde_json::to_string_pretty(&prices)?).await?;
Ok(())
}
/// Stream companies to JSONL incrementally /// Stream companies to JSONL incrementally
pub async fn save_companies_to_jsonl_streaming( pub async fn save_companies_to_jsonl_streaming(
paths: &DataPaths, paths: &DataPaths,

View File

@@ -2,42 +2,22 @@
use std::collections::HashMap; use std::collections::HashMap;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] #[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CompanyEvent { pub struct ChartData {
pub ticker: String, pub symbol: String,
pub date: String, // YYYY-MM-DD pub quotes: Vec<Quote>,
pub time: String, // "AMC", "BMO", "TAS", or "" pub timestamp: i64,
pub period: String, // "Q1 2025", "FY 2024"
pub eps_forecast: Option<f64>,
pub eps_actual: Option<f64>,
pub revenue_forecast: Option<f64>,
pub revenue_actual: Option<f64>,
pub surprise_pct: Option<f64>, // (actual - forecast) / |forecast|
pub source: String, // "Yahoo"
} }
#[derive(Debug, Clone, Serialize, Deserialize)] #[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CompanyPrice { pub struct Quote {
pub ticker: String, pub timestamp: i64,
pub date: String, // YYYY-MM-DD pub open: Option<f64>,
pub time: String, // HH:MM:SS for intraday, "" for daily pub high: Option<f64>,
pub open: f64, pub low: Option<f64>,
pub high: f64, pub close: Option<f64>,
pub low: f64, pub volume: Option<u64>,
pub close: f64, pub adjusted_close: Option<f64>,
pub adj_close: f64,
pub volume: u64,
pub currency: String,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CompanyEventChange {
pub ticker: String,
pub date: String,
pub field_changed: String, // "time", "eps_forecast", "eps_actual", "new_event"
pub old_value: String,
pub new_value: String,
pub detected_at: String,
} }
/// Figi Info based on API calls [https://www.openfigi.com/] /// Figi Info based on API calls [https://www.openfigi.com/]
@@ -47,7 +27,7 @@ pub struct CompanyEventChange {
/// # Comments /// # Comments
/// Use Mapping the Object List onto Figi Properties /// Use Mapping the Object List onto Figi Properties
#[derive(Debug, Clone, Serialize, Deserialize)] #[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FigiInfo { pub struct FigiData {
pub isin: String, pub isin: String,
pub figi: String, pub figi: String,
pub name: String, pub name: String,
@@ -69,71 +49,144 @@ pub struct FigiInfo {
/// Company Info /// Company Info
/// # Attributes /// # Attributes
/// * Name as primary key (for one instition) -> might have to changed when first FigiInfo is coming in /// * Name as primary key (for one institution) -> might have to be changed when the first FigiInfo comes in
/// * ISIN as the most liquid / preferred traded security (used for fallback) /// * ISIN as the most liquid / preferred traded security (used for fallback)
/// * securities: Grouped by ISIN, filtered for Common Stock only /// * securities: Grouped by ISIN, filtered for Common Stock only
/// * isin_tickers_map: Map of ISINs to their associated tickers across platforms
#[derive(Debug, Clone, Serialize, Deserialize)] #[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CompanyInfo{ pub struct CompanyData{
pub name: String, pub name: String,
pub primary_isin: String, pub primary_isin: String,
pub securities: HashMap<String, Vec<FigiInfo>>, // ISIN -> Vec<FigiInfo> pub securities: HashMap<String, Vec<FigiData>>, // ISIN -> Vec<FigiData>
pub yahoo_company_data: Option<Vec<YahooCompanyData>>,
pub isin_tickers_map: Option<HashMap<String, Vec<String>>>, // ISIN -> Tickers
} }
#[derive(Debug, Clone, Serialize, Deserialize)] #[derive(Debug, Clone, Serialize, Deserialize)]
pub struct YahooCompanyDetails { pub struct YahooCompanyData {
pub ticker: String, pub ticker: String,
pub sector: Option<String>, pub sector: Option<String>,
pub exchange: Option<String>, pub exchange: Option<String>,
} }
#[derive(Debug, Clone, Serialize, Deserialize)] #[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CompanyCrossPlatformInfo { pub struct WarrantData {
pub name: String, pub company_name: String, // key in CompanyData
pub isin_tickers_map: HashMap<String, Vec<String>>, // ISIN -> Tickers pub warrants: HashMap<String, WarrantDetails>, // underlying company name -> Warrant
pub sector: Option<String>,
pub exchange: Option<String>,
} }
/// Warrant Info /// Warrant Data
/// ///
/// Information for Warrant securities fetched out of Name in FigiInfo /// Information for Warrant securities fetched out of Name in FigiData
/// example1: "name": "VONTOBE-PW26 LEONARDO SPA", /// example1: "name": "VONTOBE-PW26 LEONARDO SPA",
/// issued by VONTOBEL Put Warrant for underlying company LEONARDO SPA /// issued by VONTOBEL Put Warrant for underlying company LEONARDO SPA
/// example2: "BAYER H-CW25 L'OREAL", /// example2: "BAYER H-CW25 L'OREAL",
/// other formats like only on company instead of two, underlying and issuing company are the same, leave issuer_company_name NULL /// for formats with only one company instead of two, the underlying and issuing company are the same; leave issuer_company_name NULL
#[derive(Debug, Clone, Serialize, Deserialize)] #[derive(Debug, Clone, Serialize, Deserialize)]
pub struct WarrantInfo { pub struct WarrantDetails {
pub underlying_company_name: String, // key in CompanyInfo, key for WarrantInfo pub company_name: String, // key in CompanyData, key for WarrantDetails
pub issuer_company_name: Option<String>, // key in CompanyInfo pub issuer_company_name: Option<String>, // key in CompanyData
pub warrant_type: String, // "put" or "call" pub warrant_type: String, // "put" or "call"
pub warrants: HashMap<String, Vec<FigiInfo>>, // ISIN -> Vec<FigiInfo> (grouped by ISIN) pub warrants: HashMap<String, Vec<FigiData>>, // ISIN -> Vec<FigiData> (grouped by ISIN)
} }
/// Option Info #[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OptionData {
pub company_name: String, // key in CompanyData
pub expiration_dates: Vec<i64>,
pub strikes: Vec<f64>,
pub option: Vec<OptionChain>,
pub timestamp: i64,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OptionChain {
pub expiration_date: i64,
pub calls: Vec<OptionContract>,
pub puts: Vec<OptionContract>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OptionContract {
pub strike: f64,
pub last_price: Option<f64>,
pub bid: Option<f64>,
pub ask: Option<f64>,
pub volume: Option<u64>,
pub open_interest: Option<u64>,
pub implied_volatility: Option<f64>,
}
/// Bond parsed details from ticker/description
/// ///
/// Information for Option securities fetched out of Name in FigiInfo /// Parses bond information from ticker format:
/// example1: "name": "December 25 Calls on ALPHA GA", /// Corporate: "WTFC 4.3 01/12/26 0003"
/// issued by NULL Call Option for underlying company ALPHA GA /// Government: "SLOVAK 1.5225 05/10/28 4Y"
/// other formats like only on company instead of two, underlying and issuing company are the same, leave issuer_company_name NULL
#[derive(Debug, Clone, Serialize, Deserialize)] #[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OptionInfo { pub struct BondDetails {
pub underlying_company_name: String, // key in CompanyInfo, key for OptionInfo pub coupon_rate: Option<f64>, // 4.3, 1.5225
pub issuer_company_name: Option<String>, // key in CompanyInfo pub maturity_date: Option<String>, // "2026-01-12", "2028-05-10"
pub option_type: String, // "put" or "call" pub is_floating: bool, // true if "Float" in description
pub options: HashMap<String, Vec<FigiInfo>>, // ISIN -> Vec<FigiInfo> (grouped by ISIN) pub is_zero_coupon: bool, // true if coupon is 0
pub tenor_years: Option<u32>, // Parsed from maturity or inferred
pub series_identifier: Option<String>, // "0003", "4Y", "144A", "REGS", etc.
}
/// Corporate Bond Info
///
/// Information for corporate bonds grouped by issuer
/// Example: "name": "LIBERTYVILLE BK & TRUST"
/// ticker: "WTFC 4.3 01/12/26 0003"
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CorporateBondData {
pub underlying_company_name: String, // key - company name issuing the bond
pub bonds: HashMap<String, Vec<FigiData>>, // ISIN -> Vec<FigiData> (grouped by ISIN)
#[serde(skip_serializing_if = "HashMap::is_empty", default)]
pub bond_details: HashMap<String, BondDetails>, // ISIN -> parsed bond details
}
/// Government Bond Info
///
/// Information for government bonds grouped by issuer (country/municipality)
/// Example: "name": "SLOVAK REPUBLIC"
/// ticker: "SLOVAK 1.5225 05/10/28 4Y"
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GovernmentBondData {
pub issuer_name: String, // key - government entity name
pub issuer_type: String, // "sovereign", "municipal", "state", "province", etc.
pub bonds: HashMap<String, Vec<FigiData>>, // ISIN -> Vec<FigiData> (grouped by ISIN)
#[serde(skip_serializing_if = "HashMap::is_empty", default)]
pub bond_details: HashMap<String, BondDetails>, // ISIN -> parsed bond details
} }
#[derive(Debug, Clone, Serialize, Deserialize)] #[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AvailableExchange { pub struct ExchangeData {
pub exchange_mic: String, pub mic: String,
pub ticker: String, pub ticker: String,
pub has_daily: bool,
pub has_5min: bool,
pub last_successful_fetch: Option<String>, // YYYY-MM-DD
#[serde(default)] #[serde(default)]
pub currency: String, pub currency: String,
#[serde(default)] }
pub discovered_at: Option<String>, // When this exchange was first discovered
#[serde(default)] #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub fetch_count: u32, // How many times successfully fetched pub struct CompanyEventData {
pub ticker: String,
pub date: String, // YYYY-MM-DD
pub time: String, // "AMC", "BMO", "TAS", or ""
pub period: String, // "Q1 2025", "FY 2024"
pub eps_forecast: Option<f64>,
pub eps_actual: Option<f64>,
pub revenue_forecast: Option<f64>,
pub revenue_actual: Option<f64>,
pub surprise_pct: Option<f64>, // (actual - forecast) / |forecast|
pub source: String, // "Yahoo"
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CompanyEventChangeData {
pub ticker: String,
pub date: String,
pub field_changed: String, // "time", "eps_forecast", "eps_actual", "new_event"
pub old_value: String,
pub new_value: String,
pub detected_at: String,
} }
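For illustration only (not part of this diff): a minimal sketch of turning a corporate bond ticker like "WTFC 4.3 01/12/26 0003" into the BondDetails struct above. The real logic lives in bond_processing.rs and may differ; the function name here is hypothetical.
fn sketch_parse_bond_ticker(ticker: &str) -> BondDetails {
    let parts: Vec<&str> = ticker.split_whitespace().collect();
    // Second field is the coupon ("4.3"), third the maturity ("01/12/26").
    let coupon_rate = parts.get(1).and_then(|p| p.parse::<f64>().ok());
    let maturity_date = parts.get(2)
        .and_then(|p| chrono::NaiveDate::parse_from_str(p, "%m/%d/%y").ok())
        .map(|d| d.format("%Y-%m-%d").to_string());
    BondDetails {
        coupon_rate,
        maturity_date,
        is_floating: ticker.to_uppercase().contains("FLOAT"),
        is_zero_coupon: coupon_rate == Some(0.0),
        tenor_years: None,
        series_identifier: parts.get(3).map(|s| s.to_string()), // "0003", "4Y", ...
    }
}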

View File

@@ -1,26 +1,34 @@
// src/corporate/update.rs - UPDATED WITH DATA INTEGRITY FIXES // src/corporate/update.rs
use super::{scraper::*, storage::*, helpers::*, types::*, openfigi::*, yahoo::*}; use super::{scraper::*, update_openfigi::*};
use crate::config::Config; use crate::config::Config;
use crate::corporate::update_parallel::build_companies_jsonl_streaming_parallel; use crate::check_shutdown;
use crate::corporate::update_companies::update_companies;
use crate::corporate::update_companies_cleanse::{companies_yahoo_cleansed_low_profile, companies_yahoo_cleansed_no_data};
use crate::corporate::update_companies_enrich::{enrich_companies_with_events, enrich_companies_with_chart, enrich_companies_with_option};
use crate::corporate::collect_exchanges::collect_and_save_exchanges;
use crate::economic::yahoo_update_forex::collect_fx_rates;
use crate::util::directories::DataPaths; use crate::util::directories::DataPaths;
use crate::util::logger; use crate::util::logger;
use crate::scraper::webdriver::ChromeDriverPool; use crate::scraper::webdriver::ChromeDriverPool;
use crate::scraper::yahoo::{YahooClientPool};
use crate::scraper::openfigi::load_figi_type_lists;
use chrono::Local; use std::result::Result::Ok;
use std::collections::HashMap;
use std::sync::Arc; use std::sync::Arc;
use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::atomic::{AtomicBool};
/// UPDATED: Main corporate update entry point with shutdown awareness /// Main corporate update entry point with shutdown awareness
pub async fn run_full_update( pub async fn run_full_update(
_config: &Config, config: &Config,
pool: &Arc<ChromeDriverPool>, pool: &Arc<ChromeDriverPool>,
shutdown_flag: &Arc<AtomicBool>, shutdown_flag: &Arc<AtomicBool>,
) -> anyhow::Result<()> { ) -> anyhow::Result<()> {
logger::log_info("=== Corporate Update (STREAMING MODE WITH DATA INTEGRITY) ===").await; logger::log_info("=== Corporate Update ===").await;
let paths = DataPaths::new(".")?; let paths = DataPaths::new(".")?;
check_shutdown!(shutdown_flag);
logger::log_info("Step 1: Downloading GLEIF CSV...").await; logger::log_info("Step 1: Downloading GLEIF CSV...").await;
let gleif_csv_path = match download_isin_lei_csv().await? { let gleif_csv_path = match download_isin_lei_csv().await? {
Some(p) => { Some(p) => {
@@ -33,22 +41,16 @@ pub async fn run_full_update(
} }
}; };
if shutdown_flag.load(Ordering::SeqCst) { check_shutdown!(shutdown_flag);
logger::log_warn("Shutdown detected after GLEIF download").await;
return Ok(());
}
logger::log_info("Step 2: Loading OpenFIGI metadata...").await; logger::log_info("Step 2: Loading OpenFIGI metadata...").await;
load_figi_type_lists().await.ok(); load_figi_type_lists(&paths).await.ok();
logger::log_info(" ✓ OpenFIGI metadata loaded").await; logger::log_info(" ✓ OpenFIGI metadata loaded").await;
if shutdown_flag.load(Ordering::SeqCst) { check_shutdown!(shutdown_flag);
logger::log_warn("Shutdown detected after OpenFIGI load").await;
return Ok(());
}
logger::log_info("Step 3: Checking LEI-FIGI mapping status...").await; logger::log_info("Step 3: Checking LEI-FIGI mapping status...").await;
let all_mapped = ensure_all_leis_mapped(&gleif_csv_path, None).await?; let all_mapped = update_lei_mapping(&paths, &gleif_csv_path, None).await?;
if !all_mapped { if !all_mapped {
logger::log_warn(" ⚠ Some LEIs failed to map - continuing with partial data").await; logger::log_warn(" ⚠ Some LEIs failed to map - continuing with partial data").await;
@@ -56,534 +58,72 @@ pub async fn run_full_update(
logger::log_info(" ✓ All LEIs successfully mapped").await; logger::log_info(" ✓ All LEIs successfully mapped").await;
} }
if shutdown_flag.load(Ordering::SeqCst) { check_shutdown!(shutdown_flag);
logger::log_warn("Shutdown detected after LEI-FIGI mapping").await;
return Ok(());
}
logger::log_info("Step 4: Building securities map (streaming)...").await; logger::log_info("Step 4: Building securities map (streaming)...").await;
let date_dir = find_most_recent_figi_date_dir(&paths).await?; update_securities(&paths).await?;
logger::log_info(" ✓ Securities map updated").await;
if let Some(date_dir) = date_dir { let paths = DataPaths::new(".")?;
logger::log_info(&format!(" Using FIGI data from: {:?}", date_dir)).await;
build_securities_from_figi_streaming(&date_dir).await?;
logger::log_info(" ✓ Securities map updated").await;
} else {
logger::log_warn(" ✗ No FIGI data directory found").await;
}
if shutdown_flag.load(Ordering::SeqCst) { check_shutdown!(shutdown_flag);
logger::log_warn("Shutdown detected after securities map build").await;
return Ok(());
}
logger::log_info("Step 5: Building companies.jsonl with parallel processing and validation...").await; logger::log_info("Step 5: Building companies.jsonl with Yahoo Data...").await;
let count = build_companies_jsonl_streaming_parallel(&paths, pool, shutdown_flag).await?; let count = update_companies(&paths, pool, shutdown_flag, config, &None).await?;
logger::log_info(&format!(" ✓ Saved {} companies", count)).await; logger::log_info(&format!(" ✓ Saved {} companies", count)).await;
if !shutdown_flag.load(Ordering::SeqCst) { check_shutdown!(shutdown_flag);
logger::log_info("Step 6: Processing events (using index)...").await;
let _event_index = build_event_index(&paths).await?;
logger::log_info(" ✓ Event index built").await;
} else {
logger::log_warn("Shutdown detected, skipping event index build").await;
}
logger::log_info("✓ Corporate update complete").await; logger::log_info("Step 6: Cleansing companies with missing essential data...").await;
let cleansed_count = companies_yahoo_cleansed_no_data(&paths).await?;
logger::log_info(&format!("{} companies found on Yahoo ready for further use in companies_yahoo.jsonl", cleansed_count)).await;
check_shutdown!(shutdown_flag);
let proxy_pool = pool.get_proxy_pool()
.ok_or_else(|| anyhow::anyhow!("ChromeDriverPool must be created with VPN proxy rotation enabled"))?;
logger::log_info("Creating YahooClientPool with proxy rotation...").await;
let yahoo_pool = Arc::new(YahooClientPool::new(proxy_pool, config, None).await?);
logger::log_info(&format!("✓ YahooClientPool ready with {} clients", yahoo_pool.num_clients().await)).await;
check_shutdown!(shutdown_flag);
logger::log_info("Step 7: Cleansing companies with too low profile (with abort-safe persistence)...").await;
let cleansed_count = companies_yahoo_cleansed_low_profile(&paths, config, yahoo_pool.clone(), shutdown_flag).await?;
logger::log_info(&format!("{} companies with sufficient profile ready for analytics", cleansed_count)).await;
check_shutdown!(shutdown_flag);
logger::log_info("Step 8: Enriching companies with Yahoo Events (with abort-safe persistence)...").await;
let enriched_count = enrich_companies_with_events(&paths, config, yahoo_pool.clone(), shutdown_flag).await?;
logger::log_info(&format!("{} companies enriched with event data", enriched_count)).await;
check_shutdown!(shutdown_flag);
logger::log_info("Step 9: Enriching companies with Yahoo Options (with abort-safe persistence)...").await;
let options_count = enrich_companies_with_option(&paths, config, yahoo_pool.clone(), shutdown_flag).await?;
logger::log_info(&format!("{} companies enriched with options data", options_count)).await;
check_shutdown!(shutdown_flag);
logger::log_info("Step 10: Enriching companies with Yahoo Chart (with abort-safe persistence)...").await;
let chart_count = enrich_companies_with_chart(&paths, config, yahoo_pool.clone(), shutdown_flag).await?;
logger::log_info(&format!("{} companies enriched with chart data", chart_count)).await;
check_shutdown!(shutdown_flag);
logger::log_info("Step 11: Collecting FX rates...").await;
let fx_count = collect_fx_rates(&paths, config, yahoo_pool.clone(), shutdown_flag).await?;
logger::log_info(&format!(" ✓ Collected {} FX rates", fx_count)).await;
check_shutdown!(shutdown_flag);
logger::log_info("Step 12: Collecting exchange information...").await;
let exchange_count = collect_and_save_exchanges(&paths).await?;
logger::log_info(&format!(" ✓ Collected {} exchanges", exchange_count)).await;
logger::log_info("=== Corporate update complete === ").await;
Ok(()) Ok(())
} }
/// UPDATED: Serial version with validation (kept for compatibility/debugging)
///
/// This is the non-parallel version that processes companies sequentially.
/// Updated with same validation and shutdown checks as parallel version.
///
/// Use this for:
/// - Debugging issues with specific companies
/// - Environments where parallel processing isn't desired
/// - Testing validation logic without concurrency complexity
async fn build_companies_jsonl_streaming_serial(
paths: &DataPaths,
pool: &Arc<ChromeDriverPool>,
shutdown_flag: &Arc<AtomicBool>,
) -> anyhow::Result<usize> {
// Configuration constants
const CHECKPOINT_INTERVAL: usize = 50;
const FSYNC_BATCH_SIZE: usize = 10;
const FSYNC_INTERVAL_SECS: u64 = 10;
let path = DataPaths::new(".")?;
let corporate_path = path.data_dir().join("corporate").join("by_name");
let securities_path = corporate_path.join("common_stocks.json");
if !securities_path.exists() {
logger::log_warn("No common_stocks.json found").await;
return Ok(0);
}
let content = tokio::fs::read_to_string(securities_path).await?;
let securities: HashMap<String, CompanyInfo> = serde_json::from_str(&content)?;
let companies_path = paths.data_dir().join("companies.jsonl");
let log_path = paths.data_dir().join("companies_updates.log");
if let Some(parent) = companies_path.parent() {
tokio::fs::create_dir_all(parent).await?;
}
// === RECOVERY PHASE: Load checkpoint + replay log ===
let mut existing_companies: HashMap<String, CompanyCrossPlatformInfo> = HashMap::new();
let mut processed_names: std::collections::HashSet<String> = std::collections::HashSet::new();
if companies_path.exists() {
logger::log_info("Loading checkpoint from companies.jsonl...").await;
let existing_content = tokio::fs::read_to_string(&companies_path).await?;
for line in existing_content.lines() {
if line.trim().is_empty() {
continue;
}
match serde_json::from_str::<CompanyCrossPlatformInfo>(line) {
Ok(company) => {
processed_names.insert(company.name.clone());
existing_companies.insert(company.name.clone(), company);
}
Err(e) => {
logger::log_warn(&format!("Skipping invalid checkpoint line: {}", e)).await;
}
}
}
logger::log_info(&format!("Loaded checkpoint with {} companies", existing_companies.len())).await;
}
if log_path.exists() {
logger::log_info("Replaying update log...").await;
let log_content = tokio::fs::read_to_string(&log_path).await?;
let mut replayed = 0;
for line in log_content.lines() {
if line.trim().is_empty() {
continue;
}
match serde_json::from_str::<CompanyCrossPlatformInfo>(line) {
Ok(company) => {
processed_names.insert(company.name.clone());
existing_companies.insert(company.name.clone(), company);
replayed += 1;
}
Err(e) => {
logger::log_warn(&format!("Skipping invalid log line: {}", e)).await;
}
}
}
if replayed > 0 {
logger::log_info(&format!("Replayed {} updates from log", replayed)).await;
}
}
// === OPEN LOG FILE ===
use tokio::fs::OpenOptions;
use tokio::io::AsyncWriteExt;
let mut log_file = OpenOptions::new()
.create(true)
.append(true)
.open(&log_path)
.await?;
let mut writes_since_fsync = 0;
let mut last_fsync = std::time::Instant::now();
let mut updates_since_checkpoint = 0;
let mut count = 0;
let mut new_count = 0;
let mut updated_count = 0;
logger::log_info(&format!("Processing {} companies sequentially...", securities.len())).await;
// === PROCESS COMPANIES SEQUENTIALLY ===
for (name, company_info) in securities.clone() {
// Check shutdown before each company
if shutdown_flag.load(Ordering::SeqCst) {
logger::log_warn(&format!(
"Shutdown detected at company: {} (progress: {}/{})",
name, count, count + securities.len()
)).await;
break;
}
let existing_entry = existing_companies.get(&name).cloned();
let is_update = existing_entry.is_some();
// Process company with validation
match process_single_company_serial(
name.clone(),
company_info,
existing_entry,
pool,
shutdown_flag,
).await {
Ok(Some(company_entry)) => {
// Write to log
let line = serde_json::to_string(&company_entry)?;
log_file.write_all(line.as_bytes()).await?;
log_file.write_all(b"\n").await?;
writes_since_fsync += 1;
// Batched + time-based fsync
let should_fsync = writes_since_fsync >= FSYNC_BATCH_SIZE
|| last_fsync.elapsed().as_secs() >= FSYNC_INTERVAL_SECS;
if should_fsync {
log_file.flush().await?;
log_file.sync_data().await?;
writes_since_fsync = 0;
last_fsync = std::time::Instant::now();
}
// Update in-memory state
processed_names.insert(name.clone());
existing_companies.insert(name.clone(), company_entry);
count += 1;
updates_since_checkpoint += 1;
if is_update {
updated_count += 1;
} else {
new_count += 1;
}
// Periodic checkpoint
if updates_since_checkpoint >= CHECKPOINT_INTERVAL {
if writes_since_fsync > 0 {
log_file.flush().await?;
log_file.sync_data().await?;
writes_since_fsync = 0;
last_fsync = std::time::Instant::now();
}
logger::log_info(&format!("Creating checkpoint at {} companies...", count)).await;
let checkpoint_tmp = companies_path.with_extension("jsonl.tmp");
let mut checkpoint_file = tokio::fs::File::create(&checkpoint_tmp).await?;
for company in existing_companies.values() {
let line = serde_json::to_string(company)?;
checkpoint_file.write_all(line.as_bytes()).await?;
checkpoint_file.write_all(b"\n").await?;
}
checkpoint_file.flush().await?;
checkpoint_file.sync_all().await?;
drop(checkpoint_file);
tokio::fs::rename(&checkpoint_tmp, &companies_path).await?;
drop(log_file);
tokio::fs::remove_file(&log_path).await.ok();
log_file = OpenOptions::new()
.create(true)
.append(true)
.open(&log_path)
.await?;
updates_since_checkpoint = 0;
logger::log_info("✓ Checkpoint created and log cleared").await;
}
if count % 10 == 0 {
logger::log_info(&format!(
"Progress: {} companies ({} new, {} updated)",
count, new_count, updated_count
)).await;
}
}
Ok(None) => {
// Company had no ISINs or was skipped
logger::log_info(&format!("Skipped company: {} (no ISINs)", name)).await;
}
Err(e) => {
logger::log_warn(&format!("Error processing company {}: {}", name, e)).await;
}
}
// Time-based fsync
if writes_since_fsync > 0 && last_fsync.elapsed().as_secs() >= FSYNC_INTERVAL_SECS {
log_file.flush().await?;
log_file.sync_data().await?;
writes_since_fsync = 0;
last_fsync = std::time::Instant::now();
}
}
// === FSYNC PENDING WRITES ===
if writes_since_fsync > 0 {
logger::log_info(&format!("Fsyncing {} pending writes...", writes_since_fsync)).await;
log_file.flush().await?;
log_file.sync_data().await?;
logger::log_info("✓ Pending writes saved").await;
}
// === FINAL CHECKPOINT ===
if !shutdown_flag.load(Ordering::SeqCst) && updates_since_checkpoint > 0 {
logger::log_info("Creating final checkpoint...").await;
let checkpoint_tmp = companies_path.with_extension("jsonl.tmp");
let mut checkpoint_file = tokio::fs::File::create(&checkpoint_tmp).await?;
for company in existing_companies.values() {
let line = serde_json::to_string(company)?;
checkpoint_file.write_all(line.as_bytes()).await?;
checkpoint_file.write_all(b"\n").await?;
}
checkpoint_file.flush().await?;
checkpoint_file.sync_all().await?;
drop(checkpoint_file);
tokio::fs::rename(&checkpoint_tmp, &companies_path).await?;
drop(log_file);
tokio::fs::remove_file(&log_path).await.ok();
logger::log_info("✓ Final checkpoint created").await;
}
logger::log_info(&format!(
"Completed: {} total companies ({} new, {} updated)",
count, new_count, updated_count
)).await;
Ok(count)
}
/// UPDATED: Process single company serially with validation
async fn process_single_company_serial(
name: String,
company_info: CompanyInfo,
existing_entry: Option<CompanyCrossPlatformInfo>,
pool: &Arc<ChromeDriverPool>,
shutdown_flag: &Arc<AtomicBool>,
) -> anyhow::Result<Option<CompanyCrossPlatformInfo>> {
// Check shutdown at start
if shutdown_flag.load(Ordering::SeqCst) {
return Ok(None);
}
let mut isin_tickers_map: HashMap<String, Vec<String>> =
existing_entry
.as_ref()
.map(|e| e.isin_tickers_map.clone())
.unwrap_or_default();
let mut sector = existing_entry.as_ref().and_then(|e| e.sector.clone());
let mut exchange = existing_entry.as_ref().and_then(|e| e.exchange.clone());
// Collect unique ISIN-ticker pairs
let mut unique_isin_ticker_pairs: HashMap<String, Vec<String>> = HashMap::new();
for figi_infos in company_info.securities.values() {
for figi_info in figi_infos {
if !figi_info.isin.is_empty() {
let tickers = unique_isin_ticker_pairs
.entry(figi_info.isin.clone())
.or_insert_with(Vec::new);
if !figi_info.ticker.is_empty() && !tickers.contains(&figi_info.ticker) {
tickers.push(figi_info.ticker.clone());
}
}
}
}
// Process each ISIN with validation
for (isin, figi_tickers) in unique_isin_ticker_pairs {
// Check shutdown before each ISIN
if shutdown_flag.load(Ordering::SeqCst) {
return Ok(None);
}
let tickers = isin_tickers_map
.entry(isin.clone())
.or_insert_with(Vec::new);
for figi_ticker in figi_tickers {
if !tickers.contains(&figi_ticker) {
tickers.push(figi_ticker);
}
}
let has_yahoo_ticker = tickers.iter().any(|t| t.starts_with("YAHOO:"));
if !has_yahoo_ticker {
logger::log_info(&format!("Fetching Yahoo details for {} (ISIN: {})", name, isin)).await;
// Use validated scraping with retry
match scrape_with_retry_serial(pool, &isin, 3, shutdown_flag).await {
Ok(Some(details)) => {
logger::log_info(&format!(
"✓ Found Yahoo ticker {} for ISIN {} (company: {})",
details.ticker, isin, name
)).await;
tickers.push(format!("YAHOO:{}", details.ticker));
if sector.is_none() && details.sector.is_some() {
sector = details.sector.clone();
}
if exchange.is_none() && details.exchange.is_some() {
exchange = details.exchange.clone();
}
},
Ok(None) => {
logger::log_warn(&format!("◯ No search results for ISIN {} (company: {})", isin, name)).await;
tickers.push("YAHOO:NO_RESULTS".to_string());
},
Err(e) => {
if shutdown_flag.load(Ordering::SeqCst) {
return Ok(None);
}
logger::log_warn(&format!(
"✗ Yahoo lookup error for ISIN {} (company: {}): {}",
isin, name, e
)).await;
}
}
}
}
// Final shutdown check
if shutdown_flag.load(Ordering::SeqCst) {
return Ok(None);
}
if !isin_tickers_map.is_empty() {
Ok(Some(CompanyCrossPlatformInfo {
name,
isin_tickers_map,
sector,
exchange,
}))
} else {
Ok(None)
}
}
/// UPDATED: Scrape with retry for serial processing
async fn scrape_with_retry_serial(
pool: &Arc<ChromeDriverPool>,
isin: &str,
max_retries: u32,
shutdown_flag: &Arc<AtomicBool>,
) -> anyhow::Result<Option<YahooCompanyDetails>> {
let mut retries = 0;
loop {
if shutdown_flag.load(Ordering::SeqCst) {
return Err(anyhow::anyhow!("Aborted due to shutdown"));
}
match scrape_company_details_by_isin(pool, isin, shutdown_flag).await {
Ok(result) => return Ok(result),
Err(e) => {
if retries >= max_retries {
return Err(e);
}
let backoff_ms = 1000 * 2u64.pow(retries);
let jitter_ms = random_range(0, 500);
let total_delay = backoff_ms + jitter_ms;
logger::log_warn(&format!(
"Retry {}/{} for ISIN {} after {}ms: {}",
retries + 1, max_retries, isin, total_delay, e
)).await;
tokio::time::sleep(tokio::time::Duration::from_millis(total_delay)).await;
retries += 1;
}
}
}
}
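// A minimal, standalone sketch of the delay schedule produced by the retry
// loops above (exponential backoff with jitter). It uses `rand::Rng` directly
// as a stand-in for the crate's `random_range` helper; that substitution is an
// assumption made only for this illustration.
#[allow(dead_code)]
fn retry_delay_ms(retry: u32) -> u64 {
use rand::Rng;
let backoff_ms = 1000 * 2u64.pow(retry); // 1s, 2s, 4s, ...
let jitter_ms = rand::thread_rng().gen_range(0u64..500); // de-synchronizes concurrent workers
backoff_ms + jitter_ms
}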
async fn find_most_recent_figi_date_dir(paths: &DataPaths) -> anyhow::Result<Option<std::path::PathBuf>> {
let map_cache_dir = paths.cache_gleif_openfigi_map_dir();
if !map_cache_dir.exists() {
return Ok(None);
}
let mut entries = tokio::fs::read_dir(&map_cache_dir).await?;
let mut dates = Vec::new();
while let Some(entry) = entries.next_entry().await? {
let path = entry.path();
if path.is_dir() {
if let Some(name) = path.file_name().and_then(|n| n.to_str()) {
if name.len() == 8 && name.chars().all(|c| c.is_numeric()) {
dates.push((name.to_string(), path));
}
}
}
}
if dates.is_empty() {
return Ok(None);
}
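// YYYYMMDD strings sort lexicographically in date order, so a descending sort puts the newest directory first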
dates.sort_by(|a, b| b.0.cmp(&a.0));
Ok(Some(dates[0].1.clone()))
}
pub struct ProcessResult {
pub changes: Vec<CompanyEventChange>,
}
pub fn process_batch(
new_events: &[CompanyEvent],
existing: &mut HashMap<String, CompanyEvent>,
today: &str,
) -> ProcessResult {
let mut changes = Vec::new();
for new in new_events {
let key = event_key(new);
if let Some(old) = existing.get(&key) {
changes.extend(detect_changes(old, new, today));
existing.insert(key, new.clone());
continue;
}
let date_key = format!("{}|{}", new.ticker, new.date);
let mut found_old = None;
for (k, e) in existing.iter() {
if format!("{}|{}", e.ticker, e.date) == date_key && k != &key {
found_old = Some((k.clone(), e.clone()));
break;
}
}
if let Some((old_key, old_event)) = found_old {
if new.date.as_str() > today {
changes.push(CompanyEventChange {
ticker: new.ticker.clone(),
date: new.date.clone(),
field_changed: "time".to_string(),
old_value: old_event.time.clone(),
new_value: new.time.clone(),
detected_at: Local::now().format("%Y-%m-%d %H:%M:%S").to_string(),
});
}
existing.remove(&old_key);
}
existing.insert(key, new.clone());
}
ProcessResult { changes }
}

View File

@@ -0,0 +1,907 @@
// src/corporate/update_companies.rs
use super::{types::*, yahoo_company_extraction::*, helpers::*};
use crate::util::directories::DataPaths;
use crate::util::integrity::{DataStage, StateManager, file_reference};
use crate::util::logger;
use crate::scraper::webdriver::ChromeDriverPool;
use crate::scraper::hard_reset::perform_hard_reset;
use crate::corporate::checkpoint_helpers;
use crate::config::Config;
use tokio::sync::mpsc;
use tokio::io::AsyncWriteExt;
use tokio::fs::OpenOptions;
use tokio::time::sleep;
use std::collections::HashMap;
use std::sync::Arc;
use std::sync::atomic::{AtomicBool, Ordering};
use std::time::Duration;
use futures::stream::{FuturesUnordered, StreamExt};
use anyhow::{anyhow, Result};
/// Represents a write command to be serialized through the log writer
enum LogCommand {
Write(CompanyData),
Checkpoint,
Shutdown,
}
/// Result from processing a single company
struct CompanyProcessResult {
company: CompanyData,
is_update: bool,
}
/// Check if a company needs Yahoo data processing
/// Returns true if company has incomplete data (needs processing)
fn company_needs_processing(
company_name: &str,
company_info: &CompanyData,
existing_companies: &HashMap<String, CompanyData>,
) -> bool {
// If company not in existing data at all, definitely needs processing
let Some(existing_entry) = existing_companies.get(company_name) else {
return true;
};
// Collect all ISINs this company should have
let mut required_isins = std::collections::HashSet::new();
for figi_infos in company_info.securities.values() {
for figi_info in figi_infos {
if !figi_info.isin.is_empty() {
required_isins.insert(figi_info.isin.clone());
}
}
}
// Check each required ISIN
for isin in required_isins {
// Check if this ISIN exists in the company's ticker map
if let Some(map) = &existing_entry.isin_tickers_map {
if let Some(tickers) = map.get(&isin) {
// Check if this ISIN has valid Yahoo data
let has_valid_yahoo = tickers.iter().any(|t| {
t.starts_with("YAHOO:") &&
t != "YAHOO:ERROR" //&& // Error marker means needs retry
//t != "YAHOO:NO_RESULTS" // This is actually valid (legitimately not found)
});
// If no valid Yahoo data for this ISIN, company needs processing
if !has_valid_yahoo {
return true;
}
} else {
// ISIN not in map at all, needs processing
return true;
}
} else {
// No isin_tickers_map at all, needs processing
return true;
}
}
// All ISINs have valid Yahoo data, skip this company
false
}
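// A small illustrative sketch of the ticker-marker convention the check above
// relies on: "YAHOO:<symbol>" is resolved data, "YAHOO:NO_RESULTS" is a
// legitimate miss (still resolved), and "YAHOO:ERROR" marks a failed lookup
// that should be retried. `is_resolved_yahoo_marker` is a hypothetical helper
// named here only for illustration.
#[allow(dead_code)]
fn is_resolved_yahoo_marker(ticker: &str) -> bool {
ticker.starts_with("YAHOO:") && ticker != "YAHOO:ERROR"
}
#[cfg(test)]
mod yahoo_marker_tests {
use super::*;
#[test]
fn markers_classify_as_expected() {
assert!(is_resolved_yahoo_marker("YAHOO:AAPL"));
assert!(is_resolved_yahoo_marker("YAHOO:NO_RESULTS")); // counts as resolved
assert!(!is_resolved_yahoo_marker("YAHOO:ERROR")); // needs reprocessing
}
}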
/// Abort-safe incremental JSONL persistence with proper hard reset handling
pub async fn update_companies(
paths: &DataPaths,
pool: &Arc<ChromeDriverPool>,
shutdown_flag: &Arc<AtomicBool>,
config: &Config,
monitoring: &Option<crate::monitoring::MonitoringHandle>,
) -> anyhow::Result<usize> {
// Configuration constants
const CHECKPOINT_INTERVAL: usize = 50;
const FSYNC_BATCH_SIZE: usize = 10;
const FSYNC_INTERVAL_SECS: u64 = 10;
const CONCURRENCY_LIMIT: usize = 100;
// Wrap pool in mutex for potential replacement
let pool_mutex = Arc::new(tokio::sync::Mutex::new(Arc::clone(pool)));
// Synchronization for hard reset
let reset_in_progress = Arc::new(tokio::sync::Mutex::new(false));
let securities_path = paths.figi_securities_dir();
let securities_checkpoint = securities_path.join("common_stocks.jsonl");
let securities_log = securities_path.join("common_stocks.log.jsonl");
if !securities_checkpoint.exists() {
logger::log_warn("No common_stocks.jsonl found").await;
return Ok(0);
}
// Load securities from checkpoint and replay log
logger::log_info("Loading common stocks from JSONL checkpoint and log...").await;
let securities = load_securities_from_jsonl(&securities_checkpoint, &securities_log).await?;
logger::log_info(&format!("Loaded {} companies from common stocks", securities.len())).await;
let companies_path = paths.data_dir().join("companies.jsonl");
let log_path = paths.data_dir().join("companies_updates.log");
if let Some(parent) = companies_path.parent() {
tokio::fs::create_dir_all(parent).await?;
}
let manager = StateManager::new(paths.integrity_dir()).await?;
let content_reference = file_reference(&companies_path);
let step_name = "corporate_companies_update";
let data_stage = DataStage::Data;
if manager.is_step_valid(step_name).await? {
logger::log_info(" Companies data already built and valid").await;
return Ok(securities.len());
}
logger::log_info(" Companies data incomplete or missing, proceeding with update").await;
let entry: crate::util::integrity::StateEntry = manager.create_entry(step_name.to_string(), content_reference, data_stage).await?;
// === RECOVERY PHASE: Load checkpoint + replay log ===
let existing_companies = checkpoint_helpers::load_checkpoint_with_log(
&companies_path,
&log_path,
"companies.jsonl"
).await?;
// === SETUP LOG WRITER TASK ===
let (write_tx, mut write_rx) = mpsc::channel::<LogCommand>(1000);
let log_file_init = OpenOptions::new()
.create(true)
.append(true)
.open(&log_path)
.await?;
let companies_path_clone = companies_path.clone();
let log_path_clone = log_path.clone();
let existing_companies_writer = Arc::new(tokio::sync::Mutex::new(existing_companies.clone()));
// Clone the Arc for the writer task (Arc clone is cheap, just increments ref count)
let existing_companies_writer_for_task = Arc::clone(&existing_companies_writer);
let write_tx_for_writer = write_tx.clone();
let writer_task = tokio::spawn(async move {
let mut log_file = log_file_init;
let mut writes_since_fsync = 0;
let mut last_fsync = std::time::Instant::now();
let mut updates_since_checkpoint = 0;
let mut count = 0;
let mut new_count = 0;
let mut updated_count = 0;
while let Some(cmd) = write_rx.recv().await {
match cmd {
LogCommand::Write(company) => {
// Write to log
let line = serde_json::to_string(&company).unwrap();
if let Err(e) = log_file.write_all(line.as_bytes()).await {
logger::log_error(&format!("Failed to write to log: {}", e)).await;
break;
}
if let Err(e) = log_file.write_all(b"\n").await {
logger::log_error(&format!("Failed to write newline: {}", e)).await;
break;
}
writes_since_fsync += 1;
updates_since_checkpoint += 1;
count += 1;
// Update in-memory state
let mut existing_companies = existing_companies_writer_for_task.lock().await;
let is_update = existing_companies.contains_key(&company.name);
existing_companies.insert(company.name.clone(), company);
drop(existing_companies);
if is_update {
updated_count += 1;
} else {
new_count += 1;
}
// Batched + time-based fsync
let should_fsync = writes_since_fsync >= FSYNC_BATCH_SIZE
|| last_fsync.elapsed().as_secs() >= FSYNC_INTERVAL_SECS;
if should_fsync {
if let Err(e) = log_file.flush().await {
logger::log_error(&format!("Failed to flush: {}", e)).await;
break;
}
if let Err(e) = log_file.sync_data().await {
logger::log_error(&format!("Failed to fsync: {}", e)).await;
break;
}
writes_since_fsync = 0;
last_fsync = std::time::Instant::now();
}
}
LogCommand::Checkpoint => {
if let Err(e) = log_file.flush().await {
logger::log_error(&format!("Failed to flush before checkpoint: {}", e)).await;
break;
}
if let Err(e) = log_file.sync_data().await {
logger::log_error(&format!("Failed to fsync before checkpoint: {}", e)).await;
break;
}
let existing_companies = existing_companies_writer_for_task.lock().await;
let companies_vec: Vec<_> = existing_companies.values().cloned().collect();
drop(existing_companies);
let temp_path = companies_path_clone.with_extension("tmp");
match tokio::fs::File::create(&temp_path).await {
Ok(mut temp_file) => {
let mut checkpoint_ok = true;
for company in &companies_vec {
if let Ok(line) = serde_json::to_string(company) {
if temp_file.write_all(line.as_bytes()).await.is_err() ||
temp_file.write_all(b"\n").await.is_err() {
checkpoint_ok = false;
break;
}
}
}
if checkpoint_ok {
if temp_file.flush().await.is_ok() &&
temp_file.sync_data().await.is_ok() {
drop(temp_file);
if tokio::fs::rename(&temp_path, &companies_path_clone).await.is_ok() {
if tokio::fs::remove_file(&log_path_clone).await.is_ok() {
logger::log_info(&format!(
"✓ Checkpoint created ({} companies), log cleared",
companies_vec.len()
)).await;
if let Ok(new_log) = OpenOptions::new()
.create(true)
.append(true)
.open(&log_path_clone)
.await {
log_file = new_log;
}
}
}
}
}
}
Err(e) => {
logger::log_error(&format!("Failed to create checkpoint temp file: {}", e)).await;
}
}
updates_since_checkpoint = 0;
}
LogCommand::Shutdown => {
logger::log_info("Writer shutting down...").await;
break;
}
}
// Periodic checkpoint trigger; try_send avoids the writer blocking on its own
// (possibly full) channel, and resetting the counter prevents re-triggering on
// every queued write before the checkpoint command is processed
if updates_since_checkpoint >= CHECKPOINT_INTERVAL {
let _ = write_tx.try_send(LogCommand::Checkpoint);
updates_since_checkpoint = 0;
}
}
// Final fsync
let _ = log_file.flush().await;
let _ = log_file.sync_data().await;
logger::log_info(&format!(
"Writer finished: {} total ({} new, {} updated)",
count, new_count, updated_count
)).await;
(count, new_count, updated_count)
});
// === MAIN PROCESSING LOOP ===
let total = securities.len();
logger::log_info(&format!("Processing {} companies with concurrency limit {}", total, CONCURRENCY_LIMIT)).await;
let mut tasks = FuturesUnordered::new();
// Build initial pending list with proper filtering
let mut pending: Vec<(String, CompanyData)> = securities.iter()
.filter(|(name, info)| company_needs_processing(name, info, &existing_companies))
.map(|(name, info)| (name.clone(), info.clone()))
.collect();
logger::log_info(&format!(
"Initial scan: {} companies need processing ({} already complete)",
pending.len(),
total - pending.len()
)).await;
let mut processed = 0;
let mut hard_reset_count = 0;
// Spawn initial batch
for _ in 0..CONCURRENCY_LIMIT.min(pending.len()) {
if let Some((name, company_info)) = pending.pop() {
let current_pool = {
let pool_guard = pool_mutex.lock().await;
Arc::clone(&*pool_guard)
};
let existing = existing_companies.get(&name).cloned();
let shutdown_flag_clone = Arc::clone(shutdown_flag);
let task = tokio::spawn(async move {
process_single_company_validated(
name,
company_info,
existing,
&current_pool,
&shutdown_flag_clone,
).await
});
tasks.push(task);
}
}
// Process results and spawn new tasks
while let Some(task_result) = tasks.next().await {
// Check for shutdown
if shutdown_flag.load(Ordering::SeqCst) {
logger::log_warn("Shutdown signal received, stopping processing").await;
break;
}
match task_result {
Ok(Ok(Some(result))) => {
// Success: send to writer
let _ = write_tx_for_writer.send(LogCommand::Write(result.company)).await;
processed += 1;
// Log progress every 100 companies
if processed % 100 == 0 {
logger::log_info(&format!(
"Progress: {}/{} companies processed ({} resets)",
processed,
total,
hard_reset_count
)).await;
}
// Spawn next task if available
if let Some((name, company_info)) = pending.pop() {
let current_pool = {
let pool_guard = pool_mutex.lock().await;
Arc::clone(&*pool_guard)
};
let existing = existing_companies.get(&name).cloned();
let shutdown_flag_clone = Arc::clone(shutdown_flag);
let task = tokio::spawn(async move {
process_single_company_validated(
name,
company_info,
existing,
&current_pool,
&shutdown_flag_clone,
).await
});
tasks.push(task);
}
}
Ok(Ok(None)) => {
// No result (shutdown or skip)
processed += 1;
if let Some((name, company_info)) = pending.pop() {
let current_pool = {
let pool_guard = pool_mutex.lock().await;
Arc::clone(&*pool_guard)
};
let existing = existing_companies.get(&name).cloned();
let shutdown_flag_clone = Arc::clone(shutdown_flag);
let task = tokio::spawn(async move {
process_single_company_validated(
name,
company_info,
existing,
&current_pool,
&shutdown_flag_clone,
).await
});
tasks.push(task);
}
}
Ok(Err(e)) => {
let error_msg = e.to_string();
if error_msg.contains("HARD_RESET_REQUIRED") {
// Check if reset already in progress (race condition protection)
let mut reset_lock = reset_in_progress.lock().await;
if *reset_lock {
logger::log_info("Hard reset already in progress, skipping duplicate").await;
processed += 1;
continue;
}
*reset_lock = true;
drop(reset_lock); // Release lock during reset
logger::log_error("🔴 HARD RESET THRESHOLD REACHED - INITIATING RESET SEQUENCE").await;
logger::log_warn("Draining active tasks before hard reset...").await;
// Save remaining pending count
let remaining_count = pending.len();
// Stop spawning new tasks
pending.clear();
// Wait for all active tasks to complete
let mut drained = 0;
while let Some(_) = tasks.next().await {
drained += 1;
if drained % 10 == 0 {
logger::log_info(&format!("Drained {} tasks...", drained)).await;
}
}
logger::log_info(&format!(
"All {} active tasks drained. {} companies need reprocessing.",
drained,
remaining_count
)).await;
// Perform the actual hard reset
match perform_hard_reset(&pool_mutex, config, paths, monitoring, shutdown_flag).await {
Ok(()) => {
logger::log_info("✅ Hard reset completed successfully").await;
hard_reset_count += 1;
// Reset the error counter
{
let pool_guard = pool_mutex.lock().await;
let current_pool = Arc::clone(&*pool_guard);
current_pool.get_reset_controller().reset();
}
logger::log_info("✓ Error counter cleared").await;
// Rebuild pending list by checking which companies need processing
logger::log_info("Rebuilding pending queue with proper Yahoo data checks...").await;
// Get current state of written companies
let current_existing = {
let companies = existing_companies_writer.lock().await;
companies.clone()
};
// Reload all securities from disk (checkpoint + log)
logger::log_info("Reloading securities from JSONL...").await;
let all_securities = load_securities_from_jsonl(&securities_checkpoint, &securities_log).await?;
logger::log_info(&format!("Reloaded {} companies", all_securities.len())).await;
// Build pending list: only companies that need processing
pending = all_securities.iter()
.filter(|(name, info)| company_needs_processing(name, info, &current_existing))
.map(|(name, info)| (name.clone(), info.clone()))
.collect();
logger::log_info(&format!(
"Restarting with {} remaining companies (out of {} total)",
pending.len(),
total
)).await;
// Only continue if there's work to do
if pending.is_empty() {
logger::log_info("All companies have complete data, exiting").await;
// Clear reset flag
let mut reset_lock = reset_in_progress.lock().await;
*reset_lock = false;
drop(reset_lock);
break; // Exit main loop
}
// Respawn initial batch with NEW pool
for _ in 0..CONCURRENCY_LIMIT.min(pending.len()) {
if let Some((name, company_info)) = pending.pop() {
let current_pool = {
let pool_guard = pool_mutex.lock().await;
Arc::clone(&*pool_guard)
};
// Use the up-to-date state captured after the reset, not the pre-run snapshot
let existing = current_existing.get(&name).cloned();
let shutdown_flag_clone = Arc::clone(shutdown_flag);
let task = tokio::spawn(async move {
process_single_company_validated(
name,
company_info,
existing,
&current_pool,
&shutdown_flag_clone,
).await
});
tasks.push(task);
}
}
// Clear reset flag
let mut reset_lock = reset_in_progress.lock().await;
*reset_lock = false;
drop(reset_lock);
// ✅ Continue processing (don't spawn duplicate task)
continue;
}
Err(reset_err) => {
logger::log_error(&format!("Hard reset failed: {}", reset_err)).await;
// Clear reset flag
let mut reset_lock = reset_in_progress.lock().await;
*reset_lock = false;
drop(reset_lock);
// Exit if hard reset fails
break;
}
}
} else {
// Regular error
logger::log_warn(&format!("Company processing error: {}", error_msg)).await;
processed += 1;
// Spawn next task
if let Some((name, company_info)) = pending.pop() {
let current_pool = {
let pool_guard = pool_mutex.lock().await;
Arc::clone(&*pool_guard)
};
let existing = existing_companies.get(&name).cloned();
let shutdown_flag_clone = Arc::clone(shutdown_flag);
let task = tokio::spawn(async move {
process_single_company_validated(
name,
company_info,
existing,
&current_pool,
&shutdown_flag_clone,
).await
});
tasks.push(task);
}
}
}
Err(e) => {
// Task panic
logger::log_error(&format!("Task panic: {}", e)).await;
processed += 1;
// Spawn next task
if let Some((name, company_info)) = pending.pop() {
let current_pool = {
let pool_guard = pool_mutex.lock().await;
Arc::clone(&*pool_guard)
};
let existing = existing_companies.get(&name).cloned();
let shutdown_flag_clone = Arc::clone(shutdown_flag);
let task = tokio::spawn(async move {
process_single_company_validated(
name,
company_info,
existing,
&current_pool,
&shutdown_flag_clone,
).await
});
tasks.push(task);
}
}
}
}
logger::log_info("Main processing loop completed").await;
// Signal writer to finish
let _ = write_tx_for_writer.send(LogCommand::Checkpoint).await;
let _ = write_tx_for_writer.send(LogCommand::Shutdown).await;
drop(write_tx_for_writer);
// Wait for writer to finish
let (final_count, final_new, final_updated) = writer_task.await
.unwrap_or((0, 0, 0));
logger::log_info(&format!(
"✅ Completed: {} total companies ({} new, {} updated, {} hard resets)",
final_count, final_new, final_updated, hard_reset_count
)).await;
// Track completion with:
// - Content reference: companies.jsonl output file
// - Data stage: Data (7-day TTL) - securities data is relatively stable
// - Dependencies: LEI-FIGI mapping must be valid
// Check for shutdown BEFORE marking complete
if shutdown_flag.load(Ordering::SeqCst) {
logger::log_warn("Shutdown detected during company update - marking as invalid for retry").await;
manager.mark_invalid(
entry,
format!("Invalid: processed {} of {} companies before shutdown", final_count, total),
).await?;
} else {
// Only mark complete if we got here without shutdown
manager.mark_valid(entry).await?;
}
Ok(final_count)
}
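// A condensed sketch of the checkpoint rotation the writer task above performs:
// write the full in-memory state to a temp file, fsync it, atomically rename it
// over the checkpoint, then start a fresh log. The paths and the use of
// `serde_json::Value` as the record type are illustrative only.
#[allow(dead_code)]
async fn rotate_checkpoint(
checkpoint: &std::path::Path,
log: &std::path::Path,
records: &[serde_json::Value],
) -> anyhow::Result<()> {
use tokio::io::AsyncWriteExt;
let tmp = checkpoint.with_extension("tmp");
let mut f = tokio::fs::File::create(&tmp).await?;
for r in records {
f.write_all(serde_json::to_string(r)?.as_bytes()).await?;
f.write_all(b"\n").await?;
}
f.flush().await?;
f.sync_data().await?; // durable before the rename makes it visible
drop(f);
tokio::fs::rename(&tmp, checkpoint).await?; // atomic replace on the same filesystem
tokio::fs::File::create(log).await?; // truncate to an empty log
Ok(())
}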
/// Loads company securities data (`CompanyData`) from checkpoint and log JSONL files
async fn load_securities_from_jsonl(
checkpoint_path: &std::path::Path,
log_path: &std::path::Path,
) -> anyhow::Result<HashMap<String, CompanyData>> {
let mut securities: HashMap<String, CompanyData> = HashMap::new();
// Load checkpoint
if checkpoint_path.exists() {
let content = tokio::fs::read_to_string(checkpoint_path).await?;
for (line_num, line) in content.lines().enumerate() {
if line.trim().is_empty() || !line.ends_with('}') {
continue; // Skip incomplete lines
}
match serde_json::from_str::<CompanyData>(line) {
Ok(company_info) => {
securities.insert(company_info.name.clone(), company_info);
}
Err(e) => {
logger::log_warn(&format!(
"Skipping invalid line {} in checkpoint: {}",
line_num + 1, e
)).await;
}
}
}
}
// Replay log (overwrites checkpoint entries if they exist)
if log_path.exists() {
let content = tokio::fs::read_to_string(log_path).await?;
for (line_num, line) in content.lines().enumerate() {
if line.trim().is_empty() || !line.ends_with('}') {
continue; // Skip incomplete lines
}
match serde_json::from_str::<CompanyData>(line) {
Ok(company_info) => {
securities.insert(company_info.name.clone(), company_info);
}
Err(e) => {
logger::log_warn(&format!(
"Skipping invalid line {} in log: {}",
line_num + 1, e
)).await;
}
}
}
}
Ok(securities)
}
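// A small sketch of the torn-line heuristic used during replay above: an
// append-only JSONL log can end in a partially written record after a crash,
// and any line that does not close with '}' is treated as torn and skipped
// instead of failing recovery. `looks_complete` is a hypothetical helper used
// only for illustration.
#[allow(dead_code)]
fn looks_complete(line: &str) -> bool {
let line = line.trim();
!line.is_empty() && line.ends_with('}')
}
#[cfg(test)]
mod replay_tests {
use super::*;
#[test]
fn torn_lines_are_skipped() {
assert!(looks_complete(r#"{"name":"Example Co"}"#));
assert!(!looks_complete(r#"{"name":"Exampl"#)); // interrupted write
assert!(!looks_complete(""));
}
}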
/// Scrape with retry, validation, and shutdown awareness
async fn scrape_with_retry(
pool: &Arc<ChromeDriverPool>,
isin: &str,
max_retries: u32,
shutdown_flag: &Arc<AtomicBool>,
) -> Result<Option<YahooCompanyData>> {
let mut retries = 0;
loop {
// Check shutdown before each attempt
if shutdown_flag.load(Ordering::SeqCst) {
return Err(anyhow!("Aborted due to shutdown"));
}
if pool.should_perform_hard_reset() {
logger::log_error("HARD_RESET_REQUIRED detected before scrape attempt").await;
return Err(anyhow!("HARD_RESET_REQUIRED"));
}
match scrape_company_details_by_isin(pool, isin, shutdown_flag).await {
Ok(result) => return Ok(result),
Err(e) => {
// Check if this is a hard reset required error
let error_msg = e.to_string();
if error_msg.contains("HARD_RESET_REQUIRED") {
logger::log_error(&format!(
"Hard reset required error for ISIN {}, propagating immediately",
isin
)).await;
return Err(e); // Propagate immediately, don't retry
}
if retries >= max_retries {
logger::log_error(&format!(
"All {} retries exhausted for ISIN {}: {}",
max_retries, isin, e
)).await;
return Err(e);
}
let backoff_ms = 1000 * 2u64.pow(retries);
let jitter_ms = random_range(0, 500);
let total_delay = backoff_ms + jitter_ms;
logger::log_warn(&format!(
"Retry {}/{} for ISIN {} after {}ms: {}",
retries + 1, max_retries, isin, total_delay, e
)).await;
sleep(Duration::from_millis(total_delay)).await;
retries += 1;
}
}
}
}
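// Sketch of the string-sentinel convention used for hard resets above: the
// driver pool signals an unrecoverable browser state through an error whose
// message contains "HARD_RESET_REQUIRED", and callers match on the message text
// rather than on a dedicated error type. A typed error would be more robust,
// but the sentinel keeps propagation simple across task boundaries.
// `needs_hard_reset` is a hypothetical helper named only for illustration.
#[allow(dead_code)]
fn needs_hard_reset(err: &anyhow::Error) -> bool {
err.to_string().contains("HARD_RESET_REQUIRED")
}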
/// Process single company with validation and shutdown checks
async fn process_single_company_validated(
name: String,
company_info: CompanyData,
existing_entry: Option<CompanyData>,
pool: &Arc<ChromeDriverPool>,
shutdown_flag: &Arc<AtomicBool>,
) -> anyhow::Result<Option<CompanyProcessResult>> {
// Check shutdown at start
if shutdown_flag.load(Ordering::SeqCst) {
logger::log_warn(&format!("Shutdown detected, skipping company: {}", name)).await;
return Ok(None);
}
let is_update = existing_entry.is_some();
let mut isin_tickers_map: HashMap<String, Vec<String>> =
existing_entry
.as_ref()
.and_then(|e| e.isin_tickers_map.clone())
.unwrap_or_default();
// Collect unique ISIN-ticker pairs
let mut unique_isin_ticker_pairs: HashMap<String, Vec<String>> = HashMap::new();
for figi_infos in company_info.securities.values() {
for figi_info in figi_infos {
if !figi_info.isin.is_empty() {
let tickers = unique_isin_ticker_pairs
.entry(figi_info.isin.clone())
.or_insert_with(Vec::new);
if !figi_info.ticker.is_empty() && !tickers.contains(&figi_info.ticker) {
tickers.push(figi_info.ticker.clone());
}
}
}
}
// Process each ISIN independently with per-ISIN status checking
for (isin, figi_tickers) in unique_isin_ticker_pairs {
// Check shutdown before each ISIN
if shutdown_flag.load(Ordering::SeqCst) {
logger::log_warn(&format!(
"Shutdown detected while processing company: {}",
name
)).await;
break;
}
let tickers = isin_tickers_map
.entry(isin.clone())
.or_insert_with(Vec::new);
for figi_ticker in figi_tickers {
if !tickers.contains(&figi_ticker) {
tickers.push(figi_ticker);
}
}
// Check if THIS SPECIFIC ISIN has valid Yahoo data (not ERROR)
let has_valid_yahoo = tickers.iter().any(|t| {
t.starts_with("YAHOO:") && t != "YAHOO:ERROR"
// Note: YAHOO:NO_RESULTS is valid (legitimately not found)
});
if !has_valid_yahoo {
logger::log_info(&format!("Fetching Yahoo details for {} (ISIN: {})", name, isin)).await;
tickers.retain(|t| !t.starts_with("YAHOO:"));
match scrape_with_retry(pool, &isin, 3, shutdown_flag).await {
Ok(Some(details)) => {
logger::log_info(&format!(
"✓ Found Yahoo ticker {} for ISIN {} (company: {})",
details.ticker, isin, name
)).await;
tickers.push(format!("YAHOO:{}", details.ticker));
},
Ok(None) => {
logger::log_warn(&format!("◯ No search results for ISIN {} (company: {})", isin, name)).await;
tickers.push("YAHOO:NO_RESULTS".to_string());
},
Err(e) => {
if shutdown_flag.load(Ordering::SeqCst) {
logger::log_warn(&format!("Shutdown during scrape for ISIN {}", isin)).await;
break;
}
// Check if this is a hard reset required error
let error_msg = e.to_string();
if error_msg.contains("HARD_RESET_REQUIRED") {
logger::log_error(&format!(
"Hard reset required during ISIN {} processing, propagating error",
isin
)).await;
return Err(e); // ← CRITICAL: Propagate immediately
}
logger::log_warn(&format!(
"✗ Yahoo lookup error for ISIN {} (company: {}): {}",
isin, name, e
)).await;
// Mark this ISIN as failed to enable retry
tickers.push("YAHOO:ERROR".to_string());
}
}
}
}
// Final shutdown check before returning result
if shutdown_flag.load(Ordering::SeqCst) {
logger::log_warn(&format!(
"Shutdown detected, discarding incomplete result for: {}",
name
)).await;
return Ok(None);
}
if pool.should_perform_hard_reset() {
logger::log_error("HARD_RESET_REQUIRED detected during company processing").await;
return Err(anyhow!("HARD_RESET_REQUIRED"));
}
if !isin_tickers_map.is_empty() {
let company_entry = CompanyData {
name: name.clone(),
primary_isin: company_info.primary_isin.clone(),
securities: company_info.securities.clone(),
yahoo_company_data: company_info.yahoo_company_data.clone(),
isin_tickers_map: Some(isin_tickers_map),
};
Ok(Some(CompanyProcessResult {
company: company_entry,
is_update,
}))
} else {
logger::log_warn(&format!("No ISINs found for company: {}", name)).await;
Ok(None)
}
}

View File

@@ -0,0 +1,911 @@
// src/corporate/update_companies_cleanse.rs
use super::{helpers::*, types::*};
use crate::config::Config;
use crate::corporate::checkpoint_helpers;
use crate::util::directories::DataPaths;
use crate::util::integrity::{DataStage, StateManager, file_reference};
use crate::util::logger;
use crate::scraper::yahoo::{YahooClientPool, QuoteSummaryModule};
use std::result::Result::Ok;
use chrono::Utc;
use std::collections::HashMap;
use std::sync::Arc;
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
use tokio::fs::{File, OpenOptions};
use tokio::io::{AsyncBufReadExt, AsyncWriteExt, BufReader};
use futures::stream::{FuturesUnordered, StreamExt};
use tokio::sync::mpsc;
/// Result of processing a single company
#[derive(Debug, Clone)]
pub enum CompanyProcessResult {
Valid(CompanyData),
FilteredLowCap { name: String, market_cap: f64 },
FilteredNoPrice { name: String },
Failed { company: CompanyData, error: String, is_transient: bool },
}
/// Represents a write command to be serialized through the log writer
enum LogCommand {
Write(CompanyData),
Checkpoint,
Shutdown,
}
/// Cleansing pass that removes companies missing essential Yahoo data, preserving dataset integrity
pub async fn companies_yahoo_cleansed_no_data(paths: &DataPaths) -> Result<usize, anyhow::Error> {
let data_path = paths.data_dir();
let input_path = data_path.join("companies.jsonl");
let output_path = data_path.join("companies_yahoo.jsonl");
if !input_path.exists() {
logger::log_warn("companies.jsonl not found, skipping cleansing").await;
return Ok(0);
}
let manager = StateManager::new(paths.integrity_dir()).await?;
let step_name = "yahoo_companies_cleansed_no_data";
let content_reference = file_reference(&output_path);
if manager.is_step_valid(step_name).await? {
let output_content = tokio::fs::read_to_string(&output_path).await?;
let count = output_content.lines()
.filter(|line| !line.trim().is_empty())
.count();
logger::log_info(&format!(" ✓ Found {} companies in companies_yahoo.jsonl", count)).await;
return Ok(count);
}
let entry = manager.create_entry(
step_name.to_string(),
content_reference.clone(),
DataStage::Data,
).await?;
logger::log_info(" Cleansing companies with missing Yahoo data...").await;
logger::log_info(&format!(" Reading from: {:?}", input_path)).await;
logger::log_info(&format!(" Writing to: {:?}", output_path)).await;
let file = File::open(&input_path).await?;
let reader = BufReader::new(file);
let mut lines = reader.lines();
let mut output_file = File::create(&output_path).await?;
let mut valid_count = 0;
let mut removed_count = 0;
let mut total_count = 0;
while let Some(line) = lines.next_line().await? {
if line.trim().is_empty() {
continue;
}
total_count += 1;
let company: CompanyData = match serde_json::from_str(&line) {
Ok(c) => c,
Err(e) => {
logger::log_warn(&format!(" Failed to parse company on line {}: {}", total_count, e)).await;
continue;
}
};
let has_valid_yahoo = company.isin_tickers_map
.as_ref()
.map(|map| {
map.values()
.flatten()
.any(|ticker| {
ticker.starts_with("YAHOO:")
&& ticker != "YAHOO:NO_RESULTS"
&& ticker != "YAHOO:ERROR"
})
})
.unwrap_or(false);
if has_valid_yahoo {
let json_line = serde_json::to_string(&company)?;
output_file.write_all(json_line.as_bytes()).await?;
output_file.write_all(b"\n").await?;
valid_count += 1;
} else {
removed_count += 1;
if removed_count <= 5 {
logger::log_info(&format!(" Removed company '{}' (no valid Yahoo ticker)", company.name)).await;
}
}
if total_count % 1000 == 0 {
logger::log_info(&format!(" Processed {} companies...", total_count)).await;
}
}
output_file.flush().await?;
logger::log_info(&format!(
" ✓ Cleansing complete: {} total → {} valid, {} removed",
total_count, valid_count, removed_count
)).await;
// Track completion with:
// - Content reference: companies_yahoo.jsonl output file
// - Data stage: Data (7-day TTL by default)
// - Dependencies: Depends on the companies.jsonl input data
manager.mark_valid(entry).await?;
Ok(valid_count)
}
/// Yahoo Low Profile Cleansing WITH ABORT-SAFE INCREMENTAL PERSISTENCE
///
/// # Features
/// - Graceful shutdown (abort-safe)
/// - Task panic isolation (tasks fail independently)
/// - Crash-safe persistence (checkpoint + log with fsync)
/// - Smart skip logic (only process incomplete data)
/// - Uses a pending queue instead of a retry mechanism
/// - Reuses companies_updates.log for persistence
///
/// # Persistence Strategy
/// - Checkpoint: companies_yahoo_cleaned.jsonl (atomic state)
/// - Log: companies_update.log (append-only updates)
/// - On restart: Load checkpoint + replay log
/// - Periodic checkpoints (every 50 companies)
/// - Batched fsync (every 10 writes or 10 seconds)
pub async fn companies_yahoo_cleansed_low_profile(
paths: &DataPaths,
_config: &Config,
yahoo_pool: Arc<YahooClientPool>,
shutdown_flag: &Arc<AtomicBool>,
) -> anyhow::Result<usize> {
// Configuration constants
const CHECKPOINT_INTERVAL: usize = 50;
const FSYNC_BATCH_SIZE: usize = 10;
const FSYNC_INTERVAL_SECS: u64 = 10;
const CONCURRENCY_LIMIT: usize = 50; // Limit parallel validation tasks
let data_path = paths.data_dir();
// File paths (reusing companies_update.log)
let input_path = data_path.join("companies_yahoo.jsonl");
let checkpoint_path = data_path.join("companies_yahoo_cleaned.jsonl");
let log_path = data_path.join("companies_updates.log");
// Check input exists
if !input_path.exists() {
logger::log_warn(" companies_yahoo.jsonl not found, skipping low profile cleansing").await;
return Ok(0);
}
let manager = StateManager::new(paths.integrity_dir()).await?;
let step_name = "yahoo_companies_cleansed_low_profile";
let content_reference = file_reference(&checkpoint_path);
if manager.is_step_valid(step_name).await? {
let checkpoint_content = tokio::fs::read_to_string(&checkpoint_path).await?;
let count = checkpoint_content.lines()
.filter(|line| !line.trim().is_empty())
.count();
logger::log_info(&format!(" ✓ Found {} companies in companies_yahoo_cleaned.jsonl", count)).await;
return Ok(count);
}
let entry = manager.create_entry(
step_name.to_string(),
content_reference.clone(),
DataStage::Data,
).await?;
logger::log_info(" Cleansing companies with low Yahoo profile...").await;
// === RECOVERY PHASE: Load checkpoint + replay log ===
let mut existing_companies: HashMap<String, CompanyData> = HashMap::new();
let mut processed_names: std::collections::HashSet<String> = std::collections::HashSet::new();
if checkpoint_path.exists() {
logger::log_info("Loading checkpoint from companies_yahoo_cleaned.jsonl...").await;
let checkpoint_content = tokio::fs::read_to_string(&checkpoint_path).await?;
for line in checkpoint_content.lines() {
if line.trim().is_empty() || !line.ends_with('}') {
continue; // Skip incomplete lines
}
match serde_json::from_str::<CompanyData>(line) {
Ok(company) => {
processed_names.insert(company.name.clone());
existing_companies.insert(company.name.clone(), company);
}
Err(e) => {
logger::log_warn(&format!("Skipping invalid checkpoint line: {}", e)).await;
}
}
}
logger::log_info(&format!("Loaded checkpoint with {} companies", existing_companies.len())).await;
}
if log_path.exists() {
logger::log_info("Replaying update log...").await;
let log_content = tokio::fs::read_to_string(&log_path).await?;
let mut replayed = 0;
for line in log_content.lines() {
if line.trim().is_empty() || !line.ends_with('}') {
continue; // Skip incomplete lines
}
match serde_json::from_str::<CompanyData>(line) {
Ok(company) => {
processed_names.insert(company.name.clone());
existing_companies.insert(company.name.clone(), company);
replayed += 1;
}
Err(e) => {
logger::log_warn(&format!("Skipping invalid log line: {}", e)).await;
}
}
}
if replayed > 0 {
logger::log_info(&format!("Replayed {} updates from log", replayed)).await;
}
}
// === LOAD INPUT COMPANIES ===
logger::log_info(&format!("Loading companies from: {:?}", input_path)).await;
let input_companies = load_companies_from_jsonl(&input_path).await?;
logger::log_info(&format!("Loaded {} companies from input", input_companies.len())).await;
// === BUILD PENDING LIST (smart skip logic) ===
let mut pending: Vec<CompanyData> = input_companies
.into_iter()
.filter(|company| company_needs_processing(company, &existing_companies))
.collect();
logger::log_info(&format!(
"Initial scan: {} companies need processing ({} already complete)",
pending.len(),
existing_companies.len()
)).await;
// === CONSOLIDATE LOG BEFORE EARLY EXIT ===
if pending.is_empty() {
logger::log_info(" ✓ All companies already processed").await;
// Consolidate log into checkpoint before exiting
if checkpoint_helpers::log_has_content(&log_path).await {
checkpoint_helpers::consolidate_checkpoint(&checkpoint_path, &log_path, &existing_companies).await?;
}
return Ok(existing_companies.len());
}
// === SETUP LOG WRITER TASK ===
let (write_tx, mut write_rx) = mpsc::channel::<LogCommand>(1000);
let log_file_init = OpenOptions::new()
.create(true)
.append(true)
.open(&log_path)
.await?;
let checkpoint_path_clone = checkpoint_path.clone();
let log_path_clone = log_path.clone();
let existing_companies_writer = Arc::new(tokio::sync::Mutex::new(existing_companies.clone()));
let existing_companies_writer_for_task = Arc::clone(&existing_companies_writer);
let write_tx_for_writer = write_tx.clone();
let writer_task = tokio::spawn(async move {
let mut log_file = log_file_init;
let mut writes_since_fsync = 0;
let mut last_fsync = std::time::Instant::now();
let mut updates_since_checkpoint = 0;
let mut count = 0;
let mut new_count = 0;
let mut updated_count = 0;
while let Some(cmd) = write_rx.recv().await {
match cmd {
LogCommand::Write(company) => {
// Write to log
let line = serde_json::to_string(&company).unwrap();
if let Err(e) = log_file.write_all(line.as_bytes()).await {
logger::log_error(&format!("Failed to write to log: {}", e)).await;
break;
}
if let Err(e) = log_file.write_all(b"\n").await {
logger::log_error(&format!("Failed to write newline: {}", e)).await;
break;
}
writes_since_fsync += 1;
updates_since_checkpoint += 1;
count += 1;
// Update in-memory state
let mut existing_companies = existing_companies_writer_for_task.lock().await;
let is_update = existing_companies.contains_key(&company.name);
existing_companies.insert(company.name.clone(), company);
drop(existing_companies);
if is_update {
updated_count += 1;
} else {
new_count += 1;
}
// Batched + time-based fsync
let should_fsync = writes_since_fsync >= FSYNC_BATCH_SIZE
|| last_fsync.elapsed().as_secs() >= FSYNC_INTERVAL_SECS;
if should_fsync {
if let Err(e) = log_file.flush().await {
logger::log_error(&format!("Failed to flush: {}", e)).await;
break;
}
if let Err(e) = log_file.sync_data().await {
logger::log_error(&format!("Failed to fsync: {}", e)).await;
break;
}
writes_since_fsync = 0;
last_fsync = std::time::Instant::now();
}
}
LogCommand::Checkpoint => {
if let Err(e) = log_file.flush().await {
logger::log_error(&format!("Failed to flush before checkpoint: {}", e)).await;
break;
}
if let Err(e) = log_file.sync_data().await {
logger::log_error(&format!("Failed to fsync before checkpoint: {}", e)).await;
break;
}
let existing_companies = existing_companies_writer_for_task.lock().await;
let companies_vec: Vec<_> = existing_companies.values().cloned().collect();
drop(existing_companies);
let temp_path = checkpoint_path_clone.with_extension("tmp");
match tokio::fs::File::create(&temp_path).await {
Ok(mut temp_file) => {
let mut checkpoint_ok = true;
for company in &companies_vec {
if let Ok(line) = serde_json::to_string(company) {
if temp_file.write_all(line.as_bytes()).await.is_err() ||
temp_file.write_all(b"\n").await.is_err() {
checkpoint_ok = false;
break;
}
}
}
if checkpoint_ok {
if temp_file.flush().await.is_ok() &&
temp_file.sync_data().await.is_ok() {
drop(temp_file);
if tokio::fs::rename(&temp_path, &checkpoint_path_clone).await.is_ok() {
if tokio::fs::remove_file(&log_path_clone).await.is_ok() {
logger::log_info(&format!(
"✓ Checkpoint created ({} companies), log cleared",
companies_vec.len()
)).await;
if let Ok(new_log) = OpenOptions::new()
.create(true)
.append(true)
.open(&log_path_clone)
.await {
log_file = new_log;
}
}
}
}
}
}
Err(e) => {
logger::log_error(&format!("Failed to create checkpoint temp file: {}", e)).await;
}
}
updates_since_checkpoint = 0;
}
LogCommand::Shutdown => {
logger::log_info("Writer shutting down...").await;
break;
}
}
// Periodic checkpoint trigger; try_send avoids the writer blocking on its own
// (possibly full) channel, and resetting the counter prevents re-triggering on
// every queued write before the checkpoint command is processed
if updates_since_checkpoint >= CHECKPOINT_INTERVAL {
let _ = write_tx_for_writer.try_send(LogCommand::Checkpoint);
updates_since_checkpoint = 0;
}
}
// Final fsync
let _ = log_file.flush().await;
let _ = log_file.sync_data().await;
logger::log_info(&format!(
"Writer finished: {} total ({} new, {} updated)",
count, new_count, updated_count
)).await;
(count, new_count, updated_count)
});
// Wrap paths in Arc for safe sharing across tasks
let paths = Arc::new((*paths).clone());
// === MAIN PROCESSING LOOP WITH TASK PANIC ISOLATION ===
let total = pending.len();
let mut tasks = FuturesUnordered::new();
// Counters
let processed = Arc::new(AtomicUsize::new(0));
let valid_count = Arc::new(AtomicUsize::new(0));
let filtered_low_cap = Arc::new(AtomicUsize::new(0));
let filtered_no_price = Arc::new(AtomicUsize::new(0));
let failed_count = Arc::new(AtomicUsize::new(0));
// Spawn initial batch
for _ in 0..CONCURRENCY_LIMIT.min(pending.len()) {
if let Some(company) = pending.pop() {
spawn_validation_task(
company,
&yahoo_pool,
&paths,
&write_tx,
shutdown_flag,
&processed,
&valid_count,
&filtered_low_cap,
&filtered_no_price,
&failed_count,
total,
&mut tasks,
);
}
}
// Process results and spawn new tasks (with task panic isolation)
while let Some(task_result) = tasks.next().await {
// Check for shutdown
if shutdown_flag.load(Ordering::SeqCst) {
logger::log_warn("Shutdown signal received, stopping processing").await;
break;
}
match task_result {
Ok(Ok(_)) => {
// Success - spawn next task
if let Some(company) = pending.pop() {
spawn_validation_task(
company,
&yahoo_pool,
&paths,
&write_tx,
shutdown_flag,
&processed,
&valid_count,
&filtered_low_cap,
&filtered_no_price,
&failed_count,
total,
&mut tasks,
);
}
}
Ok(Err(e)) => {
// Processing error
logger::log_error(&format!("Company processing error: {}", e)).await;
if let Some(company) = pending.pop() {
spawn_validation_task(
company,
&yahoo_pool,
&paths,
&write_tx,
shutdown_flag,
&processed,
&valid_count,
&filtered_low_cap,
&filtered_no_price,
&failed_count,
total,
&mut tasks,
);
}
}
Err(e) => {
// Task panic (isolated - doesn't crash entire process)
logger::log_error(&format!("Task panic: {}", e)).await;
if let Some(company) = pending.pop() {
spawn_validation_task(
company,
&yahoo_pool,
&paths,
&write_tx,
shutdown_flag,
&processed,
&valid_count,
&filtered_low_cap,
&filtered_no_price,
&failed_count,
total,
&mut tasks,
);
}
}
}
}
logger::log_info("Main processing loop completed").await;
// Signal writer to finish
let _ = write_tx.send(LogCommand::Checkpoint).await;
let _ = write_tx.send(LogCommand::Shutdown).await;
drop(write_tx);
// Wait for writer to finish
let (final_count, final_new, final_updated) = writer_task.await
.unwrap_or((0, 0, 0));
let final_valid = valid_count.load(Ordering::SeqCst);
let final_filtered_low_cap = filtered_low_cap.load(Ordering::SeqCst);
let final_filtered_no_price = filtered_no_price.load(Ordering::SeqCst);
let final_failed = failed_count.load(Ordering::SeqCst);
logger::log_info(&format!(
"✅ Completed: {} total companies ({} new, {} updated)",
final_count, final_new, final_updated
)).await;
logger::log_info(&format!(
" Valid: {}, Filtered (low cap): {}, Filtered (no price): {}, Failed: {}",
final_valid, final_filtered_low_cap, final_filtered_no_price, final_failed
)).await;
// === VERIFY AND RECREATE FINAL OUTPUT ===
logger::log_info("Verifying final output integrity...").await;
let final_companies_map = existing_companies_writer.lock().await;
let expected_count = final_companies_map.len();
// Always write final consolidated checkpoint
let temp_checkpoint = checkpoint_path.with_extension("tmp");
let mut temp_file = File::create(&temp_checkpoint).await?;
for company in final_companies_map.values() {
let json_line = serde_json::to_string(company)?;
temp_file.write_all(json_line.as_bytes()).await?;
temp_file.write_all(b"\n").await?;
}
temp_file.flush().await?;
temp_file.sync_data().await?;
drop(temp_file);
tokio::fs::rename(&temp_checkpoint, &checkpoint_path).await?;
drop(final_companies_map);
// Clear log since everything is in checkpoint
if log_path.exists() {
tokio::fs::remove_file(&log_path).await.ok();
}
logger::log_info(&format!("✓ Final output: {} companies in {:?}", expected_count, checkpoint_path)).await;
// Shutdown Yahoo pool
yahoo_pool.shutdown().await?;
// Track completion with:
// - Content reference: companies_yahoo_cleaned.jsonl output file
// - Data stage: Data (7-day TTL by default)
// - Dependencies: Depends on the companies_yahoo.jsonl cleansed data
if !shutdown_flag.load(Ordering::SeqCst) {
manager.mark_valid(entry).await?;
}
Ok(final_count)
}
/// Helper function to spawn a validation task (reduces code duplication)
fn spawn_validation_task(
company: CompanyData,
yahoo_pool: &Arc<YahooClientPool>,
paths: &Arc<DataPaths>,
write_tx: &mpsc::Sender<LogCommand>,
shutdown_flag: &Arc<AtomicBool>,
processed: &Arc<AtomicUsize>,
valid_count: &Arc<AtomicUsize>,
filtered_low_cap: &Arc<AtomicUsize>,
filtered_no_price: &Arc<AtomicUsize>,
failed_count: &Arc<AtomicUsize>,
total: usize,
tasks: &mut FuturesUnordered<tokio::task::JoinHandle<anyhow::Result<Option<()>>>>,
) {
let yahoo_pool_clone = Arc::clone(yahoo_pool);
let paths_clone = Arc::clone(paths);
let shutdown_flag_clone = Arc::clone(shutdown_flag);
let write_tx_clone = write_tx.clone();
let processed_clone = Arc::clone(processed);
let valid_count_clone = Arc::clone(valid_count);
let filtered_low_cap_clone = Arc::clone(filtered_low_cap);
let filtered_no_price_clone = Arc::clone(filtered_no_price);
let failed_count_clone = Arc::clone(failed_count);
let task = tokio::spawn(async move {
// Check shutdown at start
if shutdown_flag_clone.load(Ordering::SeqCst) {
return Ok::<_, anyhow::Error>(None);
}
let result = process_company_with_validation(
&company,
&yahoo_pool_clone,
&*paths_clone,
).await;
match result {
CompanyProcessResult::Valid(validated_company) => {
// Send to writer
let _ = write_tx_clone.send(LogCommand::Write(validated_company)).await;
valid_count_clone.fetch_add(1, Ordering::SeqCst);
}
CompanyProcessResult::FilteredLowCap { name, market_cap } => {
filtered_low_cap_clone.fetch_add(1, Ordering::SeqCst);
if filtered_low_cap_clone.load(Ordering::SeqCst) <= 10 {
logger::log_info(&format!(" Filtered {} - low market cap: {:.0} EUR", name, market_cap)).await;
}
}
CompanyProcessResult::FilteredNoPrice { name } => {
filtered_no_price_clone.fetch_add(1, Ordering::SeqCst);
if filtered_no_price_clone.load(Ordering::SeqCst) <= 10 {
logger::log_info(&format!(" Filtered {} - no recent price data", name)).await;
}
}
CompanyProcessResult::Failed { company: failed_company, error, is_transient: _ } => {
failed_count_clone.fetch_add(1, Ordering::SeqCst);
logger::log_warn(&format!(" Failed to process '{}': {}", failed_company.name, error)).await;
}
}
// Progress reporting
let current = processed_clone.fetch_add(1, Ordering::SeqCst) + 1;
if current % 100 == 0 {
logger::log_info(&format!(
"Progress: {}/{} ({} valid, {} low cap, {} no price, {} failed)",
current, total,
valid_count_clone.load(Ordering::SeqCst),
filtered_low_cap_clone.load(Ordering::SeqCst),
filtered_no_price_clone.load(Ordering::SeqCst),
failed_count_clone.load(Ordering::SeqCst)
)).await;
}
Ok(None::<()>)
});
tasks.push(task);
}
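// Design note: the counter parameters above could be bundled into one shared
// context struct to shorten the signature; a sketch of that alternative
// (struct and field names are illustrative only):
#[allow(dead_code)]
struct ValidationCounters {
processed: Arc<AtomicUsize>,
valid: Arc<AtomicUsize>,
filtered_low_cap: Arc<AtomicUsize>,
filtered_no_price: Arc<AtomicUsize>,
failed: Arc<AtomicUsize>,
}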
/// Process a single company with full error categorization
async fn process_company_with_validation(
company: &CompanyData,
yahoo_pool: &Arc<YahooClientPool>,
paths: &DataPaths,
) -> CompanyProcessResult {
// Extract Yahoo ticker
let ticker = match extract_first_yahoo_ticker(company) {
Some(t) => t,
None => {
return CompanyProcessResult::Failed {
company: company.clone(),
error: "No valid Yahoo ticker found".to_string(),
is_transient: false, // Permanent - no ticker means no data
};
}
};
// Fetch core modules from Yahoo
let summary = match yahoo_pool.get_quote_summary(
&ticker,
&QuoteSummaryModule::core_modules(),
).await {
Ok(s) => s,
Err(e) => {
let error_msg = e.to_string();
let is_transient = is_transient_error(&error_msg);
return CompanyProcessResult::Failed {
company: company.clone(),
error: format!("API error fetching summary: {}", error_msg),
is_transient,
};
}
};
// Validate market cap
let market_cap = extract_market_cap(&summary);
if market_cap < 100_000_000.0 {
return CompanyProcessResult::FilteredLowCap {
name: company.name.clone(),
market_cap,
};
}
// Validate recent price activity
let has_recent_price = match check_recent_price_activity(yahoo_pool, &ticker).await {
Ok(has) => has,
Err(e) => {
let error_msg = e.to_string();
let is_transient = is_transient_error(&error_msg);
return CompanyProcessResult::Failed {
company: company.clone(),
error: format!("API error fetching price history: {}", error_msg),
is_transient,
};
}
};
if !has_recent_price {
return CompanyProcessResult::FilteredNoPrice {
name: company.name.clone(),
};
}
// Save core data
if let Err(e) = save_company_core_data(paths, &company.name, &summary).await {
logger::log_warn(&format!(
" Failed to save core data for {}: {}",
company.name, e
)).await;
}
CompanyProcessResult::Valid(company.clone())
}
/// Determine if an error is transient (should retry) or permanent (skip)
fn is_transient_error(error: &str) -> bool {
let error_lower = error.to_lowercase();
// Transient errors (network, rate limiting, timeouts)
let transient_patterns = [
"timeout",
"timed out",
"connection",
"network",
"rate limit",
"too many requests",
"429",
"503",
"502",
"500",
"temporarily",
"unavailable",
];
for pattern in &transient_patterns {
if error_lower.contains(pattern) {
return true;
}
}
// Permanent errors (invalid ticker, no data, parsing errors)
let permanent_patterns = [
"404",
"not found",
"invalid",
"no data",
"parse error",
"400",
"401",
"403",
];
for pattern in &permanent_patterns {
if error_lower.contains(pattern) {
return false;
}
}
// Default: treat unknown errors as transient (safer to retry)
true
}
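// Illustrative classification under the rules above (the messages are made up
// for the example): timeouts and 429/5xx map to transient, 4xx/not-found map
// to permanent, and anything unrecognized defaults to transient.
#[cfg(test)]
mod transient_error_tests {
use super::*;
#[test]
fn classifies_common_messages() {
assert!(is_transient_error("request timed out after 30s"));
assert!(is_transient_error("HTTP 429 Too Many Requests"));
assert!(!is_transient_error("HTTP 404 Not Found"));
assert!(is_transient_error("some unexpected failure")); // unknown → retry
}
}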
fn extract_market_cap(summary: &crate::scraper::yahoo::QuoteSummary) -> f64 {
let price_module = match summary.modules.get("price") {
Some(m) => m,
None => return 0.0,
};
let market_cap_raw = price_module
.get("marketCap")
.and_then(|v| v.get("raw"))
.and_then(|v| v.as_f64())
.unwrap_or(0.0);
let currency = price_module
.get("currency")
.and_then(|v| v.as_str())
.unwrap_or("USD");
let market_cap_eur = match currency {
"EUR" => market_cap_raw,
"USD" => market_cap_raw * 0.92,
"GBP" => market_cap_raw * 1.17,
"JPY" => market_cap_raw * 0.0061,
"CHF" => market_cap_raw * 1.05,
_ => market_cap_raw * 0.92,
};
market_cap_eur
}
async fn check_recent_price_activity(
yahoo_pool: &Arc<YahooClientPool>,
ticker: &str,
) -> anyhow::Result<bool> {
let now = Utc::now().timestamp();
let one_year_ago = now - (365 * 24 * 60 * 60);
let sixty_days_ago = now - (60 * 24 * 60 * 60);
let chart_data = yahoo_pool.get_chart_data(
ticker,
"1d",
sixty_days_ago,
now,
).await?;
if chart_data.quotes.is_empty() {
return Ok(false);
}
let most_recent_timestamp = chart_data.quotes
.iter()
.map(|q| q.timestamp)
.max()
.unwrap_or(0);
Ok(most_recent_timestamp >= one_year_ago)
}
async fn save_company_core_data(
paths: &DataPaths,
company_name: &str,
summary: &crate::scraper::yahoo::QuoteSummary,
) -> anyhow::Result<()> {
use tokio::fs;
let safe_name = sanitize_company_name(company_name);
let company_dir = paths.corporate_dir().join(&safe_name).join("core");
fs::create_dir_all(&company_dir).await?;
let data_path = company_dir.join("data.jsonl");
let json_line = serde_json::to_string(summary)?;
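// File::create truncates, so core/data.jsonl always holds exactly one (latest) snapshot line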
let mut file = fs::File::create(&data_path).await?;
file.write_all(json_line.as_bytes()).await?;
file.write_all(b"\n").await?;
file.flush().await?;
Ok(())
}
/// Check if a company needs processing (validation check)
fn company_needs_processing(
company: &CompanyData,
existing_companies: &HashMap<String, CompanyData>,
) -> bool {
// If company exists in cleaned output, skip it
!existing_companies.contains_key(&company.name)
}

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -1,578 +0,0 @@
// src/corporate/update_parallel.rs - UPDATED WITH DATA INTEGRITY FIXES
// PARALLELIZED VERSION with atomic commits and validation
//
// Key improvements over original:
// - Page validation to prevent stale content extraction
// - Shutdown-aware task processing
// - Better error recovery with browser state cleanup
// - All original fsync and checkpoint logic preserved
use super::{types::*, yahoo::*, helpers::*};
use crate::util::directories::DataPaths;
use crate::util::logger;
use crate::scraper::webdriver::ChromeDriverPool;
use rand::Rng;
use tokio::sync::mpsc;
use tokio::io::AsyncWriteExt;
use tokio::fs::OpenOptions;
use tokio::time::sleep;
use std::collections::HashMap;
use std::sync::Arc;
use std::sync::atomic::{AtomicBool, Ordering};
use std::time::Duration;
use futures::stream::{FuturesUnordered, StreamExt};
use anyhow::{anyhow, Context, Result};
/// Represents a write command to be serialized through the log writer
enum LogCommand {
Write(CompanyCrossPlatformInfo),
Checkpoint,
Shutdown,
}
/// Result from processing a single company
struct CompanyProcessResult {
company: CompanyCrossPlatformInfo,
is_update: bool,
}
/// UPDATED: Abort-safe incremental JSONL persistence with validation
///
/// New safety features:
/// - Page validation before extraction
/// - Shutdown checks at all critical points
/// - Browser state cleanup on errors
/// - All writes still atomic with fsync
pub async fn build_companies_jsonl_streaming_parallel(
paths: &DataPaths,
pool: &Arc<ChromeDriverPool>,
shutdown_flag: &Arc<AtomicBool>,
) -> anyhow::Result<usize> {
// Configuration constants
const CHECKPOINT_INTERVAL: usize = 50;
const FSYNC_BATCH_SIZE: usize = 10;
const FSYNC_INTERVAL_SECS: u64 = 10;
const CONCURRENCY_LIMIT: usize = 100;
let path = DataPaths::new(".")?;
let corporate_path = path.data_dir().join("corporate").join("by_name");
let securities_path = corporate_path.join("common_stocks.json");
if !securities_path.exists() {
logger::log_warn("No common_stocks.json found").await;
return Ok(0);
}
let content = tokio::fs::read_to_string(securities_path).await?;
let securities: HashMap<String, CompanyInfo> = serde_json::from_str(&content)?;
let companies_path = paths.data_dir().join("companies.jsonl");
let log_path = paths.data_dir().join("companies_updates.log");
if let Some(parent) = companies_path.parent() {
tokio::fs::create_dir_all(parent).await?;
}
// === RECOVERY PHASE: Load checkpoint + replay log ===
let mut existing_companies: HashMap<String, CompanyCrossPlatformInfo> = HashMap::new();
let mut processed_names: std::collections::HashSet<String> = std::collections::HashSet::new();
if companies_path.exists() {
logger::log_info("Loading checkpoint from companies.jsonl...").await;
let existing_content = tokio::fs::read_to_string(&companies_path).await?;
for line in existing_content.lines() {
if line.trim().is_empty() {
continue;
}
match serde_json::from_str::<CompanyCrossPlatformInfo>(line) {
Ok(company) => {
processed_names.insert(company.name.clone());
existing_companies.insert(company.name.clone(), company);
}
Err(e) => {
logger::log_warn(&format!("Skipping invalid checkpoint line: {}", e)).await;
}
}
}
logger::log_info(&format!("Loaded checkpoint with {} companies", existing_companies.len())).await;
}
if log_path.exists() {
logger::log_info("Replaying update log...").await;
let log_content = tokio::fs::read_to_string(&log_path).await?;
let mut replayed = 0;
for line in log_content.lines() {
if line.trim().is_empty() {
continue;
}
match serde_json::from_str::<CompanyCrossPlatformInfo>(line) {
Ok(company) => {
processed_names.insert(company.name.clone());
existing_companies.insert(company.name.clone(), company);
replayed += 1;
}
Err(e) => {
logger::log_warn(&format!("Skipping invalid log line: {}", e)).await;
}
}
}
if replayed > 0 {
logger::log_info(&format!("Replayed {} updates from log", replayed)).await;
}
}
// === SETUP LOG WRITER TASK ===
let (write_tx, mut write_rx) = mpsc::channel::<LogCommand>(1000);
let log_file_init = OpenOptions::new()
.create(true)
.append(true)
.open(&log_path)
.await?;
let companies_path_clone = companies_path.clone();
let log_path_clone = log_path.clone();
let existing_companies_writer = Arc::new(tokio::sync::Mutex::new(existing_companies.clone()));
let write_tx_for_writer = write_tx.clone();
let writer_task = tokio::spawn(async move {
let mut log_file = log_file_init;
let mut writes_since_fsync = 0;
let mut last_fsync = std::time::Instant::now();
let mut updates_since_checkpoint = 0;
let mut count = 0;
let mut new_count = 0;
let mut updated_count = 0;
while let Some(cmd) = write_rx.recv().await {
match cmd {
LogCommand::Write(company) => {
// Write to log
let line = serde_json::to_string(&company).unwrap();
if let Err(e) = log_file.write_all(line.as_bytes()).await {
logger::log_error(&format!("Failed to write to log: {}", e)).await;
break;
}
if let Err(e) = log_file.write_all(b"\n").await {
logger::log_error(&format!("Failed to write newline: {}", e)).await;
break;
}
writes_since_fsync += 1;
updates_since_checkpoint += 1;
count += 1;
// Update in-memory state
let mut existing_companies = existing_companies_writer.lock().await;
let is_update = existing_companies.contains_key(&company.name);
existing_companies.insert(company.name.clone(), company);
drop(existing_companies);
if is_update {
updated_count += 1;
} else {
new_count += 1;
}
// Batched + time-based fsync
let should_fsync = writes_since_fsync >= FSYNC_BATCH_SIZE
|| last_fsync.elapsed().as_secs() >= FSYNC_INTERVAL_SECS;
if should_fsync {
if let Err(e) = log_file.flush().await {
logger::log_error(&format!("Failed to flush: {}", e)).await;
break;
}
if let Err(e) = log_file.sync_data().await {
logger::log_error(&format!("Failed to fsync: {}", e)).await;
break;
}
writes_since_fsync = 0;
last_fsync = std::time::Instant::now();
}
}
LogCommand::Checkpoint => {
if let Err(e) = log_file.flush().await {
logger::log_error(&format!("Failed to flush before checkpoint: {}", e)).await;
break;
}
if let Err(e) = log_file.sync_data().await {
logger::log_error(&format!("Failed to fsync before checkpoint: {}", e)).await;
break;
}
let existing_companies = existing_companies_writer.lock().await;
let companies_vec: Vec<_> = existing_companies.values().cloned().collect();
drop(existing_companies);
let temp_path = companies_path_clone.with_extension("tmp");
match tokio::fs::File::create(&temp_path).await {
Ok(mut temp_file) => {
let mut checkpoint_ok = true;
for company in &companies_vec {
if let Ok(line) = serde_json::to_string(company) {
if temp_file.write_all(line.as_bytes()).await.is_err() ||
temp_file.write_all(b"\n").await.is_err() {
checkpoint_ok = false;
break;
}
}
}
if checkpoint_ok {
if temp_file.flush().await.is_ok() &&
temp_file.sync_data().await.is_ok() {
drop(temp_file);
if tokio::fs::rename(&temp_path, &companies_path_clone).await.is_ok() {
if tokio::fs::remove_file(&log_path_clone).await.is_ok() {
logger::log_info(&format!(
"✓ Checkpoint created ({} companies), log cleared",
companies_vec.len()
)).await;
if let Ok(new_log) = OpenOptions::new()
.create(true)
.append(true)
.open(&log_path_clone)
.await {
log_file = new_log;
}
}
}
}
}
}
Err(e) => {
logger::log_error(&format!("Failed to create checkpoint temp file: {}", e)).await;
}
}
updates_since_checkpoint = 0;
}
LogCommand::Shutdown => {
logger::log_info("Writer shutting down...").await;
break;
}
}
// Periodic checkpoint trigger (reset the counter here so only one
// Checkpoint command is queued per interval)
if updates_since_checkpoint >= CHECKPOINT_INTERVAL {
let _ = write_tx.send(LogCommand::Checkpoint).await;
updates_since_checkpoint = 0;
}
}
// Final fsync
let _ = log_file.flush().await;
let _ = log_file.sync_data().await;
logger::log_info(&format!(
"Writer finished: {} total ({} new, {} updated)",
count, new_count, updated_count
)).await;
(count, new_count, updated_count)
});
// === PARALLEL PROCESSING PHASE ===
logger::log_info(&format!(
"Starting parallel processing of {} companies (concurrency limit: {})",
securities.len(),
CONCURRENCY_LIMIT
)).await;
let mut processing_tasks = FuturesUnordered::new();
let mut processed = 0;
let total = securities.len();
for (name, company_info) in securities.into_iter() {
// Check shutdown before creating new tasks
if shutdown_flag.load(Ordering::SeqCst) {
logger::log_warn("Shutdown detected, stopping task creation").await;
break;
}
// Wait if we hit concurrency limit
while processing_tasks.len() >= CONCURRENCY_LIMIT {
if let Some(result) = processing_tasks.next().await {
match result {
Ok(Ok(Some(company_result))) => {
let company_result: CompanyProcessResult = company_result;
write_tx_for_writer.send(LogCommand::Write(company_result.company)).await?;
processed += 1;
}
Ok(Ok(None)) => {
processed += 1;
}
Ok(Err(e)) => {
logger::log_warn(&format!("Company processing error: {}", e)).await;
processed += 1;
}
Err(e) => {
logger::log_error(&format!("Task panic: {}", e)).await;
processed += 1;
}
}
}
if shutdown_flag.load(Ordering::SeqCst) {
break;
}
}
if shutdown_flag.load(Ordering::SeqCst) {
break;
}
// Spawn new task
let pool = pool.clone();
let shutdown_flag = shutdown_flag.clone();
let existing_entry = existing_companies.get(&name).cloned();
let task = tokio::spawn(async move {
process_single_company_validated(
name,
company_info,
existing_entry,
&pool,
&shutdown_flag
).await
});
processing_tasks.push(task);
if processed % 10 == 0 && processed > 0 {
logger::log_info(&format!("Progress: {}/{} companies processed", processed, total)).await;
}
}
// Wait for remaining tasks
logger::log_info(&format!(
"Waiting for {} remaining tasks to complete...",
processing_tasks.len()
)).await;
while let Some(result) = processing_tasks.next().await {
if shutdown_flag.load(Ordering::SeqCst) {
logger::log_warn("Shutdown detected during final task wait").await;
break;
}
match result {
Ok(Ok(Some(company_result))) => {
if write_tx_for_writer.send(LogCommand::Write(company_result.company)).await.is_err() {
logger::log_error("Writer task died").await;
break;
}
processed += 1;
}
Ok(Ok(None)) => {
processed += 1;
}
Ok(Err(e)) => {
logger::log_warn(&format!("Company processing error: {}", e)).await;
processed += 1;
}
Err(e) => {
logger::log_error(&format!("Task panic: {}", e)).await;
processed += 1;
}
}
}
// Signal writer to finish
let _ = write_tx_for_writer.send(LogCommand::Checkpoint).await;
let _ = write_tx_for_writer.send(LogCommand::Shutdown).await;
drop(write_tx_for_writer);
// Wait for writer to finish
let (final_count, final_new, final_updated) = writer_task.await
.unwrap_or((0, 0, 0));
logger::log_info(&format!(
"Completed: {} total companies ({} new, {} updated)",
final_count, final_new, final_updated
)).await;
Ok(final_count)
}
/// Scrape with retry, validation, and shutdown awareness
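/// Backoff between attempts is exponential (1s, 2s, 4s, ...) with up to 500ms of
/// random jitter; the loop aborts immediately once the shutdown flag is set.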
async fn scrape_with_retry(
pool: &Arc<ChromeDriverPool>,
isin: &str,
max_retries: u32,
shutdown_flag: &Arc<AtomicBool>,
) -> Result<Option<YahooCompanyDetails>> {
let mut retries = 0;
loop {
// Check shutdown before each attempt
if shutdown_flag.load(Ordering::SeqCst) {
return Err(anyhow!("Aborted due to shutdown"));
}
match scrape_company_details_by_isin(pool, isin, shutdown_flag).await {
Ok(result) => return Ok(result),
Err(e) => {
if retries >= max_retries {
logger::log_error(&format!(
"All {} retries exhausted for ISIN {}: {}",
max_retries, isin, e
)).await;
return Err(e);
}
let backoff_ms = 1000 * 2u64.pow(retries);
let jitter_ms = random_range(0, 500);
let total_delay = backoff_ms + jitter_ms;
logger::log_warn(&format!(
"Retry {}/{} for ISIN {} after {}ms: {}",
retries + 1, max_retries, isin, total_delay, e
)).await;
sleep(Duration::from_millis(total_delay)).await;
retries += 1;
}
}
}
}
/// UPDATED: Process single company with validation and shutdown checks
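/// Existing ISIN-to-ticker mappings are merged with the FIGI-derived pairs; a Yahoo
/// lookup runs only for ISINs that have no `YAHOO:` ticker yet, and a
/// `YAHOO:NO_RESULTS` marker is stored when the search comes back empty so the
/// ISIN is not retried on the next run.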
async fn process_single_company_validated(
name: String,
company_info: CompanyInfo,
existing_entry: Option<CompanyCrossPlatformInfo>,
pool: &Arc<ChromeDriverPool>,
shutdown_flag: &Arc<AtomicBool>,
) -> anyhow::Result<Option<CompanyProcessResult>> {
// Check shutdown at start
if shutdown_flag.load(Ordering::SeqCst) {
logger::log_warn(&format!("Shutdown detected, skipping company: {}", name)).await;
return Ok(None);
}
let is_update = existing_entry.is_some();
let mut isin_tickers_map: HashMap<String, Vec<String>> =
existing_entry
.as_ref()
.map(|e| e.isin_tickers_map.clone())
.unwrap_or_default();
let mut sector = existing_entry.as_ref().and_then(|e| e.sector.clone());
let mut exchange = existing_entry.as_ref().and_then(|e| e.exchange.clone());
// Collect unique ISIN-ticker pairs
let mut unique_isin_ticker_pairs: HashMap<String, Vec<String>> = HashMap::new();
for figi_infos in company_info.securities.values() {
for figi_info in figi_infos {
if !figi_info.isin.is_empty() {
let tickers = unique_isin_ticker_pairs
.entry(figi_info.isin.clone())
.or_insert_with(Vec::new);
if !figi_info.ticker.is_empty() && !tickers.contains(&figi_info.ticker) {
tickers.push(figi_info.ticker.clone());
}
}
}
}
// Process each ISIN with validation
for (isin, figi_tickers) in unique_isin_ticker_pairs {
// Check shutdown before each ISIN
if shutdown_flag.load(Ordering::SeqCst) {
logger::log_warn(&format!(
"Shutdown detected while processing company: {}",
name
)).await;
break;
}
let tickers = isin_tickers_map
.entry(isin.clone())
.or_insert_with(Vec::new);
for figi_ticker in figi_tickers {
if !tickers.contains(&figi_ticker) {
tickers.push(figi_ticker);
}
}
let has_yahoo_ticker = tickers.iter().any(|t| t.starts_with("YAHOO:"));
if !has_yahoo_ticker {
logger::log_info(&format!("Fetching Yahoo details for {} (ISIN: {})", name, isin)).await;
match scrape_with_retry(pool, &isin, 3, shutdown_flag).await {
Ok(Some(details)) => {
logger::log_info(&format!(
"✓ Found Yahoo ticker {} for ISIN {} (company: {})",
details.ticker, isin, name
)).await;
tickers.push(format!("YAHOO:{}", details.ticker));
if sector.is_none() && details.sector.is_some() {
sector = details.sector.clone();
logger::log_info(&format!(" Sector: {}", details.sector.as_ref().unwrap())).await;
}
if exchange.is_none() && details.exchange.is_some() {
exchange = details.exchange.clone();
logger::log_info(&format!(" Exchange: {}", details.exchange.as_ref().unwrap())).await;
}
},
Ok(None) => {
logger::log_warn(&format!("◯ No search results for ISIN {} (company: {})", isin, name)).await;
tickers.push("YAHOO:NO_RESULTS".to_string());
},
Err(e) => {
if shutdown_flag.load(Ordering::SeqCst) {
logger::log_warn(&format!("Shutdown during scrape for ISIN {}", isin)).await;
break;
}
logger::log_warn(&format!(
"✗ Yahoo lookup error for ISIN {} (company: {}): {}",
isin, name, e
)).await;
// Continue with next ISIN
}
}
}
}
// Final shutdown check before returning result
if shutdown_flag.load(Ordering::SeqCst) {
logger::log_warn(&format!(
"Shutdown detected, discarding incomplete result for: {}",
name
)).await;
return Ok(None);
}
if !isin_tickers_map.is_empty() {
let company_entry = CompanyCrossPlatformInfo {
name: name.clone(),
isin_tickers_map,
sector,
exchange,
};
Ok(Some(CompanyProcessResult {
company: company_entry,
is_update,
}))
} else {
logger::log_warn(&format!("No ISINs found for company: {}", name)).await;
Ok(None)
}
}

View File

@@ -20,14 +20,20 @@
 // Using a wrapper to ensure the result is properly captured
 var extractionResult = (function() {
     try {
-        // Check for "No results found" message using exact selector
-        const noDataElement = document.querySelector('#main-content-wrapper > section > div.noData.yf-1omxedn');
+        // Check for "No results found" message using very flexible selector
+        const noDataElement = document.querySelector('[class*="noData"]') ||
+            document.querySelector('[class*="error"]') ||
+            (document.body.innerText && document.body.innerText.includes('No results'));
         if (noDataElement) {
             return { status: 'no_results', ticker: null, sector: null, exchange: null };
         }
-        // Find the results table using exact selector
-        const table = document.querySelector('#main-content-wrapper > section > section.container.yf-1omxedn > div.tableContainer.yf-1omxedn > div > table');
+        // Find the results table using most flexible selector possible
+        // Try multiple strategies to find the table
+        const table = document.querySelector('table') ||
+            document.querySelector('[role="table"]') ||
+            document.querySelector('.table') ||
+            document.querySelector('#main-content-wrapper > section > section[class*="container"] > div[class*="tableContainer"] > div > table');
         if (!table) {
             return { status: 'no_results', ticker: null, sector: null, exchange: null };
         }

View File

@@ -1,4 +1,4 @@
// src/corporate/yahoo.rs - UPDATED WITH DATA INTEGRITY FIXES // src/corporate/yahoo.rs
use super::{types::*, helpers::*, page_validation::*}; use super::{types::*, helpers::*, page_validation::*};
use crate::{scraper::webdriver::*, util::{directories::DataPaths}}; use crate::{scraper::webdriver::*, util::{directories::DataPaths}};
use crate::logger; use crate::logger;
@@ -63,18 +63,23 @@ impl YahooTickerResult {
} }
} }
/// UPDATED: Scrape company details with full validation and shutdown support /// Scrape company details with full validation and shutdown support
pub async fn scrape_company_details_by_isin( pub async fn scrape_company_details_by_isin(
pool: &Arc<ChromeDriverPool>, pool: &Arc<ChromeDriverPool>,
isin: &str, isin: &str,
shutdown_flag: &Arc<AtomicBool>, shutdown_flag: &Arc<AtomicBool>,
) -> anyhow::Result<Option<YahooCompanyDetails>> { ) -> anyhow::Result<Option<YahooCompanyData>> {
// Check shutdown before starting // Check shutdown before starting
if shutdown_flag.load(Ordering::SeqCst) { if shutdown_flag.load(Ordering::SeqCst) {
logger::log_warn(&format!("Shutdown detected, skipping ISIN: {}", isin)).await; logger::log_warn(&format!("Shutdown detected, skipping ISIN: {}", isin)).await;
return Ok(None); return Ok(None);
} }
if pool.should_perform_hard_reset() {
logger::log_warn("HARD_RESET_REQUIRED detected before starting ISIN scrape").await;
return Err(anyhow!("HARD_RESET_REQUIRED"));
}
let isin_owned = isin.to_string(); let isin_owned = isin.to_string();
let shutdown_clone = Arc::clone(shutdown_flag); let shutdown_clone = Arc::clone(shutdown_flag);
let url = format!("https://finance.yahoo.com/lookup/?s={}", isin); let url = format!("https://finance.yahoo.com/lookup/?s={}", isin);
@@ -118,13 +123,20 @@ pub async fn scrape_company_details_by_isin(
} }
} }
// Additional content validation // Additional content validation - look for table or noData element anywhere on page
let page_ready: bool = client let page_ready: bool = client
.execute( .execute(
r#" r#"
const table = document.querySelector('#main-content-wrapper > section > section.container.yf-1omxedn > div.tableContainer.yf-1omxedn > div > table'); // Try multiple selector strategies
const noData = document.querySelector('#main-content-wrapper > section > div.noData.yf-1omxedn'); const table = document.querySelector('table') ||
return !!(table || noData); document.querySelector('[role="table"]') ||
document.querySelector('.table');
const noData = document.querySelector('[class*="noData"]') ||
document.querySelector('[class*="error"]') ||
document.body.innerText.includes('No results');
const hasContent = !!(table || noData);
console.log('Page ready check - table:', !!table, 'noData:', !!noData, 'hasContent:', hasContent);
return hasContent;
"#, "#,
vec![], vec![],
) )
@@ -162,7 +174,7 @@ pub async fn scrape_company_details_by_isin(
async fn extract_company_details_validated( async fn extract_company_details_validated(
client: &Client, client: &Client,
isin: &str, isin: &str,
) -> Result<Option<YahooCompanyDetails>> { ) -> Result<Option<YahooCompanyData>> {
// Double-check URL is still correct before extraction // Double-check URL is still correct before extraction
let current_url = client.current_url().await?; let current_url = client.current_url().await?;
if !current_url.as_str().contains(isin) { if !current_url.as_str().contains(isin) {
@@ -197,8 +209,8 @@ async fn extract_company_details_validated(
pub async fn extract_company_details( pub async fn extract_company_details(
client: &Client, client: &Client,
_isin: &str, _isin: &str,
) -> Result<Option<YahooCompanyDetails>> { ) -> Result<Option<YahooCompanyData>> {
// Wait for page to load - look for either the table or the no-data element // Wait for page to load - look for either the table or the no-data element using simple selectors
let wait_result: Result<Result<bool, anyhow::Error>> = timeout( let wait_result: Result<Result<bool, anyhow::Error>> = timeout(
TokioDuration::from_secs(30), TokioDuration::from_secs(30),
async { async {
@@ -206,9 +218,14 @@ pub async fn extract_company_details(
let has_content: bool = client let has_content: bool = client
.execute( .execute(
r#" r#"
const table = document.querySelector('#main-content-wrapper > section > section.container.yf-1omxedn > div.tableContainer.yf-1omxedn > div > table'); // Use flexible selectors that don't depend on exact DOM structure
const noData = document.querySelector('#main-content-wrapper > section > div.noData.yf-1omxedn'); const table = document.querySelector('table') ||
return !!(table || noData); document.querySelector('[role="table"]') ||
document.querySelector('.table');
const noData = document.querySelector('[class*="noData"]') ||
document.querySelector('[class*="error"]');
const hasContent = !!(table || noData);
return hasContent;
"#, "#,
vec![], vec![],
) )
@@ -274,7 +291,7 @@ pub async fn extract_company_details(
)).await; )).await;
} }
Ok(Some(YahooCompanyDetails { Ok(Some(YahooCompanyData {
ticker, ticker,
sector: extraction.sector, sector: extraction.sector,
exchange: extraction.exchange, exchange: extraction.exchange,
@@ -298,9 +315,11 @@ pub async fn get_all_tickers_from_companies_jsonl(paths: &DataPaths) -> anyhow::
let content = tokio::fs::read_to_string(companies_file).await?; let content = tokio::fs::read_to_string(companies_file).await?;
let mut tickers = Vec::new(); let mut tickers = Vec::new();
for line in content.lines() { for line in content.lines() {
let company: CompanyCrossPlatformInfo = serde_json::from_str(line)?; let company: CompanyData = serde_json::from_str(line)?;
for (_isin, ticker_vec) in company.isin_tickers_map { if let Some(isin_tickers_map) = company.isin_tickers_map {
tickers.extend(ticker_vec); for (_isin, ticker_vec) in isin_tickers_map {
tickers.extend(ticker_vec);
}
} }
} }
Ok(tickers) Ok(tickers)
@@ -309,9 +328,9 @@ pub async fn get_all_tickers_from_companies_jsonl(paths: &DataPaths) -> anyhow::
pub async fn fetch_earnings_with_pool( pub async fn fetch_earnings_with_pool(
pool: &Arc<ChromeDriverPool>, pool: &Arc<ChromeDriverPool>,
ticker: &str, ticker: &str,
) -> anyhow::Result<Vec<CompanyEvent>> { ) -> anyhow::Result<Vec<CompanyEventData>> {
let ticker = ticker.to_string(); let ticker = ticker.to_string();
let url = format!("https://finance.yahoo.com/calendar/earnings?symbol={}", ticker); let url = format!("https://finance.yahoo.com/calendar/earnings?symbol={}&offset=0&size=100", ticker);
let ticker_cloned = ticker.clone(); let ticker_cloned = ticker.clone();
@@ -324,7 +343,7 @@ pub async fn fetch_earnings_with_pool(
}).await }).await
} }
pub async fn extract_earnings_events(client: &Client, ticker: &str) -> Result<Vec<CompanyEvent>> { pub async fn extract_earnings_events(client: &Client, ticker: &str) -> Result<Vec<CompanyEventData>> {
// Wait for the table to load // Wait for the table to load
let table = client let table = client
.wait() .wait()
@@ -398,7 +417,7 @@ pub async fn extract_earnings_events(client: &Client, ticker: &str) -> Result<Ve
None None
}; };
events.push(CompanyEvent { events.push(CompanyEventData {
ticker: ticker.to_string(), ticker: ticker.to_string(),
date, date,
time, time,

View File

@@ -2,7 +2,9 @@
 pub mod types;
 pub mod scraper;
 pub mod storage;
-pub mod update;
 pub mod helpers;
+pub mod update;
+pub mod yahoo_update_forex;
 pub use update::run_full_update;

View File

@@ -8,7 +8,30 @@ const EXTRACTION_JS: &str = include_str!("extraction_script.js");
 pub async fn goto_and_prepare(client: &Client) -> anyhow::Result<()> {
     client.goto("https://www.finanzen.net/termine/wirtschaftsdaten/").await?;
+    dismiss_overlays(client).await?;
+    Ok(())
+}
+
+pub async fn dismiss_overlays(client: &Client) -> anyhow::Result<()> {
+    for _ in 0..10 {
+        let removed: bool = client
+            .execute(
+                r#"(() => {
+                    const iframe = document.querySelector('iframe[title="Contentpass First Layer"]');
+                    if (iframe && iframe.parentNode) {
+                        iframe.parentNode.removeChild(iframe);
+                        return true;
+                    }
+                    return false;
+                })()"#,
+                vec![],
+            )
+            .await?
+            .as_bool()
+            .unwrap_or(false);
+        if removed { break; }
+        sleep(Duration::from_millis(500)).await;
+    }
     Ok(())
 }

View File

@@ -8,7 +8,6 @@ use chrono::{NaiveDate, Datelike};
 use std::collections::HashMap;
 use serde_json;
-const CHUNK_SIZE: usize = 500; // Process 500 events at a time
 const MAX_EVENTS_PER_FILE: usize = 3000;
 pub async fn scan_existing_chunks(paths: &DataPaths) -> anyhow::Result<Vec<ChunkInfo>> {

View File

@@ -1,12 +1,13 @@
// src/economic/update.rs // src/economic/update.rs
use super::{scraper::*, storage::*, helpers::*, types::*}; use super::{scraper::*, storage::*, helpers::*, types::*};
use crate::check_shutdown;
use crate::{config::Config, scraper::webdriver::{ScrapeTask, ChromeDriverPool}, util::directories::DataPaths, util::logger}; use crate::{config::Config, scraper::webdriver::{ScrapeTask, ChromeDriverPool}, util::directories::DataPaths, util::logger};
use chrono::{Local}; use chrono::{Local};
use std::sync::Arc; use std::sync::{Arc, atomic::{AtomicBool}};
use std::collections::HashMap; use std::collections::HashMap;
/// Runs the full update for economic data using streaming to minimize memory usage /// Runs the full update for economic data using streaming to minimize memory usage
pub async fn run_full_update(config: &Config, pool: &Arc<ChromeDriverPool>) -> anyhow::Result<()> { pub async fn run_full_update(config: &Config, pool: &Arc<ChromeDriverPool>, shutdown_flag: &Arc<AtomicBool>) -> anyhow::Result<()> {
let paths = DataPaths::new(".")?; let paths = DataPaths::new(".")?;
logger::log_info("Economic Update: Initializing...").await; logger::log_info("Economic Update: Initializing...").await;
@@ -14,17 +15,22 @@ pub async fn run_full_update(config: &Config, pool: &Arc<ChromeDriverPool>) -> a
let today_str = chrono::Local::now().date_naive().format("%Y-%m-%d").to_string(); let today_str = chrono::Local::now().date_naive().format("%Y-%m-%d").to_string();
let end_date = config.target_end_date(); let end_date = config.target_end_date();
logger::log_info("=== Economic Update ===").await;
check_shutdown!(shutdown_flag);
// Step 1: Build lightweight index instead of loading all events // Step 1: Build lightweight index instead of loading all events
logger::log_info("Economic Update: Building event index...").await; logger::log_info("Step 1: Building event index...").await;
let chunks = scan_existing_chunks(&paths).await?; let chunks = scan_existing_chunks(&paths).await?;
let event_index = build_event_index(&chunks).await?; let event_index = build_event_index(&chunks).await?;
logger::log_info(&format!(" Economic Update: Indexed {} events from {} chunks",
logger::log_info(&format!("Economic Update: Indexed {} events from {} chunks",
event_index.len(), chunks.len())).await; event_index.len(), chunks.len())).await;
check_shutdown!(shutdown_flag);
// Step 2: Determine start date // Step 2: Determine start date
let start_date = if event_index.is_empty() { let start_date = if event_index.is_empty() {
logger::log_warn("Economic Update: No existing events found, starting from config date").await; logger::log_warn("Step 2: No existing events found, starting from config date").await;
config.economic_start_date.clone() config.economic_start_date.clone()
} else { } else {
// Find the latest date in the index // Find the latest date in the index
@@ -35,7 +41,7 @@ pub async fn run_full_update(config: &Config, pool: &Arc<ChromeDriverPool>) -> a
.unwrap_or(today_str.clone()); .unwrap_or(today_str.clone());
if max_date >= today_str { if max_date >= today_str {
logger::log_info("Economic Update: Events exist for today, starting from today").await; logger::log_info(" Events exist for today, starting from today").await;
today_str.clone() today_str.clone()
} else { } else {
let next = chrono::NaiveDate::parse_from_str(&max_date, "%Y-%m-%d") let next = chrono::NaiveDate::parse_from_str(&max_date, "%Y-%m-%d")
@@ -43,34 +49,37 @@ pub async fn run_full_update(config: &Config, pool: &Arc<ChromeDriverPool>) -> a
.and_then(|d| d.succ_opt()) .and_then(|d| d.succ_opt())
.map(|d| d.format("%Y-%m-%d").to_string()) .map(|d| d.format("%Y-%m-%d").to_string())
.unwrap_or(today_str.clone()); .unwrap_or(today_str.clone());
logger::log_info(&format!("Economic Update: Resuming from: {}", next)).await; logger::log_info(&format!(" Resuming from: {}", next)).await;
next next
} }
}; };
logger::log_info(&format!("Economic Update: Scraping events from {}{}", start_date, end_date)).await; check_shutdown!(shutdown_flag);
// Step 3: Scrape new events in batches // Step 3: Scrape new events in batches
logger::log_info(&format!("Step 3: Scraping events from {}{}", start_date, end_date)).await;
let new_events = scrape_all_economic_events(&start_date, &end_date, pool).await?; let new_events = scrape_all_economic_events(&start_date, &end_date, pool).await?;
logger::log_info(&format!(" Scraped {} new events", new_events.len())).await;
logger::log_info(&format!("Economic Update: Scraped {} new events", new_events.len())).await; check_shutdown!(shutdown_flag);
// Step 4: Process events in streaming fashion // Step 4: Process events in streaming fashion
logger::log_info(&format!("Step 4: Detecting changes")).await;
let (changes, updated_events) = process_events_streaming(&chunks, &new_events, &today_str).await?; let (changes, updated_events) = process_events_streaming(&chunks, &new_events, &today_str).await?;
logger::log_info(&format!(" Detected {} changes", changes.len())).await;
logger::log_info(&format!("Economic Update: Detected {} changes", changes.len())).await;
if !changes.is_empty() { if !changes.is_empty() {
logger::log_info(&format!("Economic Update: Saving {} changes to log", changes.len())).await; logger::log_info(&format!(" Saving {} changes to log", changes.len())).await;
save_changes(&paths, &changes).await?; save_changes(&paths, &changes).await?;
logger::log_info("Economic Update: Changes saved successfully").await; logger::log_info(" Changes saved successfully").await;
} }
// Step 5: Save consolidated events check_shutdown!(shutdown_flag);
logger::log_info(&format!("Economic Update: Saving {} total events to chunks", updated_events.len())).await;
save_optimized_chunks(&paths, updated_events).await?; // Step 5: Save consolidated events
logger::log_info(&format!("Step 5: Saving {} total events to chunks", updated_events.len())).await;
save_optimized_chunks(&paths, updated_events).await?;
logger::log_info(&format!(" ✓ Economic update complete — {} changes detected", changes.len())).await;
logger::log_info(&format!("✓ Economic update complete — {} changes detected", changes.len())).await;
Ok(()) Ok(())
} }
@@ -183,7 +192,7 @@ pub fn process_batch(
let mut changes = Vec::new(); let mut changes = Vec::new();
let mut removed = std::collections::HashSet::new(); let mut removed = std::collections::HashSet::new();
let identity_map = build_identity_lookup(existing); //let identity_map = build_identity_lookup(existing);
let date_map = build_date_event_lookup(existing); let date_map = build_date_event_lookup(existing);
for new in new_events { for new in new_events {

View File

@@ -0,0 +1,477 @@
// src/forex/update_forex.rs
use crate::config::Config;
use crate::util::directories::DataPaths;
use crate::util::integrity::{DataStage, StateManager, directory_reference};
use crate::util::logger;
use crate::scraper::yahoo::{YahooClientPool};
use crate::corporate::types::*;
use std::result::Result::Ok;
use chrono::{TimeZone, Utc};
use std::collections::HashSet;
use std::sync::Arc;
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
use tokio::fs::{OpenOptions};
use tokio::io::{AsyncWriteExt};
use futures::stream::{FuturesUnordered, StreamExt};
use serde_json::json;
use tokio::sync::mpsc;
/// Currency information
#[derive(Debug, Clone)]
struct CurrencyPair {
code: String, // e.g., "EUR", "JPY"
name: String, // e.g., "Euro", "Japanese Yen"
yahoo_symbol: String, // e.g., "USDEUR=X", "USDJPY=X"
}
impl CurrencyPair {
fn new(code: &str, name: &str) -> Self {
Self {
code: code.to_string(),
name: name.to_string(),
yahoo_symbol: format!("USD{}=X", code),
}
}
}
/// Get list of currency pairs to fetch (USD as base currency)
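///
/// Illustrative: each entry quotes USD against the code, e.g.
/// ```ignore
/// let eur = CurrencyPair::new("EUR", "Euro");
/// assert_eq!(eur.yahoo_symbol, "USDEUR=X");
/// ```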
fn get_currency_pairs() -> Vec<CurrencyPair> {
vec![
CurrencyPair::new("EUR", "Euro"),
CurrencyPair::new("TRY", "Turkish Lira"),
CurrencyPair::new("CHF", "Swiss Franc"),
CurrencyPair::new("SEK", "Swedish Krona"),
CurrencyPair::new("TWD", "New Taiwan Dollar"),
CurrencyPair::new("AUD", "Australian Dollar"),
CurrencyPair::new("GBP", "British Pound"), // Fixed: GBp -> GBP
CurrencyPair::new("NOK", "Norwegian Krone"),
CurrencyPair::new("CAD", "Canadian Dollar"),
CurrencyPair::new("CZK", "Czech Koruna"),
CurrencyPair::new("SGD", "Singapore Dollar"),
CurrencyPair::new("ISK", "Icelandic Króna"),
CurrencyPair::new("ZAR", "South African Rand"), // Fixed: ZAc -> ZAR
CurrencyPair::new("JPY", "Japanese Yen"),
CurrencyPair::new("PLN", "Polish Złoty"),
CurrencyPair::new("DKK", "Danish Krone"),
CurrencyPair::new("HKD", "Hong Kong Dollar"),
CurrencyPair::new("ILS", "Israeli Shekel"), // Fixed: ILA -> ILS
CurrencyPair::new("RON", "Romanian Leu"),
CurrencyPair::new("KWD", "Kuwaiti Dinar"), // Fixed: KWF -> KWD
]
}
/// Yahoo Collect Foreign Exchange Charts WITH ABORT-SAFE INCREMENTAL PERSISTENCE
///
/// # Features
/// - Graceful shutdown (abort-safe)
/// - Task panic isolation (tasks fail independently)
/// - Crash-safe persistence (checkpoint + log with fsync)
/// - Smart skip logic (only process incomplete data)
/// - Uses pending queue instead of retry mechanism
///
/// # Persistence Strategy
/// - Checkpoint: fx_rates_collected.jsonl (atomic state)
/// - Log: fx_rates_updates.log (append-only updates)
/// - On restart: Load checkpoint + replay log
/// - Periodic checkpoints (every 10 currencies)
/// - Batched fsync (every 5 writes or 10 seconds)
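///
/// # Log line format (illustrative)
/// Each update is appended as one JSON object per line, e.g.
/// `{"currency_code":"EUR","currency_name":"Euro","yahoo_symbol":"USDEUR=X","status":"collected","timestamp":"..."}`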
pub async fn collect_fx_rates(
paths: &DataPaths,
_config: &Config,
yahoo_pool: Arc<YahooClientPool>,
shutdown_flag: &Arc<AtomicBool>,
) -> anyhow::Result<usize> {
// Configuration constants
const CHECKPOINT_INTERVAL: usize = 10;
const FSYNC_BATCH_SIZE: usize = 5;
const FSYNC_INTERVAL_SECS: u64 = 10;
const CONCURRENCY_LIMIT: usize = 10; // Limit parallel fetch tasks
let data_path = paths.data_dir();
// File paths
let output_path = data_path.join("economic").join("currency");
let log_path = data_path.join("fx_rates_updates.log");
let manager = StateManager::new(paths.integrity_dir()).await?;
let step_name = "yahoo_fx_rate_collection_completed";
let content_reference = directory_reference(&output_path,
Some(vec![
"*/chart/*.jsonl".to_string(), // Main pattern for events data
"*/chart/data.jsonl".to_string(), // Specific pattern (more precise)
]),
Some(vec![
"*.log".to_string(), // Exclude log files
"*.tmp".to_string(), // Exclude temp files
"*.bak".to_string(), // Exclude backup files
]),
);
if manager.is_step_valid(step_name).await? {
logger::log_info(" FX rates collection already completed").await;
let count = count_collected_currencies(paths).await?;
logger::log_info(&format!(" ✓ Found {} currencies with chart data", count)).await;
return Ok(count);
}
let entry = manager.create_entry(
step_name.to_string(),
content_reference.clone(),
DataStage::Data,
).await?;
logger::log_info(" Updating missing forex data...").await;
// === RECOVERY PHASE: Track collected currencies ===
let mut collected_currencies: HashSet<String> = HashSet::new();
if log_path.exists() {
logger::log_info("Loading FX rates collection progress from log...").await;
let log_content = tokio::fs::read_to_string(&log_path).await?;
for line in log_content.lines() {
if line.trim().is_empty() || !line.ends_with('}') {
continue; // Skip incomplete lines
}
match serde_json::from_str::<serde_json::Value>(line) {
Ok(entry) => {
if let Some(code) = entry.get("currency_code").and_then(|v| v.as_str()) {
if entry.get("status").and_then(|v| v.as_str()) == Some("collected") {
collected_currencies.insert(code.to_string());
}
}
}
Err(e) => {
logger::log_warn(&format!("Skipping invalid log line: {}", e)).await;
}
}
}
logger::log_info(&format!("Loaded {} collected currencies from log", collected_currencies.len())).await;
}
// Get all currency pairs
let currency_pairs = get_currency_pairs();
let total_currencies = currency_pairs.len();
logger::log_info(&format!("Found {} currency pairs to collect", total_currencies)).await;
// Filter currencies that need collection
let pending_pairs: Vec<CurrencyPair> = currency_pairs
.into_iter()
.filter(|pair| !collected_currencies.contains(&pair.code))
.collect();
let pending_count = pending_pairs.len();
logger::log_info(&format!(
" {} already collected, {} pending",
collected_currencies.len(),
pending_count
)).await;
if pending_count == 0 {
logger::log_info(" ✓ All currencies already collected").await;
manager.mark_valid(entry).await?;
return Ok(collected_currencies.len());
}
// === PROCESSING PHASE: Collect FX rates ===
// Shared counters
let processed_count = Arc::new(AtomicUsize::new(collected_currencies.len()));
let success_count = Arc::new(AtomicUsize::new(collected_currencies.len()));
let failed_count = Arc::new(AtomicUsize::new(0));
// Log writer channel with batching and fsync
let (log_tx, mut log_rx) = mpsc::channel::<LogCommand>(1000);
// Spawn log writer task
let log_writer_handle = {
let log_path = log_path.clone();
let processed_count = Arc::clone(&processed_count);
let total_currencies = total_currencies;
tokio::spawn(async move {
let mut log_file = OpenOptions::new()
.create(true)
.append(true)
.open(&log_path)
.await
.expect("Failed to open log file");
let mut write_count = 0;
let mut last_fsync = tokio::time::Instant::now();
while let Some(cmd) = log_rx.recv().await {
match cmd {
LogCommand::Write(entry) => {
let json_line = serde_json::to_string(&entry).expect("Serialization failed");
log_file.write_all(json_line.as_bytes()).await.expect("Write failed");
log_file.write_all(b"\n").await.expect("Write failed");
write_count += 1;
// Batched fsync
if write_count >= FSYNC_BATCH_SIZE
|| last_fsync.elapsed().as_secs() >= FSYNC_INTERVAL_SECS
{
log_file.flush().await.expect("Flush failed");
log_file.sync_all().await.expect("Fsync failed");
write_count = 0;
last_fsync = tokio::time::Instant::now();
}
}
LogCommand::Checkpoint => {
// Force fsync on checkpoint
log_file.flush().await.expect("Flush failed");
log_file.sync_all().await.expect("Fsync failed");
write_count = 0;
last_fsync = tokio::time::Instant::now();
let current = processed_count.load(Ordering::SeqCst);
logger::log_info(&format!(
" Checkpoint: {}/{} currencies processed",
current, total_currencies
)).await;
}
LogCommand::Shutdown => {
// Final fsync before shutdown
log_file.flush().await.expect("Flush failed");
log_file.sync_all().await.expect("Fsync failed");
break;
}
}
}
})
};
// Process currencies concurrently with task panic isolation
let mut tasks = FuturesUnordered::new();
let mut pending_iter = pending_pairs.into_iter();
let semaphore = Arc::new(tokio::sync::Semaphore::new(CONCURRENCY_LIMIT));
// Initial batch of tasks
for _ in 0..CONCURRENCY_LIMIT.min(pending_count) {
if let Some(pair) = pending_iter.next() {
let task = spawn_collection_task(
pair,
Arc::clone(&yahoo_pool),
paths.clone(),
Arc::clone(&processed_count),
Arc::clone(&success_count),
Arc::clone(&failed_count),
log_tx.clone(),
Arc::clone(&semaphore),
Arc::clone(shutdown_flag),
);
tasks.push(task);
}
}
// Process tasks as they complete and spawn new ones
let mut checkpoint_counter = 0;
while let Some(_result) = tasks.next().await {
// Check for shutdown
if shutdown_flag.load(Ordering::SeqCst) {
logger::log_warn("Shutdown signal received, stopping FX collection").await;
break;
}
// Spawn new task if more pending
if let Some(pair) = pending_iter.next() {
let task = spawn_collection_task(
pair,
Arc::clone(&yahoo_pool),
paths.clone(),
Arc::clone(&processed_count),
Arc::clone(&success_count),
Arc::clone(&failed_count),
log_tx.clone(),
Arc::clone(&semaphore),
Arc::clone(shutdown_flag),
);
tasks.push(task);
}
// Periodic checkpoint
checkpoint_counter += 1;
if checkpoint_counter % CHECKPOINT_INTERVAL == 0 {
let _ = log_tx.send(LogCommand::Checkpoint).await;
}
}
// Signal shutdown to log writer
let _ = log_tx.send(LogCommand::Shutdown).await;
// Wait for log writer to finish
let _ = log_writer_handle.await;
// Final statistics
let final_success = success_count.load(Ordering::SeqCst);
let final_failed = failed_count.load(Ordering::SeqCst);
logger::log_info(&format!(
" FX collection complete: {} succeeded, {} failed",
final_success, final_failed
)).await;
// Mark as complete if not shutdown
if !shutdown_flag.load(Ordering::SeqCst) {
manager.mark_valid(entry).await?;
}
Ok(final_success)
}
/// Spawn a collection task with panic isolation
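/// Each task acquires a semaphore permit, bails out early on shutdown, collects the
/// chart for its currency, bumps the shared counters, and reports a
/// "collected"/"failed" status line to the log writer.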
fn spawn_collection_task(
pair: CurrencyPair,
yahoo_pool: Arc<YahooClientPool>,
paths: DataPaths,
processed_count: Arc<AtomicUsize>,
success_count: Arc<AtomicUsize>,
failed_count: Arc<AtomicUsize>,
log_tx: mpsc::Sender<LogCommand>,
semaphore: Arc<tokio::sync::Semaphore>,
shutdown_flag: Arc<AtomicBool>,
) -> tokio::task::JoinHandle<()> {
tokio::spawn(async move {
// Acquire semaphore permit
let _permit = semaphore.acquire().await.expect("Semaphore closed");
// Check shutdown before processing
if shutdown_flag.load(Ordering::SeqCst) {
return;
}
// Perform collection (panic-isolated)
let result = collect_currency_chart(&pair, &yahoo_pool, &paths).await;
// Update counters
processed_count.fetch_add(1, Ordering::SeqCst);
let status = match result {
Ok(_) => {
success_count.fetch_add(1, Ordering::SeqCst);
logger::log_info(&format!(
" ✓ Collected {} ({})",
pair.code, pair.name
)).await;
"collected"
}
Err(e) => {
failed_count.fetch_add(1, Ordering::SeqCst);
logger::log_warn(&format!(
" ✗ Failed to collect {} ({}): {}",
pair.code, pair.name, e
)).await;
"failed"
}
};
// Log result
let log_entry = json!({
"currency_code": pair.code,
"currency_name": pair.name,
"yahoo_symbol": pair.yahoo_symbol,
"status": status,
"timestamp": Utc::now().to_rfc3339(),
});
let _ = log_tx.send(LogCommand::Write(log_entry)).await;
})
}
/// Collect chart data for a single currency pair
async fn collect_currency_chart(
pair: &CurrencyPair,
yahoo_pool: &Arc<YahooClientPool>,
paths: &DataPaths,
) -> anyhow::Result<()> {
// Get historical data from year 2000 to now
let now = Utc::now().timestamp();
let start_2000 = Utc
.with_ymd_and_hms(2000, 1, 1, 0, 0, 0)
.unwrap()
.timestamp();
// Fetch chart data from Yahoo
let chart_data = yahoo_pool.get_chart_data(
&pair.yahoo_symbol,
"1d", // Daily interval
start_2000,
now,
).await?;
// Validate we got data
if chart_data.quotes.is_empty() {
return Err(anyhow::anyhow!(
"No chart data available for {} ({})",
pair.code,
pair.yahoo_symbol
));
}
// Save chart data to currency directory
save_currency_chart(paths, &pair.code, &chart_data).await?;
Ok(())
}
/// Save currency chart data to filesystem
async fn save_currency_chart(
paths: &DataPaths,
currency_code: &str,
chart_data: &ChartData,
) -> anyhow::Result<()> {
use tokio::fs;
// Create directory structure: data/economic/currency/{code}/chart/
let economic_dir = paths.data_dir().join("economic");
let currency_dir = economic_dir.join("currency").join(currency_code);
let chart_dir = currency_dir.join("chart");
fs::create_dir_all(&chart_dir).await?;
// Write chart data to data.jsonl
let data_path = chart_dir.join("data.jsonl");
let json_line = serde_json::to_string(chart_data)?;
let mut file = fs::File::create(&data_path).await?;
file.write_all(json_line.as_bytes()).await?;
file.write_all(b"\n").await?;
file.flush().await?;
file.sync_all().await?; // Ensure data is persisted
Ok(())
}
/// Count collected currencies (currencies with chart data)
async fn count_collected_currencies(paths: &DataPaths) -> anyhow::Result<usize> {
let currency_dir = paths.data_dir().join("economic").join("currency");
if !currency_dir.exists() {
return Ok(0);
}
let mut count = 0;
let mut entries = tokio::fs::read_dir(&currency_dir).await?;
while let Some(entry) = entries.next_entry().await? {
let path = entry.path();
if path.is_dir() {
let chart_file = path.join("chart").join("data.jsonl");
if chart_file.exists() {
count += 1;
}
}
}
Ok(count)
}
/// Log command enum
enum LogCommand {
Write(serde_json::Value),
Checkpoint,
Shutdown,
}
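// Usage sketch: the collector sends Write(json!({...})) for every finished currency,
// an occasional Checkpoint to force an fsync and a progress log, and a final Shutdown
// before awaiting the writer handle.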

View File

@@ -15,3 +15,7 @@ pub use monitoring::{init_monitoring, ConfigSnapshot, MonitoringEvent};
 pub use config::Config;
 pub use scraper::webdriver::{ChromeDriverPool, ChromeInstance, ScrapeTask};
 pub use util::logger;
+pub use util::macros;
+pub use scraper::yahoo::{
+    YahooClient, YahooClientPool, QuoteSummaryModule, QuoteSummary, SearchResult
+};

View File

@@ -1,46 +1,182 @@
// src/main.rs use web_scraper::util::integrity::StateManager;
// src/main.rs - Cleaned up version with extracted helpers
use web_scraper::{*, scraper, economic, corporate}; use web_scraper::{*, scraper, corporate};
use crate::check_shutdown;
use anyhow::Result; use anyhow::{Result};
use web_scraper::config::Config; use web_scraper::config::Config;
use scraper::docker_vpn_proxy::{DockerVpnProxyPool, cleanup_all_proxy_containers}; use scraper::docker_vpn_proxy::{DockerVpnProxyPool, cleanup_all_proxy_containers};
use scraper::webdriver::ChromeDriverPool; use scraper::webdriver::ChromeDriverPool;
use util::directories::DataPaths; use util::directories::DataPaths;
use util::{logger, opnv}; use util::{logger, opnv};
use std::fs::{OpenOptions};
use std::sync::Arc; use std::sync::Arc;
use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::atomic::{AtomicBool, Ordering};
use std::process::Command; use std::process::Command;
use std::time::{Duration, Instant};
#[tokio::main] // ============================================================================
async fn main() -> Result<()> { // HELPER FUNCTIONS - Extracted to reduce duplication
let output = if cfg!(target_os = "windows") { // ============================================================================
Command::new("cmd")
/// Start Docker Desktop on Windows
async fn start_docker_desktop() {
if cfg!(target_os = "windows") {
let _ = Command::new("cmd")
.args(["/C", "docker desktop start"]) .args(["/C", "docker desktop start"])
.output() .output();
.expect("failed to execute process") }
} else { }
Command::new("sh")
.arg("-c")
.arg("echo hello")
.output()
.expect("failed to execute process")
};
let _start_docker_desktop = output.stdout;
cleanup_all_proxy_containers().await.ok(); /// Shutdown ChromeDriver pool with error handling
async fn shutdown_chrome_pool(pool: &ChromeDriverPool) {
logger::log_info("Shutting down ChromeDriver pool...").await;
match pool.shutdown().await {
Ok(()) => logger::log_info("✓ ChromeDriver pool shut down successfully").await,
Err(e) => logger::log_error(&format!("✗ Pool shutdown error: {}", e)).await,
}
}
let config = match Config::load() { /// Shutdown Docker VPN proxy pool with error handling
Ok(cfg) => cfg, async fn shutdown_proxy_pool(proxy_pool: &DockerVpnProxyPool) {
Err(_) => { logger::log_info("Stopping Docker VPN proxy containers...").await;
eprintln!("Using default configuration"); match proxy_pool.shutdown().await {
Config::default() Ok(()) => logger::log_info("✓ All Docker VPN containers stopped").await,
Err(e) => logger::log_error(&format!("✗ Proxy shutdown error: {}", e)).await,
}
}
/// Force-kill Chrome and ChromeDriver processes (Windows only)
#[cfg(target_os = "windows")]
async fn force_kill_chrome_processes() {
logger::log_info("Force-killing any remaining Chrome processes...").await;
let _ = tokio::process::Command::new("taskkill")
.args(["/F", "/IM", "chrome.exe"])
.output()
.await;
let _ = tokio::process::Command::new("taskkill")
.args(["/F", "/IM", "chromedriver.exe"])
.output()
.await;
}
#[cfg(not(target_os = "windows"))]
async fn force_kill_chrome_processes() {
// No-op on non-Windows platforms
}
/// Verify Chrome processes are cleaned up (Windows only)
#[cfg(target_os = "windows")]
async fn verify_chrome_cleanup() {
if let Ok(output) = tokio::process::Command::new("tasklist")
.args(["/FI", "IMAGENAME eq chrome.exe"])
.output()
.await
{
let stdout = String::from_utf8_lossy(&output.stdout);
let chrome_count = stdout.lines().filter(|line| line.contains("chrome.exe")).count();
if chrome_count > 0 {
logger::log_warn(&format!("⚠️ {} Chrome processes still running after cleanup!", chrome_count)).await;
} else {
logger::log_info("✓ All Chrome processes cleaned up").await;
} }
}; }
}
let paths = DataPaths::new(".")?; #[cfg(not(target_os = "windows"))]
async fn verify_chrome_cleanup() {
// No-op on non-Windows platforms
}
// Initialize monitoring system /// Complete cleanup sequence: shutdown pools, cleanup containers, kill processes
async fn perform_full_cleanup(
pool: &ChromeDriverPool,
proxy_pool: Option<&DockerVpnProxyPool>,
) {
shutdown_chrome_pool(pool).await;
if let Some(pp) = proxy_pool {
shutdown_proxy_pool(pp).await;
cleanup_all_proxy_containers().await.ok();
}
force_kill_chrome_processes().await;
}
/// Create temporary ChromeDriver pool, fetch VPN credentials, and cleanup
async fn fetch_vpn_credentials_with_temp_pool(
config: &Config,
paths: &DataPaths,
monitoring_handle: &monitoring::MonitoringHandle,
) -> Result<Option<Arc<DockerVpnProxyPool>>> {
logger::log_info("VPN Rotation Enabled Fetching latest VPNBook configs").await;
// Create temp pool
logger::log_info("Creating temporary ChromeDriver pool for VPN credential fetch...").await;
let temp_pool = Arc::new(ChromeDriverPool::new_with_proxy_and_task_limit(
None,
config,
Some(monitoring_handle.clone())
).await?);
// Fetch credentials
logger::log_info("Fetching VPNBook credentials...").await;
let (username, password, _files) = opnv::fetch_vpnbook_configs(&temp_pool, paths.cache_dir()).await?;
logger::log_info(&format!("VPNBook credentials → User: {}", username)).await;
// Cleanup temp pool
logger::log_info("Shutting down temporary pool...").await;
match temp_pool.shutdown().await {
Ok(()) => logger::log_info("✓ Temporary pool shut down successfully").await,
Err(e) => {
logger::log_error(&format!("✗ Temp pool shutdown error: {}", e)).await;
force_kill_chrome_processes().await;
}
}
tokio::time::sleep(tokio::time::Duration::from_secs(2)).await;
// Count VPN servers and create proxy pool
let server_count = std::fs::read_dir(paths.cache_openvpn_dir())?
.filter(|e| e.as_ref().unwrap().path().is_dir())
.count();
if server_count == 0 {
logger::log_warn("No VPN servers found continuing without VPN").await;
return Ok(None);
}
logger::log_info(&format!("Found {} VPN servers starting Docker proxy containers", server_count)).await;
let number_proxy_instances = config.proxy_instances_per_certificate.unwrap_or(1);
let proxy_pool = Arc::new(DockerVpnProxyPool::new(
paths.cache_openvpn_dir(),
username,
password,
number_proxy_instances
).await?);
logger::log_info(&format!("All {} Docker proxy containers started and ready", proxy_pool.num_proxies())).await;
// Emit proxy connection events
for i in 0..proxy_pool.num_proxies() {
if let Some(proxy_info) = proxy_pool.get_proxy_info(i) {
monitoring_handle.emit(monitoring::MonitoringEvent::ProxyConnected {
container_name: proxy_info.container_name.clone(),
ip_address: proxy_info.ip_address.clone(),
port: proxy_info.port,
});
}
}
Ok(Some(proxy_pool))
}
/// Initialize monitoring system
async fn initialize_monitoring(
config: &Config,
paths: &DataPaths,
) -> Result<(monitoring::MonitoringHandle, tokio::task::JoinHandle<()>)> {
let config_snapshot = ConfigSnapshot { let config_snapshot = ConfigSnapshot {
max_parallel_instances: config.max_parallel_instances, max_parallel_instances: config.max_parallel_instances,
max_tasks_per_instance: config.max_tasks_per_instance, max_tasks_per_instance: config.max_tasks_per_instance,
@@ -50,13 +186,12 @@ async fn main() -> Result<()> {
max_retry_attempts: config.max_retry_attempts, max_retry_attempts: config.max_retry_attempts,
}; };
let (monitoring_handle, _monitoring_task) = init_monitoring( let (monitoring_handle, monitoring_task) = init_monitoring(
config_snapshot, config_snapshot,
paths.logs_dir().to_path_buf(), paths.logs_dir().to_path_buf(),
3030, // Dashboard port 3030,
).await?; ).await?;
// Emit pool initialization event
monitoring_handle.emit(monitoring::MonitoringEvent::PoolInitialized { monitoring_handle.emit(monitoring::MonitoringEvent::PoolInitialized {
pool_size: config.max_parallel_instances, pool_size: config.max_parallel_instances,
with_proxy: config.enable_vpn_rotation, with_proxy: config.enable_vpn_rotation,
@@ -65,129 +200,160 @@ async fn main() -> Result<()> {
logger::log_info("Monitoring dashboard available at http://localhost:3030").await; logger::log_info("Monitoring dashboard available at http://localhost:3030").await;
Ok((monitoring_handle, monitoring_task))
}
/// Setup Ctrl+C handler for graceful shutdown
fn setup_shutdown_handler(
shutdown_flag: Arc<AtomicBool>,
pool: Arc<ChromeDriverPool>,
proxy_pool: Option<Arc<DockerVpnProxyPool>>,
) {
tokio::spawn(async move {
tokio::signal::ctrl_c().await.ok();
logger::log_info("Ctrl+C received shutting down gracefully...").await;
shutdown_flag.store(true, Ordering::SeqCst);
tokio::time::sleep(tokio::time::Duration::from_secs(2)).await;
perform_full_cleanup(&pool, proxy_pool.as_deref()).await;
logger::log_info("Shutdown complete").await;
std::process::exit(0);
});
}
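/// Format an elapsed duration as DD::HH::MM::SS for the final runtime log line.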
fn format_duration(duration: Duration) -> String {
let total_seconds = duration.as_secs();
let days = total_seconds / 86400;
let hours = (total_seconds % 86400) / 3600;
let minutes = (total_seconds % 3600) / 60;
let seconds = total_seconds % 60;
format!("{:02}::{:02}::{:02}::{:02}", days, hours, minutes, seconds)
}
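/// Ensure state.jsonl exists in the integrity directory; opened with create+write,
/// so an existing file is never truncated.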
async fn create_state_file(paths: &DataPaths) -> Result<()> {
let integrity_path = paths.integrity_dir().join("state.jsonl");
// Use OpenOptions to create the file only if it doesn't exist
OpenOptions::new()
.create(true) // Create if it doesn't exist
.write(true) // Ensure we can write to the file
.open(&integrity_path)?;
logger::log_info(&format!("Checked or created file: {}", integrity_path.display())).await;
Ok(())
}
async fn visualize_checkpoint_dependencies(paths: &DataPaths) -> Result<()> {
// Add more detailed error handling
match StateManager::new(
paths.integrity_dir(),
).await {
Ok(manager) => {
logger::log_info("✓ Dependency configuration loaded successfully").await;
manager.print_dependency_graph();
let dot = manager.get_dependency_config().to_dot();
let dot_path = paths.integrity_dir().join("checkpoint_dependencies.dot");
std::fs::write(&dot_path, dot)?;
logger::log_info(&format!("✓ DOT file written to: {}", dot_path.display())).await;
Ok(())
}
Err(e) => {
logger::log_error(&format!("✗ Failed to load dependency config: {}", e)).await;
Err(e)
}
}
}
// ============================================================================
// MAIN FUNCTION - Simplified with extracted helpers
// ============================================================================
#[tokio::main]
async fn main() -> Result<()> {
// Initial setup
let start = Instant::now();
let paths = DataPaths::new(".")?;
start_docker_desktop().await;
cleanup_all_proxy_containers().await.ok();
create_state_file(&paths).await.ok();
visualize_checkpoint_dependencies(&paths).await.ok();
let config = Config::load().unwrap_or_else(|_| {
eprintln!("Using default configuration");
Config::default()
});
// Initialize monitoring
let (monitoring_handle, _monitoring_task) = initialize_monitoring(&config, &paths).await?;
// Initialize debug logger
logger::init_debug_logger(paths.logs_dir()).await.ok(); logger::init_debug_logger(paths.logs_dir()).await.ok();
logger::log_info("=== Event Backtest Engine Started ===").await; logger::log_info("=== Economic Webscraper Started ===").await;
logger::log_info(&format!( logger::log_info(&format!(
"Config → parallel_instances: {}, task_limit: {} vpn_rotation: {}", "Config → parallel_instances: {}, task_limit: {}, vpn_rotation: {}, proxy_instances_per_certificate: {:?}",
config.max_parallel_instances, config.max_parallel_instances,
config.max_tasks_per_instance, config.max_tasks_per_instance,
config.enable_vpn_rotation config.enable_vpn_rotation,
config.proxy_instances_per_certificate
)).await; )).await;
// Simple shutdown flag
let shutdown_flag = Arc::new(AtomicBool::new(false)); let shutdown_flag = Arc::new(AtomicBool::new(false));
// === Step 1: Fetch VPNBook configs === // Fetch VPN credentials and setup proxy pool if enabled
let proxy_pool: Option<Arc<DockerVpnProxyPool>> = if config.enable_vpn_rotation { let proxy_pool = if config.enable_vpn_rotation {
logger::log_info("VPN Rotation Enabled Fetching latest VPNBook configs").await; fetch_vpn_credentials_with_temp_pool(&config, &paths, &monitoring_handle).await?
let temp_pool = Arc::new(ChromeDriverPool::new_with_proxy_and_task_limit(None, &config, Some(monitoring_handle.clone())).await?);
let (username, password, _files) = opnv::fetch_vpnbook_configs(&temp_pool, paths.cache_dir()).await?;
logger::log_info(&format!("VPNBook credentials → User: {}", username)).await;
let server_count = std::fs::read_dir(paths.cache_openvpn_dir())?
.filter(|e| e.as_ref().unwrap().path().is_dir())
.count();
if server_count == 0 {
logger::log_warn("No VPN servers found continuing without VPN").await;
None
} else {
logger::log_info(&format!("Found {} VPN servers starting Docker proxy containers", server_count)).await;
let pp = Arc::new(DockerVpnProxyPool::new(paths.cache_openvpn_dir(), username, password).await?);
logger::log_info(&format!("All {} Docker proxy containers started and ready", pp.num_proxies())).await;
for i in 0..pp.num_proxies() {
if let Some(proxy_info) = pp.get_proxy_info(i) {
monitoring_handle.emit(monitoring::MonitoringEvent::ProxyConnected {
container_name: proxy_info.container_name.clone(),
ip_address: proxy_info.ip_address.clone(),
port: proxy_info.port,
});
}
}
Some(pp)
}
} else { } else {
logger::log_info("VPN rotation disabled using direct connection").await; logger::log_info("VPN rotation disabled using direct connection").await;
None None
}; };
// === Step 2: Initialize ChromeDriver pool === // Create main ChromeDriver pool
let pool_size = config.max_parallel_instances; logger::log_info(&format!("Creating ChromeDriver pool with {} instances...", config.max_parallel_instances)).await;
let task_limit = config.max_tasks_per_instance;
logger::log_info(&format!("Creating ChromeDriver pool with {} instances...", pool_size)).await; let pool = Arc::new(ChromeDriverPool::new_with_proxy_and_task_limit(
proxy_pool.clone(),
&config,
Some(monitoring_handle.clone())
).await?);
let pool = Arc::new( logger::log_info(&format!("ChromeDriver pool ready with {} instances", config.max_parallel_instances)).await;
if task_limit > 0 {
ChromeDriverPool::new_with_proxy_and_task_limit(proxy_pool.clone(), &config, Some(monitoring_handle.clone())).await? // Setup Ctrl+C handler
} else { setup_shutdown_handler(
ChromeDriverPool::new_with_proxy_and_task_limit(proxy_pool.clone(), &config, Some(monitoring_handle.clone())).await? Arc::clone(&shutdown_flag),
} Arc::clone(&pool),
proxy_pool.clone(),
); );
logger::log_info(&format!("ChromeDriver pool ready with {} instances", pool_size)).await; // Run scraping jobs
check_shutdown!(&shutdown_flag);
// === Step 3: Ctrl+C handler ===
{
let shutdown_flag_clone = Arc::clone(&shutdown_flag);
let pool_clone = Arc::clone(&pool);
let proxy_clone = proxy_pool.clone();
tokio::spawn(async move {
tokio::signal::ctrl_c().await.ok();
logger::log_info("Ctrl+C received shutting down gracefully...").await;
// Set flag first
shutdown_flag_clone.store(true, Ordering::SeqCst);
// Wait a bit for tasks to notice
tokio::time::sleep(tokio::time::Duration::from_secs(2)).await;
// Cleanup
if let Err(e) = (&*pool_clone).shutdown().await {
logger::log_error(&format!("Error during pool shutdown: {}", e)).await;
}
if let Some(pp) = proxy_clone {
if let Err(e) = pp.shutdown().await {
logger::log_warn(&format!("Failed to stop Docker containers: {}", e)).await;
} else {
logger::log_info("All Docker VPN containers stopped").await;
}
}
let _ = cleanup_all_proxy_containers().await;
std::process::exit(0);
});
}
// === Step 4: Run scraping jobs ===
logger::log_info("--- Starting ECONOMIC data update ---").await; logger::log_info("--- Starting ECONOMIC data update ---").await;
economic::run_full_update(&config, &pool).await?; economic::run_full_update(&config, &pool, &shutdown_flag).await?;
logger::log_info("Economic update completed").await; logger::log_info("Economic update completed").await;
if !shutdown_flag.load(Ordering::SeqCst) { check_shutdown!(&shutdown_flag);
logger::log_info("--- Starting CORPORATE data update ---").await;
corporate::run_full_update(&config, &pool, &shutdown_flag).await?;
logger::log_info("Corporate update completed").await;
}
// === Step 5: Final cleanup === logger::log_info("--- Starting CORPORATE data update ---").await;
if !shutdown_flag.load(Ordering::SeqCst) { corporate::run_full_update(&config, &pool, &shutdown_flag).await?;
logger::log_info("Shutting down ChromeDriver pool...").await; logger::log_info("Corporate update completed").await;
pool.shutdown().await?;
if let Some(pp) = proxy_pool { check_shutdown!(&shutdown_flag);
logger::log_info("Stopping Docker VPN proxy containers...").await;
pp.shutdown().await?;
cleanup_all_proxy_containers().await.ok();
}
logger::log_info("=== Application finished successfully ===").await; // Final cleanup if not already shutting down
} perform_full_cleanup(&pool, proxy_pool.as_deref()).await;
verify_chrome_cleanup().await;
logger::log_info(&format!("=== Application finished after {} ===", format_duration(start.elapsed()))).await;
logger::log_info("=== Application finished successfully ===").await;
Ok(()) Ok(())
} }

View File

@@ -250,6 +250,35 @@
text-transform: uppercase; text-transform: uppercase;
} }
/* Yahoo Stats */
.yahoo-stats-grid {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(180px, 1fr));
gap: 12px;
margin-top: 10px;
}
.yahoo-stat-box {
background: #2a3a4a;
padding: 15px;
border-radius: 5px;
text-align: center;
border-left: 4px solid #FF9800;
}
.yahoo-stat-value {
font-size: 28px;
font-weight: bold;
color: #FF9800;
margin-bottom: 5px;
}
.yahoo-stat-label {
font-size: 11px;
color: #aaa;
text-transform: uppercase;
}
/* Logs */ /* Logs */
.log-container { .log-container {
max-height: 300px; max-height: 300px;
@@ -339,6 +368,31 @@
.pulse { .pulse {
animation: pulse 2s infinite; animation: pulse 2s infinite;
} }
/* Yahoo Client Box */
.yahoo-client-box {
background: #2a3a4a;
border: 2px solid #FF9800;
border-radius: 5px;
padding: 12px;
display: flex;
gap: 0;
overflow: hidden;
}
.yahoo-client-side {
flex: 1;
padding: 12px;
}
.yahoo-client-side.left {
background: #3a4a5a;
border-right: 1px solid #555;
}
.yahoo-client-side.right {
background: #2a3a4a;
}
</style> </style>
</head> </head>
<body> <body>
@@ -363,6 +417,13 @@
<div class="instance-grid" id="instances"></div> <div class="instance-grid" id="instances"></div>
</div> </div>
<!-- Yahoo API Section -->
<div class="section">
<div class="section-title">📈 YAHOO API METRICS</div>
<div class="yahoo-stats-grid" id="yahoo-stats"></div>
<div class="instance-grid" id="yahoo-clients"></div>
</div>
<!-- Global Metrics Section --> <!-- Global Metrics Section -->
<div class="section"> <div class="section">
<div class="section-title">📊 GLOBAL METRICS</div> <div class="section-title">📊 GLOBAL METRICS</div>
@@ -432,6 +493,8 @@
updateConfig(state.config); updateConfig(state.config);
updateInstances(state.instances); updateInstances(state.instances);
updateGlobalStats(state.global); updateGlobalStats(state.global);
updateYahooStats(state.global);
updateYahooClients(state.yahoo_clients);
updateLogs(state.logs); updateLogs(state.logs);
} }
@@ -480,6 +543,10 @@
? ((inst.success_count / inst.total_requests) * 100).toFixed(1) ? ((inst.success_count / inst.total_requests) * 100).toFixed(1)
: '0.0'; : '0.0';
const yahooSuccessRate = inst.yahoo_requests > 0
? ((inst.yahoo_success / inst.yahoo_requests) * 100).toFixed(1)
: '0.0';
return ` return `
<div class="instance-box ${statusClass}"> <div class="instance-box ${statusClass}">
<div class="instance-side"> <div class="instance-side">
@@ -511,6 +578,16 @@
${successRate}% ${successRate}%
</span> </span>
</div> </div>
<div class="metric-row">
<span class="metric-label">Yahoo Requests</span>
<span class="metric-value">${inst.yahoo_requests}</span>
</div>
<div class="metric-row">
<span class="metric-label">Yahoo Rate</span>
<span class="metric-value ${yahooSuccessRate < 50 ? 'danger' : yahooSuccessRate < 80 ? 'warning' : ''}">
${yahooSuccessRate}%
</span>
</div>
<div class="metric-row"> <div class="metric-row">
<span class="metric-label">Last Activity</span> <span class="metric-label">Last Activity</span>
<span class="metric-value">${inst.last_activity}</span> <span class="metric-value">${inst.last_activity}</span>
@@ -556,6 +633,115 @@
}).join(''); }).join('');
} }
function updateYahooStats(global) {
const container = document.getElementById('yahoo-stats');
const yahooSuccessRate = global.total_yahoo_requests > 0
? ((global.successful_yahoo_requests / global.total_yahoo_requests) * 100).toFixed(1)
: '0.0';
container.innerHTML = `
<div class="yahoo-stat-box">
<div class="yahoo-stat-value">${global.total_yahoo_requests || 0}</div>
<div class="yahoo-stat-label">Total Requests</div>
</div>
<div class="yahoo-stat-box">
<div class="yahoo-stat-value">${yahooSuccessRate}%</div>
<div class="yahoo-stat-label">Success Rate</div>
</div>
<div class="yahoo-stat-box">
<div class="yahoo-stat-value">${global.successful_yahoo_requests || 0}</div>
<div class="yahoo-stat-label">Successful</div>
</div>
<div class="yahoo-stat-box">
<div class="yahoo-stat-value">${global.failed_yahoo_requests || 0}</div>
<div class="yahoo-stat-label">Failed</div>
</div>
<div class="yahoo-stat-box">
<div class="yahoo-stat-value">${global.yahoo_client_count || 0}</div>
<div class="yahoo-stat-label">Active Clients</div>
</div>
<div class="yahoo-stat-box">
<div class="yahoo-stat-value">${global.yahoo_batch_requests || 0}</div>
<div class="yahoo-stat-label">Batch Requests</div>
</div>
<div class="yahoo-stat-box">
<div class="yahoo-stat-value">${global.yahoo_session_renewals || 0}</div>
<div class="yahoo-stat-label">Session Renewals</div>
</div>
`;
}
function updateYahooClients(yahooClients) {
const container = document.getElementById('yahoo-clients');
if (!yahooClients || yahooClients.length === 0) {
container.innerHTML = '<div style="text-align: center; padding: 40px; color: #666;">No Yahoo clients available</div>';
return;
}
container.innerHTML = yahooClients.map(client => {
const successRate = client.requests_total > 0
? ((client.requests_successful / client.requests_total) * 100).toFixed(1)
: '0.0';
return `
<div class="yahoo-client-box">
<div class="yahoo-client-side left">
<div class="side-header">
📊 Yahoo Client #${client.instance_id}
${client.has_proxy ? '🔗' : '🌐'}
</div>
<div class="metric-row">
<span class="metric-label">Total Requests</span>
<span class="metric-value">${client.requests_total}</span>
</div>
<div class="metric-row">
<span class="metric-label">Success / Fail</span>
<span class="metric-value">${client.requests_successful} / ${client.requests_failed}</span>
</div>
<div class="metric-row">
<span class="metric-label">Success Rate</span>
<span class="metric-value ${successRate < 50 ? 'danger' : successRate < 80 ? 'warning' : ''}">
${successRate}%
</span>
</div>
<div class="metric-row">
<span class="metric-label">Current / Max</span>
<span class="metric-value ${client.current_requests >= client.max_requests ? 'danger' : ''}">
${client.current_requests} / ${client.max_requests}
</span>
</div>
<div class="metric-row">
<span class="metric-label">Last Activity</span>
<span class="metric-value">${client.last_activity}</span>
</div>
</div>
<div class="yahoo-client-side right">
${client.proxy_info ? `
<div class="side-header">🔗 ${client.proxy_info.container_name}</div>
<div class="metric-row">
<span class="metric-label">IP Address</span>
<span class="metric-value">${client.proxy_info.ip_address}</span>
</div>
<div class="metric-row">
<span class="metric-label">Port</span>
<span class="metric-value">${client.proxy_info.port}</span>
</div>
<div class="metric-row">
<span class="metric-label">Status</span>
<span class="metric-value">${client.proxy_info.status}</span>
</div>
` : `
<div class="no-proxy">
${client.has_proxy ? '⚠️' : '🌐'}<br>
${client.has_proxy ? 'Proxy Not Connected' : 'Direct Connection'}
</div>
`}
</div>
</div>
`;
}).join('');
}
function updateGlobalStats(global) { function updateGlobalStats(global) {
const container = document.getElementById('global-stats'); const container = document.getElementById('global-stats');

View File

@@ -23,6 +23,11 @@ pub enum MonitoringEvent {
status: InstanceStatusChange, status: InstanceStatusChange,
}, },
InstanceSelected {
instance_id: usize,
half: usize,
},
// Task execution // Task execution
TaskStarted { TaskStarted {
instance_id: usize, instance_id: usize,
@@ -87,6 +92,45 @@ pub enum MonitoringEvent {
reason: String, reason: String,
}, },
// Yahoo API events
YahooRequestStarted {
instance_id: usize,
endpoint: String,
symbol: Option<String>,
},
YahooRequestCompleted {
instance_id: usize,
success: bool,
duration_ms: u64,
error: Option<String>,
},
YahooBatchRequestStarted {
count: usize,
symbols: Vec<String>,
endpoint: String,
},
YahooBatchRequestCompleted {
successful: usize,
failed: usize,
total: usize,
duration_ms: u64,
},
YahooClientCreated {
instance_id: usize,
has_proxy: bool,
max_requests: u32,
},
YahooClientReset {
instance_id: usize,
previous_requests: u32,
reason: String,
},
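    // Hypothetical usage sketch (not part of this diff): a Yahoo client holding a
    // MonitoringHandle would emit these variants around each call, for example:
    //   mon.emit(MonitoringEvent::YahooRequestStarted { instance_id, endpoint: "quote".into(), symbol: Some("AAPL".into()) });
    //   ... perform the HTTP request and measure its duration ...
    //   mon.emit(MonitoringEvent::YahooRequestCompleted { instance_id, success: true, duration_ms, error: None });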
// Logging // Logging
LogMessage { LogMessage {
level: LogLevel, level: LogLevel,

View File

@@ -9,6 +9,7 @@ pub struct DashboardState {
pub config: ConfigSnapshot, pub config: ConfigSnapshot,
pub instances: Vec<InstanceMetrics>, pub instances: Vec<InstanceMetrics>,
pub proxies: Vec<ProxyMetrics>, pub proxies: Vec<ProxyMetrics>,
pub yahoo_clients: Vec<YahooClientMetrics>,
pub global: GlobalMetrics, pub global: GlobalMetrics,
pub logs: Vec<LogEntry>, pub logs: Vec<LogEntry>,
} }
@@ -38,6 +39,14 @@ pub struct InstanceMetrics {
pub failure_count: usize, pub failure_count: usize,
pub connected_proxy: Option<ProxyInfo>, pub connected_proxy: Option<ProxyInfo>,
pub last_activity: String, // Timestamp pub last_activity: String, // Timestamp
pub yahoo_requests: usize,
pub yahoo_success: usize,
pub yahoo_failures: usize,
pub yahoo_success_rate: f64,
pub yahoo_current_requests: u32,
pub yahoo_max_requests: u32,
pub yahoo_last_endpoint: Option<String>,
pub yahoo_last_symbol: Option<String>,
} }
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
@@ -75,6 +84,20 @@ pub struct ProxyMetrics {
pub instances_using: Vec<usize>, pub instances_using: Vec<usize>,
} }
/// Metrics for a Yahoo client
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct YahooClientMetrics {
pub instance_id: usize,
pub requests_total: usize,
pub requests_successful: usize,
pub requests_failed: usize,
pub current_requests: u32,
pub max_requests: u32,
pub has_proxy: bool,
pub last_activity: String,
pub proxy_info: Option<ProxyInfo>,
}
/// Global pool metrics /// Global pool metrics
#[derive(Debug, Clone, Serialize, Deserialize)] #[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GlobalMetrics { pub struct GlobalMetrics {
@@ -88,6 +111,13 @@ pub struct GlobalMetrics {
pub bot_detection_hits: usize, pub bot_detection_hits: usize,
pub proxy_failures: usize, pub proxy_failures: usize,
pub uptime_seconds: u64, pub uptime_seconds: u64,
pub total_yahoo_requests: usize,
pub successful_yahoo_requests: usize,
pub failed_yahoo_requests: usize,
pub yahoo_success_rate: f64,
pub yahoo_batch_requests: usize,
pub yahoo_session_renewals: usize,
pub yahoo_client_count: usize,
} }
/// Log entry for display in dashboard /// Log entry for display in dashboard
@@ -111,6 +141,7 @@ pub enum LogLevel {
pub struct MonitoringState { pub struct MonitoringState {
pub instances: HashMap<usize, InstanceState>, pub instances: HashMap<usize, InstanceState>,
pub proxies: HashMap<String, ProxyState>, pub proxies: HashMap<String, ProxyState>,
pub yahoo_clients: HashMap<usize, YahooClientState>,
pub global: GlobalState, pub global: GlobalState,
pub start_time: Instant, pub start_time: Instant,
} }
@@ -128,6 +159,13 @@ pub struct InstanceState {
pub failure_count: usize, pub failure_count: usize,
pub connected_proxy: Option<ProxyInfo>, pub connected_proxy: Option<ProxyInfo>,
pub last_activity: Instant, pub last_activity: Instant,
pub yahoo_requests: usize,
pub yahoo_success: usize,
pub yahoo_failures: usize,
pub yahoo_current_requests: u32,
pub yahoo_max_requests: u32,
pub yahoo_last_endpoint: Option<String>,
pub yahoo_last_symbol: Option<String>,
} }
#[derive(Debug, Clone)] #[derive(Debug, Clone)]
@@ -139,6 +177,19 @@ pub struct ProxyState {
pub instances_using: Vec<usize>, pub instances_using: Vec<usize>,
} }
#[derive(Debug, Clone)]
pub struct YahooClientState {
pub instance_id: usize,
pub requests_total: usize,
pub requests_successful: usize,
pub requests_failed: usize,
pub current_requests: u32,
pub max_requests: u32,
pub has_proxy: bool,
pub last_activity: Instant,
pub proxy_info: Option<ProxyInfo>,
}
#[derive(Debug, Clone)] #[derive(Debug, Clone)]
pub struct GlobalState { pub struct GlobalState {
pub total_requests: usize, pub total_requests: usize,
@@ -149,6 +200,12 @@ pub struct GlobalState {
pub navigation_timeouts: usize, pub navigation_timeouts: usize,
pub bot_detection_hits: usize, pub bot_detection_hits: usize,
pub proxy_failures: usize, pub proxy_failures: usize,
pub total_yahoo_requests: usize,
pub successful_yahoo_requests: usize,
pub failed_yahoo_requests: usize,
pub yahoo_batch_requests: usize,
pub yahoo_session_renewals: usize,
pub yahoo_client_count: usize,
} }
impl MonitoringState { impl MonitoringState {
@@ -156,6 +213,7 @@ impl MonitoringState {
Self { Self {
instances: HashMap::new(), instances: HashMap::new(),
proxies: HashMap::new(), proxies: HashMap::new(),
yahoo_clients: HashMap::new(),
global: GlobalState { global: GlobalState {
total_requests: 0, total_requests: 0,
successful_requests: 0, successful_requests: 0,
@@ -165,6 +223,12 @@ impl MonitoringState {
navigation_timeouts: 0, navigation_timeouts: 0,
bot_detection_hits: 0, bot_detection_hits: 0,
proxy_failures: 0, proxy_failures: 0,
total_yahoo_requests: 0,
successful_yahoo_requests: 0,
failed_yahoo_requests: 0,
yahoo_batch_requests: 0,
yahoo_session_renewals: 0,
yahoo_client_count: 0,
}, },
start_time: Instant::now(), start_time: Instant::now(),
} }
@@ -175,18 +239,34 @@ impl MonitoringState {
let instances: Vec<InstanceMetrics> = self let instances: Vec<InstanceMetrics> = self
.instances .instances
.values() .values()
            .map(|inst| {
                let yahoo_success_rate = if inst.yahoo_success + inst.yahoo_failures > 0 {
                    (inst.yahoo_success as f64 / (inst.yahoo_success + inst.yahoo_failures) as f64) * 100.0
                } else {
                    0.0
                };
                InstanceMetrics {
                    id: inst.id,
                    status: inst.status.clone(),
                    current_task: inst.current_task.clone(),
                    tasks_current_session: inst.tasks_current_session,
                    tasks_max: inst.tasks_max,
                    session_requests: inst.session_requests,
                    total_requests: inst.total_requests,
                    success_count: inst.success_count,
                    failure_count: inst.failure_count,
                    connected_proxy: inst.connected_proxy.clone(),
                    last_activity: format_timestamp(inst.last_activity),
                    yahoo_requests: inst.yahoo_requests,
                    yahoo_success: inst.yahoo_success,
                    yahoo_failures: inst.yahoo_failures,
                    yahoo_success_rate,
                    yahoo_current_requests: inst.yahoo_current_requests,
                    yahoo_max_requests: inst.yahoo_max_requests,
                    yahoo_last_endpoint: inst.yahoo_last_endpoint.clone(),
                    yahoo_last_symbol: inst.yahoo_last_symbol.clone(),
                }
            })
            .collect();
@@ -202,12 +282,34 @@ impl MonitoringState {
}) })
.collect(); .collect();
let yahoo_clients: Vec<YahooClientMetrics> = self
.yahoo_clients
.values()
.map(|client| YahooClientMetrics {
instance_id: client.instance_id,
requests_total: client.requests_total,
requests_successful: client.requests_successful,
requests_failed: client.requests_failed,
current_requests: client.current_requests,
max_requests: client.max_requests,
has_proxy: client.has_proxy,
last_activity: format_timestamp(client.last_activity),
proxy_info: client.proxy_info.clone(),
})
.collect();
let success_rate = if self.global.total_requests > 0 { let success_rate = if self.global.total_requests > 0 {
(self.global.successful_requests as f64 / self.global.total_requests as f64) * 100.0 (self.global.successful_requests as f64 / self.global.total_requests as f64) * 100.0
} else { } else {
0.0 0.0
}; };
let yahoo_success_rate = if self.global.total_yahoo_requests > 0 {
(self.global.successful_yahoo_requests as f64 / self.global.total_yahoo_requests as f64) * 100.0
} else {
0.0
};
let global = GlobalMetrics { let global = GlobalMetrics {
total_requests: self.global.total_requests, total_requests: self.global.total_requests,
successful_requests: self.global.successful_requests, successful_requests: self.global.successful_requests,
@@ -219,12 +321,20 @@ impl MonitoringState {
bot_detection_hits: self.global.bot_detection_hits, bot_detection_hits: self.global.bot_detection_hits,
proxy_failures: self.global.proxy_failures, proxy_failures: self.global.proxy_failures,
uptime_seconds: self.start_time.elapsed().as_secs(), uptime_seconds: self.start_time.elapsed().as_secs(),
total_yahoo_requests: self.global.total_yahoo_requests,
successful_yahoo_requests: self.global.successful_yahoo_requests,
failed_yahoo_requests: self.global.failed_yahoo_requests,
yahoo_success_rate,
yahoo_batch_requests: self.global.yahoo_batch_requests,
yahoo_session_renewals: self.global.yahoo_session_renewals,
yahoo_client_count: self.global.yahoo_client_count,
}; };
DashboardState { DashboardState {
config, config,
instances, instances,
proxies, proxies,
yahoo_clients,
global, global,
logs, logs,
} }
@@ -233,7 +343,6 @@ impl MonitoringState {
fn format_timestamp(instant: Instant) -> String { fn format_timestamp(instant: Instant) -> String {
use chrono::Local; use chrono::Local;
// This is a placeholder - in real impl we'd track actual wall-clock time
Local::now().format("%H:%M:%S").to_string() Local::now().format("%H:%M:%S").to_string()
} }
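The removed placeholder comment above pointed out that this helper ignores the Instant argument and just prints the current wall-clock time. One possible fix, stated here only as an assumption and not as part of this diff, is to record wall-clock values directly instead of Instant:

// Hypothetical alternative, assuming callers store chrono::DateTime<Local> instead of Instant.
fn format_wallclock(ts: chrono::DateTime<chrono::Local>) -> String {
    ts.format("%H:%M:%S").to_string()
}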

View File

@@ -76,6 +76,13 @@ impl MonitoringService {
failure_count: 0, failure_count: 0,
connected_proxy: proxy.clone(), connected_proxy: proxy.clone(),
last_activity: Instant::now(), last_activity: Instant::now(),
yahoo_requests: 0,
yahoo_success: 0,
yahoo_failures: 0,
yahoo_current_requests: 0,
yahoo_max_requests: 0,
yahoo_last_endpoint: None,
yahoo_last_symbol: None,
}, },
); );
@@ -107,6 +114,10 @@ impl MonitoringService {
} }
} }
MonitoringEvent::InstanceSelected { instance_id, half } => {
self.log_info(format!("Instance #{} selected (half {})", instance_id, half)).await;
}
MonitoringEvent::TaskStarted { instance_id, url } => { MonitoringEvent::TaskStarted { instance_id, url } => {
let mut state = self.state.write().await; let mut state = self.state.write().await;
if let Some(inst) = state.instances.get_mut(&instance_id) { if let Some(inst) = state.instances.get_mut(&instance_id) {
@@ -189,9 +200,9 @@ impl MonitoringService {
if let Some(inst) = state.instances.get(&instance_id) { if let Some(inst) = state.instances.get(&instance_id) {
Some(SessionSummary { Some(SessionSummary {
instance_id, instance_id,
session_start: "N/A".to_string(), // We'd need to track this session_start: "N/A".to_string(),
session_end: Local::now().format("%Y-%m-%d %H:%M:%S").to_string(), session_end: Local::now().format("%Y-%m-%d %H:%M:%S").to_string(),
duration_seconds: 0, // We'd need to track session start time duration_seconds: 0,
total_requests: old_request_count, total_requests: old_request_count,
successful_requests: inst.success_count, successful_requests: inst.success_count,
failed_requests: inst.failure_count, failed_requests: inst.failure_count,
@@ -279,6 +290,154 @@ impl MonitoringService {
self.log_info(format!("Pool rotation triggered: {}", reason)).await; self.log_info(format!("Pool rotation triggered: {}", reason)).await;
} }
// Yahoo API Events
MonitoringEvent::YahooRequestStarted { instance_id, endpoint, symbol } => {
let mut state = self.state.write().await;
// Update global Yahoo stats
state.global.total_yahoo_requests += 1;
// Update instance stats
if let Some(inst) = state.instances.get_mut(&instance_id) {
inst.yahoo_requests += 1;
inst.yahoo_current_requests += 1;
inst.yahoo_last_endpoint = Some(endpoint.clone());
inst.yahoo_last_symbol = symbol.clone();
inst.last_activity = Instant::now();
}
// Update Yahoo client stats
if let Some(client) = state.yahoo_clients.get_mut(&instance_id) {
client.requests_total += 1;
client.current_requests += 1;
client.last_activity = Instant::now();
}
self.log_info(format!(
"YahooClient[{}] started request: {} {}",
instance_id,
endpoint,
symbol.unwrap_or_else(|| "search".to_string())
)).await;
}
MonitoringEvent::YahooRequestCompleted { instance_id, success, duration_ms, error } => {
let mut state = self.state.write().await;
// Update global Yahoo stats
if success {
state.global.successful_yahoo_requests += 1;
} else {
state.global.failed_yahoo_requests += 1;
}
// Update instance stats
if let Some(inst) = state.instances.get_mut(&instance_id) {
inst.yahoo_current_requests = inst.yahoo_current_requests.saturating_sub(1);
if success {
inst.yahoo_success += 1;
} else {
inst.yahoo_failures += 1;
}
inst.last_activity = Instant::now();
}
// Update Yahoo client stats
if let Some(client) = state.yahoo_clients.get_mut(&instance_id) {
client.current_requests = client.current_requests.saturating_sub(1);
if success {
client.requests_successful += 1;
} else {
client.requests_failed += 1;
}
client.last_activity = Instant::now();
}
if success {
self.log_info(format!(
"YahooClient[{}] completed request in {}ms",
instance_id, duration_ms
)).await;
} else {
self.log_error(format!(
"YahooClient[{}] failed request in {}ms: {}",
instance_id,
duration_ms,
error.unwrap_or_else(|| "unknown error".to_string())
)).await;
}
}
MonitoringEvent::YahooBatchRequestStarted { count, symbols, endpoint } => {
let mut state = self.state.write().await;
state.global.yahoo_batch_requests += 1;
self.log_info(format!(
"Yahoo batch request started: {} symbols, endpoint: {}",
count, endpoint
)).await;
if !symbols.is_empty() {
self.log_debug(format!(
"Batch symbols: {}",
symbols.join(", ")
)).await;
}
}
MonitoringEvent::YahooBatchRequestCompleted { successful, failed, total, duration_ms } => {
let success_rate = if total > 0 {
(successful as f64 / total as f64) * 100.0
} else {
0.0
};
self.log_info(format!(
"Yahoo batch completed: {}/{} successful ({:.1}%) in {}ms",
successful, total, success_rate, duration_ms
)).await;
}
MonitoringEvent::YahooClientCreated { instance_id, has_proxy, max_requests } => {
let mut state = self.state.write().await;
state.global.yahoo_client_count += 1;
state.yahoo_clients.insert(
instance_id,
YahooClientState {
instance_id,
requests_total: 0,
requests_successful: 0,
requests_failed: 0,
current_requests: 0,
max_requests,
has_proxy,
last_activity: Instant::now(),
proxy_info: None,
},
);
self.log_info(format!(
"YahooClient[{}] created (proxy: {}, max requests: {})",
instance_id, has_proxy, max_requests
)).await;
}
MonitoringEvent::YahooClientReset { instance_id, previous_requests, reason } => {
let mut state = self.state.write().await;
state.global.yahoo_session_renewals += 1;
if let Some(client) = state.yahoo_clients.get_mut(&instance_id) {
client.current_requests = 0;
client.last_activity = Instant::now();
}
self.log_info(format!(
"YahooClient[{}] reset (had {} requests, reason: {})",
instance_id, previous_requests, reason
)).await;
}
MonitoringEvent::LogMessage { level, message } => { MonitoringEvent::LogMessage { level, message } => {
match level { match level {
crate::monitoring::events::LogLevel::Info => self.log_info(message).await, crate::monitoring::events::LogLevel::Info => self.log_info(message).await,
@@ -313,6 +472,17 @@ impl MonitoringService {
}).await; }).await;
} }
async fn log_debug(&self, message: String) {
// Only log debug if DEBUG_LOGGING is enabled
if std::env::var("DEBUG_LOGGING").is_ok() {
self.add_log(LogEntry {
timestamp: Local::now().format("%H:%M:%S").to_string(),
level: super::metrics::LogLevel::Info,
message: format!("[DEBUG] {}", message),
}).await;
}
}
async fn add_log(&self, entry: LogEntry) { async fn add_log(&self, entry: LogEntry) {
let mut logs = self.logs.write().await; let mut logs = self.logs.write().await;
if logs.len() >= MAX_LOGS { if logs.len() >= MAX_LOGS {

View File

@@ -1,16 +1,26 @@
use anyhow::{anyhow, Context, Result}; use anyhow::{anyhow, Context, Result};
use futures::future::join_all; use futures::future::join_all;
use std::{collections::HashSet, path::{Path, PathBuf}, sync::{Arc, RwLock}, time::Duration};
use tokio::{process::Command, time::{sleep}}; use tokio::{process::Command, time::{sleep}};
use walkdir::WalkDir; use walkdir::WalkDir;
pub struct DockerVpnProxyPool { pub struct DockerVpnProxyPool {
container_names: Vec<String>, container_names: Vec<String>,
proxy_ports: Vec<u16>, // e.g., [10801, 10802, ...] proxy_ports: Vec<u16>, // e.g., [10801, 10802, ...]
dead_proxies: Arc<RwLock<HashSet<usize>>>,
} }
impl DockerVpnProxyPool { impl DockerVpnProxyPool {
pub async fn new(
    ovpn_dir: &Path,
    username: String,
    password: String,
    instances_per_ovpn: usize,
) -> Result<Self> {
    if instances_per_ovpn == 0 {
        return Err(anyhow!("instances_per_ovpn must be at least 1"));
    }
// Count hostnames (subdirs in ovpn_dir) // Count hostnames (subdirs in ovpn_dir)
let hostnames: Vec<_> = std::fs::read_dir(ovpn_dir)? let hostnames: Vec<_> = std::fs::read_dir(ovpn_dir)?
.filter_map(Result::ok) .filter_map(Result::ok)
@@ -23,14 +33,21 @@ impl DockerVpnProxyPool {
return Err(anyhow!("No VPN hostnames found in {:?}", ovpn_dir)); return Err(anyhow!("No VPN hostnames found in {:?}", ovpn_dir));
} }
crate::util::logger::log_info(&format!("Found {} VPN hostnames", num_servers)).await; // Calculate total containers: hostnames × instances_per_ovpn
let total_containers = num_servers * instances_per_ovpn;
let mut container_names = Vec::with_capacity(num_servers); crate::util::logger::log_info(&format!(
let mut proxy_ports = Vec::with_capacity(num_servers); "Found {} VPN hostnames × {} instances = {} total containers",
num_servers, instances_per_ovpn, total_containers
)).await;
let mut container_names = Vec::with_capacity(total_containers);
let mut proxy_ports = Vec::with_capacity(total_containers);
let base_port: u16 = 10800; let base_port: u16 = 10800;
let mut port_counter = 0u16;
// === STEP 1: Start ALL containers first === // === STEP 1: Start ALL containers first ===
for (i, hostname) in hostnames.iter().enumerate() { for hostname in hostnames.iter() {
// Pick tcp443.ovpn if exists, else first .ovpn // Pick tcp443.ovpn if exists, else first .ovpn
let hostname_dir = ovpn_dir.join(hostname); let hostname_dir = ovpn_dir.join(hostname);
let mut ovpn_path: Option<PathBuf> = None; let mut ovpn_path: Option<PathBuf> = None;
@@ -48,48 +65,58 @@ impl DockerVpnProxyPool {
let ovpn_path = ovpn_path.ok_or_else(|| anyhow!("No .ovpn found for {}", hostname))?; let ovpn_path = ovpn_path.ok_or_else(|| anyhow!("No .ovpn found for {}", hostname))?;
let name = format!("vpn-proxy-{}", i); // Spawn multiple instances for this .ovpn file
let port = base_port + i as u16 + 1; for instance_num in 0..instances_per_ovpn {
let name = format!("vpn-proxy-{}-{}", hostname, instance_num);
let port = base_port + port_counter + 1;
port_counter += 1;
// Clean up any existing container with the same name // Clean up any existing container with the same name
let _ = Command::new("docker") let _ = Command::new("docker")
.args(["rm", "-f", &name]) .args(["rm", "-f", &name])
.status() .status()
.await; .await;
// Run Docker container // Run Docker container
let status = Command::new("docker") let status = Command::new("docker")
.args([ .args([
"run", "-d", "run", "-d",
"--name", &name, "--name", &name,
"--cap-add=NET_ADMIN", "--cap-add=NET_ADMIN",
"--device", "/dev/net/tun", "--device", "/dev/net/tun",
"--sysctl", "net.ipv4.ip_forward=1", "--sysctl", "net.ipv4.ip_forward=1",
"-v", &format!("{}:/vpn/config.ovpn", ovpn_path.display()), "-v", &format!("{}:/vpn/config.ovpn", ovpn_path.display()),
"-e", &format!("VPN_USERNAME={}", username), "-e", &format!("VPN_USERNAME={}", username),
"-e", &format!("VPN_PASSWORD={}", password), "-e", &format!("VPN_PASSWORD={}", password),
"-p", &format!("{}:1080", port), "-p", &format!("{}:1080", port),
"rust-vpn-proxy", "rust-vpn-proxy",
]) ])
.status() .status()
.await .await
.context("Failed to run Docker")?; .context("Failed to run Docker")?;
if !status.success() { if !status.success() {
return Err(anyhow!("Docker run failed for {}", name)); return Err(anyhow!("Docker run failed for {}", name));
}
crate::util::logger::log_info(&format!(
"Started container {} on port {} (using {})",
name, port, ovpn_path.file_name().unwrap().to_string_lossy()
)).await;
container_names.push(name);
proxy_ports.push(port);
} }
crate::util::logger::log_info(&format!("Started container {} on port {} (waiting for VPN...)", name, port)).await;
container_names.push(name);
proxy_ports.push(port);
} }
// Brief pause to let containers start // Brief pause to let containers start
sleep(Duration::from_secs(8)).await; sleep(Duration::from_secs(8)).await;
crate::util::logger::log_info(&format!("All {} containers started, beginning health checks...", container_names.len())).await; crate::util::logger::log_info(&format!(
"All {} containers started, beginning health checks...",
container_names.len()
)).await;
// === STEP 2: Test ALL proxies in parallel with 10-second intervals === // === STEP 2: Test ALL proxies in parallel ===
let results = Self::test_all_proxies_parallel(&container_names, &proxy_ports).await; let results = Self::test_all_proxies_parallel(&container_names, &proxy_ports).await;
// Filter out failed containers // Filter out failed containers
@@ -100,8 +127,10 @@ impl DockerVpnProxyPool {
for (i, (container_name, port)) in container_names.into_iter().zip(proxy_ports.into_iter()).enumerate() { for (i, (container_name, port)) in container_names.into_iter().zip(proxy_ports.into_iter()).enumerate() {
match &results[i] { match &results[i] {
Ok(Some(ip)) => { Ok(Some(ip)) => {
crate::util::logger::log_info(&format!("✓ Container {} on port {} ready with IP: {}", crate::util::logger::log_info(&format!(
container_name, port, ip)).await; "✓ Container {} on port {} ready with IP: {}",
container_name, port, ip
)).await;
working_containers.push(container_name); working_containers.push(container_name);
working_ports.push(port); working_ports.push(port);
} }
@@ -113,14 +142,15 @@ impl DockerVpnProxyPool {
.ok() .ok()
.and_then(|output| String::from_utf8_lossy(&output.stdout).to_string().into()); .and_then(|output| String::from_utf8_lossy(&output.stdout).to_string().into());
crate::util::logger::log_error(&format!("✗ Container {} on port {} ready but IP detection failed. Logs: {:?}", crate::util::logger::log_error(&format!(
container_name, port, logs)).await; "✗ Container {} on port {} ready but IP detection failed. Logs: {:?}",
container_name, port, logs
)).await;
failed_count += 1; failed_count += 1;
// Clean up failed container // Clean up failed container
let _ = Self::cleanup_container(&container_name).await; let _ = Self::cleanup_container(&container_name).await;
} }
Err(e) => { Err(e) => {
// Get container logs to debug
let logs = Command::new("docker") let logs = Command::new("docker")
.args(["logs", "--tail", "20", &container_name]) .args(["logs", "--tail", "20", &container_name])
.output() .output()
@@ -128,8 +158,10 @@ impl DockerVpnProxyPool {
.ok() .ok()
.and_then(|output| String::from_utf8_lossy(&output.stdout).to_string().into()); .and_then(|output| String::from_utf8_lossy(&output.stdout).to_string().into());
crate::util::logger::log_error(&format!("✗ Container {} on port {} failed: {}. Logs: {:?}", crate::util::logger::log_error(&format!(
container_name, port, e, logs)).await; "✗ Container {} on port {} failed: {}. Logs: {:?}",
container_name, port, e, logs
)).await;
failed_count += 1; failed_count += 1;
// Clean up failed container // Clean up failed container
let _ = Self::cleanup_container(&container_name).await; let _ = Self::cleanup_container(&container_name).await;
@@ -138,19 +170,25 @@ impl DockerVpnProxyPool {
} }
if working_containers.is_empty() { if working_containers.is_empty() {
return Err(anyhow!("All {} VPN proxy containers failed to start", num_servers)); return Err(anyhow!("All {} VPN proxy containers failed to start", total_containers));
} }
crate::util::logger::log_info(&format!("Started {}/{} VPN proxy containers successfully", crate::util::logger::log_info(&format!(
working_containers.len(), num_servers)).await; "Started {}/{} VPN proxy containers successfully ({} hostnames × {} instances)",
working_containers.len(), total_containers, num_servers, instances_per_ovpn
)).await;
if failed_count > 0 { if failed_count > 0 {
crate::util::logger::log_warn(&format!("{} containers failed and were cleaned up", failed_count)).await; crate::util::logger::log_warn(&format!(
"{} containers failed and were cleaned up",
failed_count
)).await;
} }
Ok(Self { Ok(Self {
container_names: working_containers, container_names: working_containers,
proxy_ports: working_ports, proxy_ports: working_ports,
dead_proxies: Arc::new(RwLock::new(HashSet::new())),
}) })
} }
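For illustration, a small sketch of how the naming and port scheme from the constructor above plays out; the hostnames and instance count below are made-up values, not taken from a real run:

fn example_layout() -> Vec<(String, u16)> {
    let hostnames = ["de-1", "us-1"];    // assumed hostnames
    let instances_per_ovpn = 2usize;     // e.g. PROXY_INSTANCES_PER_CERTIFICATE=2
    let base_port: u16 = 10800;
    let mut port_counter = 0u16;
    let mut out = Vec::new();
    for hostname in hostnames {
        for instance_num in 0..instances_per_ovpn {
            let name = format!("vpn-proxy-{}-{}", hostname, instance_num);
            let port = base_port + port_counter + 1;
            port_counter += 1;
            out.push((name, port)); // ("vpn-proxy-de-1-0", 10801), ("vpn-proxy-de-1-1", 10802), ...
        }
    }
    out
}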
@@ -319,7 +357,7 @@ impl DockerVpnProxyPool {
pub fn get_proxy_url(&self, index: usize) -> String { pub fn get_proxy_url(&self, index: usize) -> String {
let port = self.proxy_ports[index % self.proxy_ports.len()]; let port = self.proxy_ports[index % self.proxy_ports.len()];
format!("socks5://localhost:{}", port) format!("socks5h://localhost:{}", port)
} }
pub fn num_proxies(&self) -> usize { pub fn num_proxies(&self) -> usize {
@@ -361,6 +399,69 @@ impl DockerVpnProxyPool {
pub fn get_container_name(&self, index: usize) -> Option<String> { pub fn get_container_name(&self, index: usize) -> Option<String> {
self.container_names.get(index).cloned() self.container_names.get(index).cloned()
} }
// Get a healthy proxy URL (skips dead proxies)
pub async fn get_healthy_proxy_url(&self, start_index: usize) -> Option<(usize, String)> {
let dead = match self.dead_proxies.read() {
Ok(value) => value,
Err(_) => return None,
};
let total = self.proxy_ports.len();
// Try up to 'total' proxies starting from start_index
for attempt in 0..total {
let index = (start_index + attempt) % total;
// Skip if dead
if dead.contains(&index) {
continue;
}
let port = self.proxy_ports[index];
return Some((index, format!("socks5h://localhost:{}", port)));
}
None
}
// Mark a proxy as dead
pub async fn mark_proxy_dead(&self, index: usize) -> Option<bool> {
// Acquire lock, perform mutation, and get values for logging
let (port, remaining, total) = {
let mut dead = match self.dead_proxies.write() {
Ok(value) => value,
Err(_) => return None,
};
dead.insert(index);
let port = self.proxy_ports.get(index).copied().unwrap_or(0);
let remaining = self.proxy_ports.len() - dead.len();
let total = self.proxy_ports.len();
// Lock is automatically dropped here when the scope ends
(port, remaining, total)
};
// Now we can await without holding the lock
crate::util::logger::log_warn(&format!(
"⚠ Marked proxy {} (port {}) as DEAD ({}/{} proxies remaining)",
index,
port,
remaining,
total
)).await;
Some(true)
}
// Get count of healthy proxies
pub async fn num_healthy_proxies(&self) -> Option<usize> {
let dead = match self.dead_proxies.read() {
Ok(value) => value,
Err(_) => return None,
};
Some(self.proxy_ports.len() - dead.len())
}
} }
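A rough caller-side sketch of how get_healthy_proxy_url and mark_proxy_dead could be combined; the request helper below is a placeholder, not an existing function in this repo:

// Hypothetical failover wrapper: pick a healthy proxy, mark it dead on failure so
// the next call rotates past it.
async fn fetch_with_failover(pool: &DockerVpnProxyPool, start: usize) -> anyhow::Result<String> {
    let (index, proxy_url) = pool
        .get_healthy_proxy_url(start)
        .await
        .ok_or_else(|| anyhow::anyhow!("no healthy proxies left"))?;
    match try_request_via(&proxy_url).await {
        Ok(body) => Ok(body),
        Err(e) => {
            let _ = pool.mark_proxy_dead(index).await;
            Err(e)
        }
    }
}

// Placeholder for whatever HTTP call the caller actually makes through the proxy.
async fn try_request_via(_proxy_url: &str) -> anyhow::Result<String> {
    unimplemented!()
}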
pub async fn cleanup_all_proxy_containers() -> Result<()> { pub async fn cleanup_all_proxy_containers() -> Result<()> {

377
src/scraper/hard_reset.rs Normal file
View File

@@ -0,0 +1,377 @@
// src/scraper/hard_reset.rs - FIXED: Proper cleanup without Arc leaks
use std::sync::{Arc, atomic::{AtomicBool, AtomicUsize, Ordering}};
use crate::{ChromeDriverPool, Config, logger, scraper::docker_vpn_proxy::{DockerVpnProxyPool, cleanup_all_proxy_containers}, util::directories::DataPaths};
/// Simple error counter for triggering hard resets
pub struct HardResetController {
consecutive_errors: AtomicUsize,
}
impl HardResetController {
pub fn new() -> Self {
Self {
consecutive_errors: AtomicUsize::new(0),
}
}
/// Record success - resets counter
pub fn record_success(&self) {
self.consecutive_errors.store(0, Ordering::SeqCst);
}
/// Record error - returns new count
pub fn record_error(&self) -> usize {
self.consecutive_errors.fetch_add(1, Ordering::SeqCst) + 1
}
/// Reset counter
pub fn reset(&self) {
self.consecutive_errors.store(0, Ordering::SeqCst);
}
/// Get current count
pub fn get_count(&self) -> usize {
self.consecutive_errors.load(Ordering::SeqCst)
}
}
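A minimal sketch of how this counter can drive a reset decision; the threshold is meant to mirror the HARD_RESET_ERROR_THRESHOLD constant declared later in webdriver.rs, and the surrounding control flow is assumed:

// Returns true when the caller should schedule perform_hard_reset() before the next batch.
fn on_task_result(controller: &HardResetController, ok: bool, threshold: usize) -> bool {
    if ok {
        controller.record_success();
        false
    } else {
        // record_error() returns the new consecutive-error count.
        controller.record_error() >= threshold
    }
}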
/// ✅ FIXED: Perform hard reset without Arc reference leaks
///
/// Key improvements:
/// 1. Don't clone old_pool - just shutdown through mutex guard
/// 2. Verify all processes killed before creating new pool
/// 3. Explicitly shutdown temp pools with error handling
/// 4. Add process counting/verification
pub async fn perform_hard_reset(
pool_mutex: &Arc<tokio::sync::Mutex<Arc<ChromeDriverPool>>>,
config: &Config,
paths: &DataPaths,
monitoring: &Option<crate::monitoring::MonitoringHandle>,
shutdown_flag: &Arc<AtomicBool>,
) -> anyhow::Result<()> {
//let number_proxy_instances = config.proxy_instances_per_certificate.unwrap_or(1);
logger::log_error("🔴 STARTING HARD RESET SEQUENCE").await;
// Check if shutdown was requested
if shutdown_flag.load(Ordering::SeqCst) {
logger::log_warn("Shutdown requested during hard reset, aborting").await;
return Ok(());
}
// ===== STEP 1: ACQUIRE POOL LOCK (NO CLONING!) =====
logger::log_info(" [1/12] Acquiring pool lock...").await;
let mut pool_guard = pool_mutex.lock().await;
// Get instance count before shutdown for verification
let old_instance_count = pool_guard.get_number_of_instances();
logger::log_info(&format!(" [1/12] Pool has {} instances", old_instance_count)).await;
// ===== STEP 2: SHUTDOWN OLD POOL (NO ARC CLONE!) =====
logger::log_info(" [2/12] Shutting down old pool (NO Arc clone)...").await;
// Shutdown through the Arc without cloning it
// This is safe because we hold the mutex lock
match pool_guard.shutdown().await {
Ok(()) => {
logger::log_info(" [2/12] ✓ Pool shutdown complete").await;
}
Err(e) => {
logger::log_error(&format!(" [2/12] ✗ Pool shutdown error: {}", e)).await;
// Continue anyway - we'll force-kill processes
}
}
// ===== STEP 3: FORCE-KILL ANY REMAINING CHROME PROCESSES =====
logger::log_info(" [3/12] Force-killing any remaining Chrome/ChromeDriver processes...").await;
#[cfg(target_os = "windows")]
{
// Kill all chrome.exe processes
let chrome_result = tokio::process::Command::new("taskkill")
.args(["/F", "/IM", "chrome.exe"])
.output()
.await;
match chrome_result {
Ok(output) if output.status.success() => {
logger::log_info(" [3/12] ✓ Chrome processes killed").await;
}
_ => {
logger::log_info(" [3/12] ⊘ No Chrome processes found").await;
}
}
// Kill all chromedriver.exe processes
let chromedriver_result = tokio::process::Command::new("taskkill")
.args(["/F", "/IM", "chromedriver.exe"])
.output()
.await;
match chromedriver_result {
Ok(output) if output.status.success() => {
logger::log_info(" [3/12] ✓ ChromeDriver processes killed").await;
}
_ => {
logger::log_info(" [3/12] ⊘ No ChromeDriver processes found").await;
}
}
}
#[cfg(not(target_os = "windows"))]
{
// Kill all chrome processes
let _ = tokio::process::Command::new("pkill")
.arg("chrome")
.output()
.await;
let _ = tokio::process::Command::new("pkill")
.arg("chromedriver")
.output()
.await;
logger::log_info(" [3/12] ✓ Force-killed Chrome/ChromeDriver").await;
}
// ===== STEP 4: SHUTDOWN PROXIES =====
logger::log_info(" [4/12] Shutting down proxy containers...").await;
cleanup_all_proxy_containers().await.ok();
// ===== STEP 5: WAIT FOR CLEANUP =====
logger::log_info(" [5/12] Waiting 30 seconds for cleanup...").await;
tokio::time::sleep(tokio::time::Duration::from_secs(30)).await;
// ===== STEP 6: VERIFY CLEANUP =====
logger::log_info(" [6/12] Verifying process cleanup...").await;
#[cfg(target_os = "windows")]
{
let check_chrome = tokio::process::Command::new("tasklist")
.args(["/FI", "IMAGENAME eq chrome.exe"])
.output()
.await;
if let Ok(output) = check_chrome {
let stdout = String::from_utf8_lossy(&output.stdout);
let chrome_count = stdout.lines().filter(|line| line.contains("chrome.exe")).count();
if chrome_count > 0 {
logger::log_warn(&format!(" [6/12] ⚠️ {} Chrome processes still running!", chrome_count)).await;
} else {
logger::log_info(" [6/12] ✓ No Chrome processes running").await;
}
}
}
// Check shutdown again
if shutdown_flag.load(Ordering::SeqCst) {
logger::log_warn("Shutdown requested during cleanup, aborting reset").await;
return Ok(());
}
// ===== STEP 7: RECREATE PROXY POOL =====
logger::log_info(" [7/12] Recreating proxy pool...").await;
let new_proxy_pool = if config.enable_vpn_rotation {
match recreate_proxy_pool_with_fresh_credentials(config, paths, monitoring, shutdown_flag).await {
Ok(pool) => {
logger::log_info(&format!(
" [7/12] ✓ Proxy pool created with {} proxies",
pool.num_proxies()
)).await;
Some(pool)
}
Err(e) => {
logger::log_warn(&format!(
" [7/12] ⚠️ Proxy creation failed: {}. Continuing without proxies.",
e
)).await;
None
}
}
} else {
logger::log_info(" [7/12] ⊘ VPN rotation disabled, skipping proxy pool").await;
None
};
// ===== STEP 8: RECREATE CHROMEDRIVER POOL =====
logger::log_info(" [8/12] Recreating ChromeDriver pool...").await;
let new_pool = Arc::new(
ChromeDriverPool::new_with_proxy_and_task_limit(
new_proxy_pool,
config,
monitoring.clone(),
).await?
);
logger::log_info(&format!(
" [8/12] ✓ ChromeDriver pool created with {} instances",
new_pool.get_number_of_instances()
)).await;
// ===== STEP 9: RESET ERROR COUNTER =====
logger::log_info(" [9/12] Resetting error counter...").await;
new_pool.get_reset_controller().reset();
logger::log_info(" [9/12] ✓ Error counter cleared").await;
// ===== STEP 10: REPLACE POOL ATOMICALLY =====
logger::log_info(" [10/12] Activating new pool...").await;
*pool_guard = new_pool;
drop(pool_guard);
logger::log_info(" [10/12] ✓ New pool activated").await;
// ===== STEP 11: EMIT MONITORING EVENT =====
logger::log_info(" [11/12] Updating monitoring...").await;
if let Some(mon) = monitoring {
mon.emit(crate::monitoring::MonitoringEvent::PoolInitialized {
pool_size: config.max_parallel_instances,
with_proxy: config.enable_vpn_rotation,
with_rotation: config.max_tasks_per_instance > 0,
});
}
// ===== STEP 12: FINAL VERIFICATION =====
logger::log_info(" [12/12] Final verification...").await;
#[cfg(target_os = "windows")]
{
let check_chrome = tokio::process::Command::new("tasklist")
.args(["/FI", "IMAGENAME eq chrome.exe"])
.output()
.await;
if let Ok(output) = check_chrome {
let stdout = String::from_utf8_lossy(&output.stdout);
let chrome_count = stdout.lines().filter(|line| line.contains("chrome.exe")).count();
logger::log_info(&format!(" [12/12] Chrome processes: {}", chrome_count)).await;
}
let check_chromedriver = tokio::process::Command::new("tasklist")
.args(["/FI", "IMAGENAME eq chromedriver.exe"])
.output()
.await;
if let Ok(output) = check_chromedriver {
let stdout = String::from_utf8_lossy(&output.stdout);
let chromedriver_count = stdout.lines().filter(|line| line.contains("chromedriver.exe")).count();
logger::log_info(&format!(" [12/12] ChromeDriver processes: {}", chromedriver_count)).await;
}
}
logger::log_info("✅ HARD RESET COMPLETE").await;
Ok(())
}
/// ✅ FIXED: Recreate proxy pool with temp pool that's properly shut down
async fn recreate_proxy_pool_with_fresh_credentials(
config: &Config,
paths: &DataPaths,
monitoring: &Option<crate::monitoring::MonitoringHandle>,
shutdown_flag: &Arc<AtomicBool>,
) -> anyhow::Result<Arc<DockerVpnProxyPool>> {
let number_proxy_instances = config.proxy_instances_per_certificate.unwrap_or(1);
// Check shutdown
if shutdown_flag.load(Ordering::SeqCst) {
return Err(anyhow::anyhow!("Shutdown requested during proxy recreation"));
}
logger::log_info(" [7.1] Creating temporary ChromeDriver pool for credential fetch...").await;
// Create temporary pool WITHOUT proxy
let temp_pool = Arc::new(
ChromeDriverPool::new_with_proxy_and_task_limit(
None, // No proxy for temp pool
config,
monitoring.clone(),
).await?
);
logger::log_info(" [7.2] Fetching fresh VPNBook credentials...").await;
// Fetch fresh VPNBook credentials
let (username, password, _files) = crate::util::opnv::fetch_vpnbook_configs(
&temp_pool,
paths.cache_dir()
).await?;
logger::log_info(&format!(" [7.3] Got credentials → User: {}", username)).await;
// ✅ FIXED: Properly shutdown temp pool with error handling
logger::log_info(" [7.4] Shutting down temporary pool...").await;
match temp_pool.shutdown().await {
Ok(()) => {
logger::log_info(" [7.4] ✓ Temp pool shut down successfully").await;
}
Err(e) => {
logger::log_error(&format!(" [7.4] ✗ Temp pool shutdown error: {}", e)).await;
// Force-kill processes as backup
#[cfg(target_os = "windows")]
{
let _ = tokio::process::Command::new("taskkill")
.args(["/F", "/IM", "chrome.exe"])
.output()
.await;
let _ = tokio::process::Command::new("taskkill")
.args(["/F", "/IM", "chromedriver.exe"])
.output()
.await;
}
}
}
// Wait a moment for temp pool cleanup
tokio::time::sleep(tokio::time::Duration::from_secs(2)).await;
// Check shutdown again
if shutdown_flag.load(Ordering::SeqCst) {
return Err(anyhow::anyhow!("Shutdown requested during proxy recreation"));
}
// Check if we have VPN server configs
let server_count = std::fs::read_dir(paths.cache_openvpn_dir())?
.filter(|e| e.as_ref().unwrap().path().is_dir())
.count();
if server_count == 0 {
return Err(anyhow::anyhow!("No VPN servers found after credential fetch"));
}
logger::log_info(&format!(
" [7.5] Found {} VPN servers → Creating proxy pool with {} instances per server...",
server_count,
number_proxy_instances
)).await;
// Create new proxy pool
let proxy_pool = Arc::new(
DockerVpnProxyPool::new(
paths.cache_openvpn_dir(),
username,
password,
number_proxy_instances,
).await?
);
logger::log_info(&format!(
" [7.6] ✓ Proxy pool ready with {} total proxies",
proxy_pool.num_proxies()
)).await;
// Emit proxy connected events for monitoring
if let Some(mon) = monitoring {
for i in 0..proxy_pool.num_proxies() {
if let Some(proxy_info) = proxy_pool.get_proxy_info(i) {
mon.emit(crate::monitoring::MonitoringEvent::ProxyConnected {
container_name: proxy_info.container_name.clone(),
ip_address: proxy_info.ip_address.clone(),
port: proxy_info.port,
});
}
}
}
Ok(proxy_pool)
}

View File

@@ -1,3 +1,6 @@
pub mod webdriver; pub mod webdriver;
pub mod docker_vpn_proxy; pub mod docker_vpn_proxy;
pub mod helpers; pub mod helpers;
pub mod hard_reset;
pub mod yahoo;
pub mod openfigi;

367
src/scraper/openfigi.rs Normal file
View File

@@ -0,0 +1,367 @@
// src/scraper/openfigi.rs - STREAMING VERSION
// Key changes: Never load entire GLEIF CSV or FIGI maps into memory
use crate::util::directories::DataPaths;
use crate::util::logger;
use crate::corporate::{types::*};
use reqwest::Client as HttpClient;
use reqwest::header::{HeaderMap, HeaderValue};
use serde_json::{json, Value};
use std::path::Path;
use tokio::time::{sleep, Duration};
use tokio::fs as tokio_fs;
use anyhow::{Context, anyhow};
#[derive(Clone)]
pub struct OpenFigiClient {
pub client: HttpClient,
pub has_key: bool,
}
impl OpenFigiClient {
pub async fn new() -> anyhow::Result<Self> {
let api_key = dotenvy::var("OPENFIGI_API_KEY").ok();
let has_key = api_key.is_some();
let mut builder = HttpClient::builder()
.user_agent("Mozilla/5.0 (compatible; OpenFIGI-Rust/1.0)")
.timeout(Duration::from_secs(30));
if let Some(key) = &api_key {
let mut headers = HeaderMap::new();
headers.insert("X-OPENFIGI-APIKEY", HeaderValue::from_str(key)?);
builder = builder.default_headers(headers);
}
let client = builder.build().context("Failed to build HTTP client")?;
logger::log_info(&format!("OpenFIGI client: {}",
if has_key { "with API key" } else { "no key" })).await;
Ok(Self { client, has_key })
}
pub async fn map_isins_to_figi_infos(&self, isins: &[String]) -> anyhow::Result<Vec<FigiData>> {
if isins.is_empty() {
return Ok(vec![]);
}
let mut all_figi_infos = Vec::new();
let chunk_size = if self.has_key { 100 } else { 5 };
let inter_sleep = if self.has_key {
Duration::from_millis(240)
} else {
Duration::from_millis(2400)
};
for chunk in isins.chunks(chunk_size) {
let jobs: Vec<Value> = chunk.iter()
.map(|isin| json!({
"idType": "ID_ISIN",
"idValue": isin,
}))
.collect();
let mut retry_count = 0;
let max_retries = 5;
let mut backoff_ms = 1000u64;
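            // Worked example of this backoff: delays of 1000 ms, 2000, 4000 and 8000 between
            // attempts, then the error is returned after max_retries (5); the 60 s cap below
            // only matters for longer retry chains.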
loop {
let resp_result = self.client
.post("https://api.openfigi.com/v3/mapping")
.header("Content-Type", "application/json")
.json(&jobs)
.send()
.await;
let resp = match resp_result {
Ok(r) => r,
Err(e) => {
retry_count += 1;
if retry_count >= max_retries {
let err_msg = format!("Failed to send mapping request after {} retries: {}", max_retries, e);
logger::log_error(&err_msg).await;
return Err(anyhow!(err_msg));
}
let warn_msg = format!("Transient error sending mapping request (attempt {}/{}): {}", retry_count, max_retries, e);
logger::log_warn(&warn_msg).await;
let retry_msg = format!(" Retrying in {}ms...", backoff_ms);
logger::log_info(&retry_msg).await;
sleep(Duration::from_millis(backoff_ms)).await;
backoff_ms = (backoff_ms * 2).min(60000); // Cap at 60s
continue;
}
};
let status = resp.status();
let headers = resp.headers().clone();
let body = resp.text().await?;
if status == 429 {
let reset_sec = headers
.get("ratelimit-reset")
.and_then(|v| v.to_str().ok())
.and_then(|s| s.parse::<u64>().ok())
.unwrap_or(10);
sleep(Duration::from_secs(reset_sec.max(10))).await;
continue;
} else if !status.is_success() {
if status.is_server_error() && retry_count < max_retries {
retry_count += 1;
sleep(Duration::from_millis(backoff_ms)).await;
backoff_ms = (backoff_ms * 2).min(60000);
continue;
}
return Err(anyhow!("OpenFIGI error {}: {}", status, body));
}
let results: Vec<Value> = serde_json::from_str(&body)?;
for (isin, result) in chunk.iter().zip(results) {
if let Some(data) = result["data"].as_array() {
for item in data {
if let Some(figi) = item["figi"].as_str() {
all_figi_infos.push(FigiData {
isin: isin.clone(),
figi: figi.to_string(),
name: item["name"].as_str().unwrap_or("").to_string(),
ticker: item["ticker"].as_str().unwrap_or("").to_string(),
exch_code: item["exchCode"].as_str().unwrap_or("").to_string(),
composite_figi: item["compositeFIGI"].as_str().unwrap_or("").to_string(),
security_type: item["securityType"].as_str().unwrap_or("").to_string(),
market_sector: item["marketSector"].as_str().unwrap_or("").to_string(),
share_class_figi: item["shareClassFIGI"].as_str().unwrap_or("").to_string(),
security_type2: item["securityType2"].as_str().unwrap_or("").to_string(),
security_description: item["securityDescription"].as_str().unwrap_or("").to_string(),
});
}
}
}
}
break;
}
sleep(inter_sleep).await;
}
Ok(all_figi_infos)
}
}
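A short usage sketch for the client above; the ISINs are illustrative examples and OPENFIGI_API_KEY is read from the environment as in new():

async fn example_isin_mapping() -> anyhow::Result<()> {
    let client = OpenFigiClient::new().await?;
    // Example ISINs, purely for illustration.
    let isins = vec!["US0378331005".to_string(), "DE0007664039".to_string()];
    let figi_infos = client.map_isins_to_figi_infos(&isins).await?;
    for info in &figi_infos {
        println!("{} -> {} ({} / {})", info.isin, info.figi, info.ticker, info.exch_code);
    }
    Ok(())
}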
/// Fetches and caches the list of valid securityType values.
///
/// # Arguments
/// * `client` - The OpenFIGI client instance.
/// * `cache_dir` - Directory to save the cached JSON file.
///
/// # Returns
/// Ok(()) on success.
///
/// # Errors
/// Returns an error if the API request fails or file I/O fails.
async fn get_figi_security_type(client: &OpenFigiClient, cache_dir: &Path) -> anyhow::Result<()> {
let cache_file = cache_dir.join("securityType.json");
if should_use_cache(&cache_file).await? {
logger::log_info(" Using cached securityType values").await;
return Ok(());
}
logger::log_info(" Fetching securityType values from OpenFIGI API...").await;
let resp = client.client
.get("https://api.openfigi.com/v3/mapping/values/securityType")
.send()
.await
.context("Failed to fetch securityType values")?;
handle_rate_limit(&resp).await?;
let values: Value = resp.json().await
.context("Failed to parse securityType response")?;
let json_str = serde_json::to_string_pretty(&values)?;
tokio_fs::write(&cache_file, json_str).await
.context("Failed to write securityType cache")?;
logger::log_info(" ✓ Cached securityType values").await;
sleep(Duration::from_millis(if client.has_key { 240 } else { 2400 })).await;
Ok(())
}
/// Loads all OpenFIGI mapping value lists (marketSecDes, micCode, securityType).
///
/// This function fetches the available values for each mapping parameter from the OpenFIGI API
/// and caches them as JSON files in `data/openfigi/`. If the files already exist and are recent
/// (less than 30 days old), they are reused instead of re-fetching.
///
/// # Returns
/// Ok(()) on success.
///
/// # Errors
/// Returns an error if API requests fail, JSON parsing fails, or file I/O fails.
pub async fn load_figi_type_lists(paths: &DataPaths) -> anyhow::Result<()> {
logger::log_info("Loading OpenFIGI mapping value lists...").await;
let cache_openfigi_dir = paths.cache_openfigi_dir();
tokio_fs::create_dir_all(cache_openfigi_dir).await
.context("Failed to create data/openfigi directory")?;
let client = OpenFigiClient::new().await?;
// Fetch each type list
get_figi_market_sec_des(&client, cache_openfigi_dir).await?;
get_figi_mic_code(&client, cache_openfigi_dir).await?;
get_figi_security_type(&client, cache_openfigi_dir).await?;
logger::log_info("OpenFIGI mapping value lists loaded successfully").await;
Ok(())
}
/// Fetches and caches the list of valid marketSecDes values.
///
/// # Arguments
/// * `client` - The OpenFIGI client instance.
/// * `cache_dir` - Directory to save the cached JSON file.
///
/// # Returns
/// Ok(()) on success.
///
/// # Errors
/// Returns an error if the API request fails or file I/O fails.
async fn get_figi_market_sec_des(client: &OpenFigiClient, cache_dir: &Path) -> anyhow::Result<()> {
let cache_file = cache_dir.join("marketSecDes.json");
// Check if cache exists and is recent (< 30 days old)
if should_use_cache(&cache_file).await? {
logger::log_info(" Using cached marketSecDes values").await;
return Ok(());
}
logger::log_info(" Fetching marketSecDes values from OpenFIGI API...").await;
let resp = client.client
.get("https://api.openfigi.com/v3/mapping/values/marketSecDes")
.send()
.await
.context("Failed to fetch marketSecDes values")?;
handle_rate_limit(&resp).await?;
let values: Value = resp.json().await
.context("Failed to parse marketSecDes response")?;
// Save to cache
let json_str = serde_json::to_string_pretty(&values)?;
tokio_fs::write(&cache_file, json_str).await
.context("Failed to write marketSecDes cache")?;
logger::log_info(" ✓ Cached marketSecDes values").await;
// Respect rate limits
sleep(Duration::from_millis(if client.has_key { 240 } else { 2400 })).await;
Ok(())
}
/// Fetches and caches the list of valid micCode values.
///
/// # Arguments
/// * `client` - The OpenFIGI client instance.
/// * `cache_dir` - Directory to save the cached JSON file.
///
/// # Returns
/// Ok(()) on success.
///
/// # Errors
/// Returns an error if the API request fails or file I/O fails.
async fn get_figi_mic_code(client: &OpenFigiClient, cache_dir: &Path) -> anyhow::Result<()> {
let cache_file = cache_dir.join("micCode.json");
if should_use_cache(&cache_file).await? {
logger::log_info(" Using cached micCode values").await;
return Ok(());
}
logger::log_info(" Fetching micCode values from OpenFIGI API...").await;
let resp = client.client
.get("https://api.openfigi.com/v3/mapping/values/micCode")
.send()
.await
.context("Failed to fetch micCode values")?;
handle_rate_limit(&resp).await?;
let values: Value = resp.json().await
.context("Failed to parse micCode response")?;
let json_str = serde_json::to_string_pretty(&values)?;
tokio_fs::write(&cache_file, json_str).await
.context("Failed to write micCode cache")?;
logger::log_info(" ✓ Cached micCode values").await;
sleep(Duration::from_millis(if client.has_key { 240 } else { 2400 })).await;
Ok(())
}
/// Handles rate limit responses from the OpenFIGI API.
///
/// If a 429 status is received, this function sleeps for the duration specified
/// in the `ratelimit-reset` header (or 10 seconds by default).
///
/// # Arguments
/// * `resp` - The HTTP response to check.
///
/// # Returns
/// Ok(()) if no rate limit, or after waiting for the reset period.
///
/// # Errors
/// Returns an error if the response status indicates a non-rate-limit error.
async fn handle_rate_limit(resp: &reqwest::Response) -> anyhow::Result<()> {
let status = resp.status();
if status == 429 {
let headers = resp.headers();
let reset_sec = headers
.get("ratelimit-reset")
.and_then(|v| v.to_str().ok())
.and_then(|s| s.parse::<u64>().ok())
.unwrap_or(10);
logger::log_info(&format!(" Rate limited—waiting {}s", reset_sec)).await;
sleep(std::time::Duration::from_secs(reset_sec.max(10))).await;
return Err(anyhow!("Rate limited, please retry"));
} else if status.is_client_error() || status.is_server_error() {
return Err(anyhow!("OpenFIGI API error: {}", status));
}
Ok(())
}
/// Checks if a cache file exists and is less than 30 days old.
///
/// # Arguments
/// * `path` - Path to the cache file.
///
/// # Returns
/// True if the cache should be used, false if it needs refreshing.
async fn should_use_cache(path: &Path) -> anyhow::Result<bool> {
if !path.exists() {
return Ok(false);
}
let metadata = tokio_fs::metadata(path).await?;
let modified = metadata.modified()?;
let age = modified.elapsed().unwrap_or(std::time::Duration::from_secs(u64::MAX));
// Cache is valid for 30 days
Ok(age < std::time::Duration::from_secs(30 * 24 * 60 * 60))
}

View File

@@ -1,5 +1,9 @@
// src/scraper/webdriver.rs // src/scraper/webdriver.rs
use super::helpers::*; use super::helpers::*;
use super::hard_reset::HardResetController;
use super::docker_vpn_proxy::DockerVpnProxyPool;
use crate::Config;
use crate::logger;
use anyhow::{anyhow, Context, Result}; use anyhow::{anyhow, Context, Result};
use fantoccini::{Client, ClientBuilder}; use fantoccini::{Client, ClientBuilder};
@@ -13,8 +17,6 @@ use tokio::process::{Child, Command};
use tokio::task::JoinHandle; use tokio::task::JoinHandle;
use tokio::sync::{Mutex, Semaphore}; use tokio::sync::{Mutex, Semaphore};
use tokio::time::{sleep, timeout, Duration}; use tokio::time::{sleep, timeout, Duration};
use crate::scraper::docker_vpn_proxy::{DockerVpnProxyPool};
use crate::Config;
/// Manages a pool of ChromeDriver instances for parallel scraping with optional VPN binding. /// Manages a pool of ChromeDriver instances for parallel scraping with optional VPN binding.
pub struct ChromeDriverPool { pub struct ChromeDriverPool {
@@ -31,10 +33,16 @@ pub struct ChromeDriverPool {
min_request_interval_ms: u64, min_request_interval_ms: u64,
monitoring: Option<crate::monitoring::MonitoringHandle>, monitoring: Option<crate::monitoring::MonitoringHandle>,
hard_reset_controller: Arc<HardResetController>,
config: Arc<Config>,
} }
impl ChromeDriverPool { impl ChromeDriverPool {
/// Creates a new pool without any proxy (direct connection). /// When consecutive errors reach this value, execute() will return a special error
/// that signals the caller to trigger a hard reset
const HARD_RESET_ERROR_THRESHOLD: usize = 12;
/// Creates a new pool without any proxy (direct connection).
pub async fn _new(config: &Config, monitoring: Option<crate::monitoring::MonitoringHandle>,) -> Result<Self> { pub async fn _new(config: &Config, monitoring: Option<crate::monitoring::MonitoringHandle>,) -> Result<Self> {
Self::new_with_proxy_and_task_limit(None, config, monitoring).await Self::new_with_proxy_and_task_limit(None, config, monitoring).await
} }
@@ -85,6 +93,11 @@ impl ChromeDriverPool {
// Rotation is enabled when task limiting is active // Rotation is enabled when task limiting is active
let rotation_enabled = task_per_instance_limit > 0; let rotation_enabled = task_per_instance_limit > 0;
let half_size = if rotation_enabled {
(actual_pool_size + 1) / 2 // Round up for odd numbers
} else {
actual_pool_size
};
let mut instances = Vec::with_capacity(actual_pool_size); let mut instances = Vec::with_capacity(actual_pool_size);
@@ -105,8 +118,8 @@ impl ChromeDriverPool {
for i in 0..actual_pool_size { for i in 0..actual_pool_size {
// Pass the entire proxy_pool and the index // Pass the entire proxy_pool and the index
let instance = ChromeInstance::new( let instance = ChromeInstance::new(
proxy_pool.clone(), // Clone the Arc proxy_pool.clone(),
i, // This instance's proxy index i,
config, config,
monitoring.clone(), monitoring.clone(),
).await?; ).await?;
@@ -144,7 +157,7 @@ impl ChromeDriverPool {
mon.emit(crate::monitoring::MonitoringEvent::InstanceCreated { mon.emit(crate::monitoring::MonitoringEvent::InstanceCreated {
instance_id: i, instance_id: i,
max_tasks: guard.max_tasks_per_instance, max_tasks: guard.max_tasks_per_instance,
proxy: proxy_info.clone(), // ✅ Now includes actual proxy info proxy: proxy_info.clone(),
}); });
// Also emit ProxyConnected event if proxy exists // Also emit ProxyConnected event if proxy exists
@@ -162,15 +175,21 @@ impl ChromeDriverPool {
let min_request_interval_ms = config.min_request_interval_ms; let min_request_interval_ms = config.min_request_interval_ms;
let hard_reset_controller = Arc::new(HardResetController::new());
let config_clone = Arc::new(config.clone());
Ok(Self { Ok(Self {
instances, instances,
semaphore: Arc::new(Semaphore::new(actual_pool_size)), semaphore: Arc::new(Semaphore::new(half_size)),
proxy_pool, proxy_pool,
rotation_enabled, rotation_enabled,
next_instance: Arc::new(Mutex::new(0)), next_instance: Arc::new(Mutex::new(0)),
last_request_time: Arc::new(Mutex::new(Instant::now())), last_request_time: Arc::new(Mutex::new(Instant::now())),
min_request_interval_ms, min_request_interval_ms,
monitoring, monitoring,
hard_reset_controller,
config: config_clone,
}) })
} }
@@ -188,10 +207,8 @@ impl ChromeDriverPool {
if elapsed < self.min_request_interval_ms { if elapsed < self.min_request_interval_ms {
let wait_ms = self.min_request_interval_ms - elapsed; let wait_ms = self.min_request_interval_ms - elapsed;
drop(last_time); // Lock vor Sleep freigeben! drop(last_time);
sleep(Duration::from_millis(wait_ms)).await; sleep(Duration::from_millis(wait_ms)).await;
let mut last_time = self.last_request_time.lock().await; let mut last_time = self.last_request_time.lock().await;
*last_time = Instant::now(); *last_time = Instant::now();
} else { } else {
@@ -199,12 +216,20 @@ impl ChromeDriverPool {
} }
} }
let random_index = random_range(0, self.instances.len() as u64) as usize; let instance = if self.rotation_enabled {
// Index-Auswahl (vereinfacht, siehe unten für vollständige Rotation) self.select_instance_with_rotation().await?
let index = if self.rotation_enabled {
self.get_rotated_index().await?
} else { } else {
random_index self.select_instance_round_robin().await
};
{
let mut inst = instance.lock().await;
inst.increment_task_count();
}
let index: usize = {
let instances = &self.instances;
instances.iter().position(|inst| Arc::ptr_eq(inst, &instance)).unwrap_or(0)
}; };
if let Some(ref mon) = self.monitoring { if let Some(ref mon) = self.monitoring {
@@ -216,15 +241,10 @@ impl ChromeDriverPool {
instance_id: index, instance_id: index,
status: crate::monitoring::InstanceStatusChange::Active, status: crate::monitoring::InstanceStatusChange::Active,
}); });
} };
let instance = &self.instances[index];
let mut guard = instance.lock().await; let mut guard = instance.lock().await;
// NEU: Session mit automatischer Erneuerung holen!
let client = guard.get_or_renew_session().await?; let client = guard.get_or_renew_session().await?;
guard.increment_task_count();
let (task_count, session_requests) = guard.get_session_stats().await; let (task_count, session_requests) = guard.get_session_stats().await;
crate::util::logger::log_info(&format!( crate::util::logger::log_info(&format!(
@@ -232,17 +252,17 @@ impl ChromeDriverPool {
index, task_count, guard.max_tasks_per_instance, session_requests index, task_count, guard.max_tasks_per_instance, session_requests
)).await; )).await;
drop(guard); // Lock freigeben vor Navigation drop(guard);
let start_time = Instant::now(); let start_time = Instant::now();
// Navigation mit Timeout // Navigation with timeout
let navigation_result = timeout( let navigation_result = timeout(
Duration::from_secs(60), Duration::from_secs(60),
client.goto(&url) client.goto(&url)
).await; ).await;
match navigation_result { let result = match navigation_result {
Ok(Ok(_)) => { Ok(Ok(_)) => {
if let Some(ref mon) = self.monitoring { if let Some(ref mon) = self.monitoring {
mon.emit(crate::monitoring::MonitoringEvent::TaskCompleted { mon.emit(crate::monitoring::MonitoringEvent::TaskCompleted {
@@ -258,14 +278,111 @@ impl ChromeDriverPool {
} }
crate::util::logger::log_info(&format!("✓ Navigated to {}", url)).await; crate::util::logger::log_info(&format!("✓ Navigated to {}", url)).await;
// Parse-Funktion ausführen // Execute parse function
parse(client).await match parse(client).await {
Ok(data) => {
// SUCCESS: Record and log
let prev_count = self.hard_reset_controller.get_count();
self.hard_reset_controller.record_success();
if prev_count > 0 {
logger::log_info(&format!(
"✓ Success - reset counter cleared (was: {}/{})",
prev_count,
Self::HARD_RESET_ERROR_THRESHOLD
)).await;
}
Ok(data)
}
Err(e) => {
// PARSE ERROR: Record, check threshold, invalidate session
let error_count = self.hard_reset_controller.record_error();
{
let mut inst = instance.lock().await;
inst.invalidate_current_session().await;
}
// Enhanced logging with threshold status
let threshold_pct = (error_count as f64 / Self::HARD_RESET_ERROR_THRESHOLD as f64) * 100.0;
logger::log_warn(&format!(
"Parse error. Reset counter: {}/{} ({:.0}%)",
error_count,
Self::HARD_RESET_ERROR_THRESHOLD,
threshold_pct
)).await;
// Check if threshold reached
if error_count >= Self::HARD_RESET_ERROR_THRESHOLD {
logger::log_error(&format!(
"🔴 HARD RESET THRESHOLD REACHED ({}/{})",
error_count,
Self::HARD_RESET_ERROR_THRESHOLD
)).await;
return Err(anyhow!(
"HARD_RESET_REQUIRED: Parse failed: {}. Threshold reached ({}/{})",
e,
error_count,
Self::HARD_RESET_ERROR_THRESHOLD
));
}
Err(anyhow!(
"Parse failed: {}. Hard reset at {}/{}",
e,
error_count,
Self::HARD_RESET_ERROR_THRESHOLD
))
}
}
} }
Ok(Err(e)) => { Ok(Err(e)) => {
// ❌ NAVIGATION ERROR: Record, check threshold, invalidate session
crate::util::logger::log_error(&format!("Navigation failed: {}", e)).await; crate::util::logger::log_error(&format!("Navigation failed: {}", e)).await;
Err(anyhow!("Navigation failed: {}", e))
{
let mut inst = instance.lock().await;
inst.invalidate_current_session().await;
}
let error_count = self.hard_reset_controller.record_error();
// Enhanced logging
let threshold_pct = (error_count as f64 / Self::HARD_RESET_ERROR_THRESHOLD as f64) * 100.0;
logger::log_warn(&format!(
"Navigation error. Reset counter: {}/{} ({:.0}%)",
error_count,
Self::HARD_RESET_ERROR_THRESHOLD,
threshold_pct
)).await;
// Check if threshold reached
if error_count >= Self::HARD_RESET_ERROR_THRESHOLD {
logger::log_error(&format!(
"🔴 HARD RESET THRESHOLD REACHED ({}/{})",
error_count,
Self::HARD_RESET_ERROR_THRESHOLD
)).await;
return Err(anyhow!(
"HARD_RESET_REQUIRED: Navigation failed: {}. Threshold reached ({}/{})",
e,
error_count,
Self::HARD_RESET_ERROR_THRESHOLD
));
}
Err(anyhow!(
"Navigation failed: {}. Hard reset at {}/{}",
e,
error_count,
Self::HARD_RESET_ERROR_THRESHOLD
))
} }
Err(_) => { Err(_) => {
// ❌ TIMEOUT ERROR: Record, check threshold, invalidate session
if let Some(ref mon) = self.monitoring { if let Some(ref mon) = self.monitoring {
mon.emit(crate::monitoring::MonitoringEvent::NavigationTimeout { mon.emit(crate::monitoring::MonitoringEvent::NavigationTimeout {
instance_id: index, instance_id: index,
@@ -273,69 +390,178 @@ impl ChromeDriverPool {
}); });
} }
let error_count = self.hard_reset_controller.record_error();
crate::util::logger::log_error("Navigation timeout (60s)").await; crate::util::logger::log_error("Navigation timeout (60s)").await;
Err(anyhow!("Navigation timeout"))
{
let mut inst = instance.lock().await;
inst.invalidate_current_session().await;
}
// Enhanced logging
let threshold_pct = (error_count as f64 / Self::HARD_RESET_ERROR_THRESHOLD as f64) * 100.0;
logger::log_warn(&format!(
"Timeout error. Reset counter: {}/{} ({:.0}%)",
error_count,
Self::HARD_RESET_ERROR_THRESHOLD,
threshold_pct
)).await;
// Check if threshold reached
if error_count >= Self::HARD_RESET_ERROR_THRESHOLD {
logger::log_error(&format!(
"🔴 HARD RESET THRESHOLD REACHED ({}/{})",
error_count,
Self::HARD_RESET_ERROR_THRESHOLD
)).await;
return Err(anyhow!(
"HARD_RESET_REQUIRED: Navigation timeout. Threshold reached ({}/{})",
error_count,
Self::HARD_RESET_ERROR_THRESHOLD
));
}
Err(anyhow!(
"Navigation timeout. Hard reset at {}/{}",
error_count,
Self::HARD_RESET_ERROR_THRESHOLD
))
} }
};
{
let mut inst = instance.lock().await;
inst.task_count = inst.task_count.saturating_sub(1);
} }
result
} }
async fn get_rotated_index(&self) -> Result<usize> { /// Simple round-robin instance selection (no rotation)
let total = self.instances.len(); async fn select_instance_round_robin(&self) -> Arc<Mutex<ChromeInstance>> {
let half_size = total / 2; let mut next = self.next_instance.lock().await;
let index = *next;
*next = (*next + 1) % self.instances.len();
drop(next);
Arc::clone(&self.instances[index])
}
/// Round-robin with half-pool rotation
async fn select_instance_with_rotation(&self) -> Result<Arc<Mutex<ChromeInstance>>> {
let pool_size = self.instances.len();
let half_size = pool_size / 2;
if half_size == 0 { if half_size == 0 {
return Ok(0); // Pool zu klein für Rotation // Pool too small for rotation, fall back to simple round-robin
return Ok(self.select_instance_round_robin().await);
} }
let mut next_idx = self.next_instance.lock().await; let mut next = self.next_instance.lock().await;
let current_half_start = if *next_idx < half_size { 0 } else { half_size }; let current_half_start = (*next / half_size) * half_size;
let current_half_end = if *next_idx < half_size { half_size } else { total }; let current_half_end = (current_half_start + half_size).min(pool_size);
// Suche verfügbare Instanz in aktueller Hälfte // Try to find available instance in current half
for offset in 0..(current_half_end - current_half_start) { let mut attempts = 0;
let candidate_idx = current_half_start + ((*next_idx + offset) % half_size); let max_attempts = half_size * 2; // Try both halves
let instance = &self.instances[candidate_idx]; while attempts < max_attempts {
let guard = instance.lock().await; let index = current_half_start + (*next % half_size);
let instance = &self.instances[index];
if guard.max_tasks_per_instance == 0 || // Check if instance can accept more tasks
guard.task_count < guard.max_tasks_per_instance { let mut inst = instance.lock().await;
*next_idx = (candidate_idx + 1) % total; let can_accept = inst.get_task_count() < inst.max_tasks_per_instance;
drop(guard); drop(inst);
return Ok(candidate_idx);
if can_accept {
*next = (*next + 1) % pool_size;
drop(next);
if let Some(ref mon) = self.monitoring {
mon.emit(crate::monitoring::MonitoringEvent::InstanceSelected {
instance_id: index,
half: if index < half_size { 1 } else { 2 },
});
}
return Ok(Arc::clone(instance));
} }
// Current half saturated, try other half
if attempts == half_size - 1 {
logger::log_info("Current half saturated, rotating to other half").await;
*next = if current_half_start == 0 { half_size } else { 0 };
} else {
*next = (*next + 1) % pool_size;
}
attempts += 1;
} }
// Aktuelle Hälfte voll → Zur anderen wechseln drop(next);
crate::util::logger::log_info("Current half saturated, rotating to other half").await;
let new_half_start = if current_half_start == 0 { half_size } else { 0 }; // All instances saturated
let new_half_end = if current_half_start == 0 { total } else { half_size }; Err(anyhow!("All instances at task capacity"))
}
// Alte Hälfte zurücksetzen (für nächste Rotation) pub fn get_reset_controller(&self) -> Arc<HardResetController> {
for i in current_half_start..current_half_end { Arc::clone(&self.hard_reset_controller)
let mut instance = self.instances[i].lock().await; }
instance.reset_task_count();
}
*next_idx = new_half_start; /// Check if hard reset threshold has been reached
drop(next_idx); pub fn should_perform_hard_reset(&self) -> bool {
self.hard_reset_controller.get_count() >= Self::HARD_RESET_ERROR_THRESHOLD
}
Ok(new_half_start) /// Get current error count and threshold for monitoring
pub fn get_reset_status(&self) -> (usize, usize) {
(
self.hard_reset_controller.get_count(),
Self::HARD_RESET_ERROR_THRESHOLD
)
} }
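Hedged sketch of the caller side of this contract: the pool only embeds the `HARD_RESET_REQUIRED` marker in the error text and exposes `get_reset_status()` / `should_perform_hard_reset()`; the `scrape` closure and the rebuild step below are illustrative placeholders, not code from this diff.
async fn run_one<T, F, Fut>(pool: &ChromeDriverPool, scrape: F) -> anyhow::Result<Option<T>>
where
    F: FnOnce() -> Fut,
    Fut: std::future::Future<Output = anyhow::Result<T>>,
{
    match scrape().await {
        Ok(value) => Ok(Some(value)),
        // The pool prefixes the error message once the consecutive-error threshold is hit
        Err(e) if e.to_string().starts_with("HARD_RESET_REQUIRED") => {
            let (count, threshold) = pool.get_reset_status();
            crate::util::logger::log_error(&format!("Hard reset required ({}/{})", count, threshold)).await;
            // The caller would tear the pool down and rebuild it here (omitted)
            Ok(None)
        }
        Err(e) => Err(e),
    }
}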
/// Gracefully shut down all ChromeDriver processes and Docker proxy containers. /// Gracefully shut down all ChromeDriver processes and Docker proxy containers.
/// ✅ FIXED: Now with proper error propagation and Chrome process cleanup
pub async fn shutdown(&self) -> Result<()> { pub async fn shutdown(&self) -> Result<()> {
for inst in &self.instances { logger::log_info(&format!("Shutting down {} ChromeDriver instances...", self.instances.len())).await;
let mut shutdown_errors = Vec::new();
for (i, inst) in self.instances.iter().enumerate() {
logger::log_info(&format!(" Shutting down instance {}...", i)).await;
let mut guard = inst.lock().await; let mut guard = inst.lock().await;
guard.shutdown().await?; if let Err(e) = guard.shutdown().await {
logger::log_error(&format!(" ✗ Instance {} shutdown error: {}", i, e)).await;
shutdown_errors.push(format!("Instance {}: {}", i, e));
} else {
logger::log_info(&format!(" ✓ Instance {} shut down", i)).await;
}
} }
if let Some(pp) = &self.proxy_pool { if let Some(pp) = &self.proxy_pool {
pp.shutdown().await?; logger::log_info("Shutting down proxy pool...").await;
crate::util::logger::log_info("All Docker VPN proxy containers stopped").await; if let Err(e) = pp.shutdown().await {
logger::log_error(&format!("Proxy pool shutdown error: {}", e)).await;
shutdown_errors.push(format!("Proxy pool: {}", e));
} else {
logger::log_info("✓ Proxy pool shut down").await;
}
} }
if !shutdown_errors.is_empty() {
return Err(anyhow!(
"Pool shutdown completed with {} error(s): {}",
shutdown_errors.len(),
shutdown_errors.join("; ")
));
}
logger::log_info("✓ All ChromeDriver instances shut down successfully").await;
Ok(()) Ok(())
} }
@@ -356,6 +582,9 @@ impl ChromeDriverPool {
self.instances.len() self.instances.len()
} }
} }
pub fn get_proxy_pool(&self) -> Option<Arc<DockerVpnProxyPool>> {
self.proxy_pool.clone()
}
} }
/// Represents a single instance of chromedriver process, optionally bound to a VPN. /// Represents a single instance of chromedriver process, optionally bound to a VPN.
@@ -369,9 +598,9 @@ pub struct ChromeInstance {
current_session: Arc<Mutex<Option<Client>>>, // Current active session current_session: Arc<Mutex<Option<Client>>>, // Current active session
session_request_count: Arc<Mutex<usize>>, session_request_count: Arc<Mutex<usize>>,
max_requests_per_session: usize, // z.B. 25 max_requests_per_session: usize,
proxy_pool: Option<Arc<DockerVpnProxyPool>>, // Referernce to the proxy pool proxy_pool: Option<Arc<DockerVpnProxyPool>>, // Reference to the proxy pool
current_proxy_index: Arc<Mutex<usize>>, // Current proxy index in use current_proxy_index: Arc<Mutex<usize>>, // Current proxy index in use
instance_id: usize, instance_id: usize,
@@ -408,15 +637,13 @@ impl ChromeInstance {
}) })
} }
pub async fn get_or_renew_session(&self) -> Result<Client> { pub async fn get_or_renew_session(&mut self) -> Result<Client> {
let mut session_opt = self.current_session.lock().await; let mut session_opt = self.current_session.lock().await;
let mut request_count = self.session_request_count.lock().await; let mut request_count = self.session_request_count.lock().await;
let old_request_count = *request_count; // Session renewal conditions:
// 1. No session exists
// Session erneuern wenn: // 2. Request limit reached
// 1. Keine Session vorhanden
// 2. Request-Limit erreicht
let needs_renewal = session_opt.is_none() || *request_count >= self.max_requests_per_session; let needs_renewal = session_opt.is_none() || *request_count >= self.max_requests_per_session;
if needs_renewal { if needs_renewal {
@@ -427,16 +654,22 @@ impl ChromeInstance {
}); });
} }
// Alte Session schließen // ✅ FIXED: Close old session with proper error handling
if let Some(old_session) = session_opt.take() { if let Some(old_session) = session_opt.take() {
crate::util::logger::log_info("Closing old session").await; crate::util::logger::log_info("Closing old session").await;
let _ = old_session.close().await;
// Kurze Pause zwischen Sessions // Try to close gracefully first
if let Err(e) = old_session.close().await {
logger::log_warn(&format!("Session close failed (may leave Chrome tabs open): {}", e)).await;
// Continue anyway - we'll force-kill if needed
}
// Brief pause between sessions
let random_delay = random_range(500, 1000); let random_delay = random_range(500, 1000);
sleep(Duration::from_millis(random_delay)).await; sleep(Duration::from_millis(random_delay)).await;
} }
// Neue Session mit frischem User-Agent erstellen // Create new session with fresh User-Agent
crate::util::logger::log_info(&format!( crate::util::logger::log_info(&format!(
"Creating new session (requests in last session: {})", "Creating new session (requests in last session: {})",
*request_count *request_count
@@ -476,29 +709,35 @@ impl ChromeInstance {
mon.emit(crate::monitoring::MonitoringEvent::SessionRenewed { mon.emit(crate::monitoring::MonitoringEvent::SessionRenewed {
instance_id: self.instance_id, instance_id: self.instance_id,
old_request_count: *request_count, old_request_count: *request_count,
reason: crate::monitoring::RenewalReason::RequestLimit, reason: reason,
new_proxy: new_proxy_info, new_proxy: new_proxy_info,
}); });
} }
Ok(new_session) Ok(new_session)
} else { } else {
// Existierende Session verwenden // Use existing session
*request_count += 1; *request_count += 1;
Ok(session_opt.as_ref().unwrap().clone()) Ok(session_opt.as_ref().unwrap().clone())
} }
} }
async fn create_fresh_session(&self) -> Result<Client> { async fn create_fresh_session(&self) -> Result<Client> {
// Hole aktuellen Proxy-URL ohne self zu mutieren
let proxy_url = if let Some(ref pool) = self.proxy_pool { let proxy_url = if let Some(ref pool) = self.proxy_pool {
let mut proxy_idx = self.current_proxy_index.lock().await; let mut proxy_idx = self.current_proxy_index.lock().await;
*proxy_idx = (*proxy_idx + 1) % pool.num_proxies(); let num_proxies = pool.num_proxies();
let url = pool.get_proxy_url(*proxy_idx);
crate::util::logger::log_info(&format!( // Round-robin through all proxies
"Using proxy {} for new session", let selected_proxy = *proxy_idx % num_proxies;
*proxy_idx *proxy_idx = (*proxy_idx + 1) % num_proxies;
let url = pool.get_proxy_url(selected_proxy);
logger::log_info(&format!(
"Instance {} creating session with proxy {}/{} (rotation)",
self.instance_id,
selected_proxy,
num_proxies
)).await; )).await;
Some(url) Some(url)
@@ -509,45 +748,39 @@ impl ChromeInstance {
let user_agent = Self::chrome_user_agent(); let user_agent = Self::chrome_user_agent();
let capabilities = self.chrome_args_with_ua(user_agent, &proxy_url); let capabilities = self.chrome_args_with_ua(user_agent, &proxy_url);
ClientBuilder::native() let client = ClientBuilder::native()
.capabilities(capabilities) .capabilities(capabilities)
.connect(&self.base_url) .connect(&self.base_url)
.await .await
.context("Failed to connect to ChromeDriver") .context("Failed to connect to ChromeDriver")?;
// ✅ NEW: Extract and store Chrome PID for cleanup
// Chrome process info can be extracted from session info if needed
// For now, we rely on killing the process tree
Ok(client)
} }
fn chrome_args_with_ua(&self, user_agent: &str, proxy_url: &Option<String>) -> Map<String, Value> { pub async fn invalidate_current_session(&self) {
let mut args = vec![ let mut session_opt = self.current_session.lock().await;
"--headless=new".to_string(),
"--disable-gpu".to_string(),
"--no-sandbox".to_string(),
"--disable-dev-shm-usage".to_string(),
"--disable-infobars".to_string(),
"--disable-extensions".to_string(),
"--disable-popup-blocking".to_string(),
"--disable-notifications".to_string(),
"--disable-autofill".to_string(),
"--disable-sync".to_string(),
"--disable-default-apps".to_string(),
"--disable-translate".to_string(),
"--disable-blink-features=AutomationControlled".to_string(),
format!("--user-agent={}", user_agent),
];
if let Some(proxy) = proxy_url { if let Some(old_session) = session_opt.take() {
args.push(format!("--proxy-server={}", proxy)); crate::util::logger::log_info(&format!(
"Invalidating broken session for instance {}",
self.instance_id
)).await;
// ✅ FIXED: Proper error handling instead of silent failure
if let Err(e) = old_session.close().await {
logger::log_warn(&format!(
"Failed to close broken session (Chrome tabs may remain): {}",
e
)).await;
}
} }
let caps = serde_json::json!({ let mut request_count = self.session_request_count.lock().await;
"goog:chromeOptions": { *request_count = 0;
"args": args,
"excludeSwitches": ["enable-logging", "enable-automation"],
"prefs": {
"profile.default_content_setting_values.notifications": 2
}
}
});
caps.as_object().cloned().unwrap()
} }
pub fn reset_task_count(&mut self) { pub fn reset_task_count(&mut self) {
@@ -567,17 +800,103 @@ impl ChromeInstance {
self.task_count self.task_count
} }
/// ✅ FIXED: Proper Chrome + ChromeDriver shutdown with process tree killing
pub async fn shutdown(&mut self) -> Result<()> { pub async fn shutdown(&mut self) -> Result<()> {
logger::log_info(&format!("Shutting down ChromeInstance {}...", self.instance_id)).await;
// Step 1: Close any active session to signal Chrome to close
{
let mut session_opt = self.current_session.lock().await;
if let Some(session) = session_opt.take() {
logger::log_info(" Closing active session...").await;
if let Err(e) = session.close().await {
logger::log_warn(&format!(" Session close failed: {}", e)).await;
}
}
}
// Step 2: Abort stderr logging task
if let Some(handle) = self.stderr_log.take() { if let Some(handle) = self.stderr_log.take() {
handle.abort(); handle.abort();
let _ = handle.await; let _ = handle.await;
} }
let _ = self.process.start_kill(); // Step 3: Get ChromeDriver PID before killing
let _ = self.process.wait().await; let chromedriver_pid = self.process.id();
logger::log_info(&format!(" ChromeDriver PID: {:?}", chromedriver_pid)).await;
// Step 4: Kill ChromeDriver and wait
if let Err(e) = self.process.start_kill() {
logger::log_warn(&format!(" Failed to kill ChromeDriver: {}", e)).await;
}
// Wait for ChromeDriver to exit (with timeout)
match timeout(Duration::from_secs(5), self.process.wait()).await {
Ok(Ok(status)) => {
logger::log_info(&format!(" ChromeDriver exited with status: {:?}", status)).await;
}
Ok(Err(e)) => {
logger::log_warn(&format!(" Error waiting for ChromeDriver: {}", e)).await;
}
Err(_) => {
logger::log_warn(" ChromeDriver didn't exit within 5s").await;
}
}
// Step 5: ✅ CRITICAL FIX: Force-kill Chrome process tree
// On Windows, Chrome doesn't die when ChromeDriver dies
if let Some(pid) = chromedriver_pid {
logger::log_info(&format!(" Force-killing Chrome process tree for PID {}...", pid)).await;
#[cfg(target_os = "windows")]
{
// Kill entire process tree on Windows
let _ = Command::new("taskkill")
.args(["/F", "/T", "/PID", &pid.to_string()])
.output()
.await;
// Also kill any remaining chrome.exe processes
let _ = Command::new("taskkill")
.args(["/F", "/IM", "chrome.exe"])
.output()
.await;
}
#[cfg(not(target_os = "windows"))]
{
// Kill process group on Unix
let _ = Command::new("pkill")
.args(["-P", &pid.to_string()])
.output()
.await;
}
logger::log_info(" ✓ Chrome process tree killed").await;
}
// Step 6: Wait a moment for processes to fully terminate
sleep(Duration::from_millis(500)).await;
logger::log_info(&format!("✓ ChromeInstance {} shut down", self.instance_id)).await;
Ok(()) Ok(())
} }
pub fn is_available(&self) -> bool {
if self.max_tasks_per_instance == 0 {
return true; // No limit
}
self.task_count < self.max_tasks_per_instance
}
pub fn tasks_remaining(&self) -> usize {
if self.max_tasks_per_instance == 0 {
return usize::MAX;
}
self.max_tasks_per_instance.saturating_sub(self.task_count)
}
/// Spawns the actual `chromedriver` binary and waits for it to become ready. /// Spawns the actual `chromedriver` binary and waits for it to become ready.
async fn spawn_chromedriver() -> Result<(String, Child, JoinHandle<()>)> { async fn spawn_chromedriver() -> Result<(String, Child, JoinHandle<()>)> {
let mut process = Command::new("chromedriver-win64/chromedriver.exe") let mut process = Command::new("chromedriver-win64/chromedriver.exe")
@@ -624,6 +943,40 @@ impl ChromeInstance {
Err(anyhow!("ChromeDriver failed to start within 30s")) Err(anyhow!("ChromeDriver failed to start within 30s"))
} }
fn chrome_args_with_ua(&self, user_agent: &str, proxy_url: &Option<String>) -> Map<String, Value> {
let mut args = vec![
"--headless=new".to_string(),
"--disable-gpu".to_string(),
"--no-sandbox".to_string(),
"--disable-dev-shm-usage".to_string(),
"--disable-infobars".to_string(),
"--disable-extensions".to_string(),
"--disable-popup-blocking".to_string(),
"--disable-notifications".to_string(),
"--disable-autofill".to_string(),
"--disable-sync".to_string(),
"--disable-default-apps".to_string(),
"--disable-translate".to_string(),
"--disable-blink-features=AutomationControlled".to_string(),
format!("--user-agent={}", user_agent),
];
if let Some(proxy) = proxy_url {
args.push(format!("--proxy-server={}", proxy));
}
let caps = serde_json::json!({
"goog:chromeOptions": {
"args": args,
"excludeSwitches": ["enable-logging", "enable-automation"],
"prefs": {
"profile.default_content_setting_values.notifications": 2
}
}
});
caps.as_object().cloned().unwrap()
}
pub fn chrome_user_agent() -> &'static str { pub fn chrome_user_agent() -> &'static str {
static UAS: &[&str] = &[ static UAS: &[&str] = &[
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.91 Safari/537.36", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.91 Safari/537.36",
@@ -636,6 +989,24 @@ impl ChromeInstance {
} }
} }
impl Drop for ChromeInstance {
fn drop(&mut self) {
// Signal both ChromeDriver and Chrome to terminate
let _ = self.process.start_kill();
// Also try to kill Chrome if we know the PID
if let Some(pid) = self.process.id() {
#[cfg(target_os = "windows")]
{
// Fire and forget - this is best-effort cleanup
let _ = std::process::Command::new("taskkill")
.args(["/F", "/T", "/PID", &pid.to_string()])
.output();
}
}
}
}
fn parse_chromedriver_address(line: &str) -> Option<String> { fn parse_chromedriver_address(line: &str) -> Option<String> {
if line.contains("Starting ChromeDriver") { if line.contains("Starting ChromeDriver") {
if let Some(port_str) = line.split("on port ").nth(1) { if let Some(port_str) = line.split("on port ").nth(1) {
@@ -656,14 +1027,6 @@ fn parse_chromedriver_address(line: &str) -> Option<String> {
None None
} }
impl Drop for ChromeInstance {
fn drop(&mut self) {
// Signal child to terminate. Do NOT block here; shutdown should be
// performed with the async `shutdown()` method when possible.
let _ = self.process.start_kill();
}
}
/// Simplified task execution - uses the pool pattern. /// Simplified task execution - uses the pool pattern.
pub struct ScrapeTask<T> { pub struct ScrapeTask<T> {
url: String, url: String,

1519
src/scraper/yahoo.rs Normal file

File diff suppressed because it is too large

View File

@@ -2,23 +2,26 @@ use std::path::{Path, PathBuf};
use std::fs; use std::fs;
/// Central configuration for all data paths /// Central configuration for all data paths
#[derive(Clone)]
pub struct DataPaths { pub struct DataPaths {
base_dir: PathBuf, base_dir: PathBuf,
data_dir: PathBuf, data_dir: PathBuf,
cache_dir: PathBuf, cache_dir: PathBuf,
logs_dir: PathBuf, logs_dir: PathBuf,
integrity_dir: PathBuf,
// Cache data subdirectories // Cache data subdirectories
cache_gleif_dir: PathBuf, cache_gleif_dir: PathBuf,
cache_openfigi_dir: PathBuf, cache_openfigi_dir: PathBuf,
cache_gleif_openfigi_map_dir: PathBuf, cache_gleif_openfigi_map_dir: PathBuf,
cache_openvpn_dir: PathBuf, cache_openvpn_dir: PathBuf,
// Figi Securities data subdirectories
figi_securities_dir: PathBuf,
// Economic data subdirectories // Economic data subdirectories
economic_events_dir: PathBuf, economic_events_dir: PathBuf,
economic_changes_dir: PathBuf, economic_changes_dir: PathBuf,
economic_currency_dir: PathBuf,
// Corporate data subdirectories // Corporate data subdirectories
corporate_events_dir: PathBuf, corporate_dir: PathBuf,
corporate_changes_dir: PathBuf,
corporate_prices_dir: PathBuf,
} }
impl DataPaths { impl DataPaths {
@@ -29,6 +32,7 @@ impl DataPaths {
let data_dir = base_dir.join("data"); let data_dir = base_dir.join("data");
let cache_dir = base_dir.join("cache"); let cache_dir = base_dir.join("cache");
let logs_dir = base_dir.join("logs"); let logs_dir = base_dir.join("logs");
let integrity_dir = base_dir.join("integrity");
// Cache subdirectories // Cache subdirectories
let cache_gleif_dir = cache_dir.join("gleif"); let cache_gleif_dir = cache_dir.join("gleif");
@@ -36,44 +40,47 @@ impl DataPaths {
let cache_gleif_openfigi_map_dir = cache_dir.join("glei_openfigi"); let cache_gleif_openfigi_map_dir = cache_dir.join("glei_openfigi");
let cache_openvpn_dir = cache_dir.join("openvpn"); let cache_openvpn_dir = cache_dir.join("openvpn");
// Figi Securities subdirectories
let figi_securities_dir = data_dir.join("figi_securities");
// Economic subdirectories // Economic subdirectories
let economic_events_dir = data_dir.join("economic").join("events"); let economic_events_dir = data_dir.join("economic").join("events");
let economic_changes_dir = economic_events_dir.join("changes"); let economic_changes_dir = economic_events_dir.join("changes");
let economic_currency_dir = data_dir.join("economic").join("currency");
// Corporate subdirectories // Corporate subdirectories
let corporate_dir = data_dir.join("corporate"); let corporate_dir = data_dir.join("corporate");
let corporate_events_dir = corporate_dir.join("events");
let corporate_changes_dir = corporate_events_dir.join("changes");
let corporate_prices_dir = corporate_dir.join("prices");
// Create all directories if they don't exist // Create all directories if they don't exist
fs::create_dir_all(&data_dir)?; fs::create_dir_all(&data_dir)?;
fs::create_dir_all(&cache_dir)?; fs::create_dir_all(&cache_dir)?;
fs::create_dir_all(&logs_dir)?; fs::create_dir_all(&logs_dir)?;
fs::create_dir_all(&integrity_dir)?;
fs::create_dir_all(&cache_gleif_dir)?; fs::create_dir_all(&cache_gleif_dir)?;
fs::create_dir_all(&cache_openfigi_dir)?; fs::create_dir_all(&cache_openfigi_dir)?;
fs::create_dir_all(&cache_gleif_openfigi_map_dir)?; fs::create_dir_all(&cache_gleif_openfigi_map_dir)?;
fs::create_dir_all(&cache_openvpn_dir)?; fs::create_dir_all(&cache_openvpn_dir)?;
fs::create_dir_all(&figi_securities_dir)?;
fs::create_dir_all(&economic_events_dir)?; fs::create_dir_all(&economic_events_dir)?;
fs::create_dir_all(&economic_changes_dir)?; fs::create_dir_all(&economic_changes_dir)?;
fs::create_dir_all(&corporate_events_dir)?; fs::create_dir_all(&economic_currency_dir)?;
fs::create_dir_all(&corporate_changes_dir)?; fs::create_dir_all(&corporate_dir)?;
fs::create_dir_all(&corporate_prices_dir)?;
Ok(Self { Ok(Self {
base_dir, base_dir,
data_dir, data_dir,
cache_dir, cache_dir,
logs_dir, logs_dir,
integrity_dir,
cache_gleif_dir, cache_gleif_dir,
cache_openfigi_dir, cache_openfigi_dir,
cache_gleif_openfigi_map_dir, cache_gleif_openfigi_map_dir,
cache_openvpn_dir, cache_openvpn_dir,
figi_securities_dir,
economic_events_dir, economic_events_dir,
economic_changes_dir, economic_changes_dir,
corporate_events_dir, economic_currency_dir,
corporate_changes_dir, corporate_dir,
corporate_prices_dir,
}) })
} }
@@ -89,6 +96,10 @@ impl DataPaths {
&self.cache_dir &self.cache_dir
} }
pub fn integrity_dir(&self) -> &Path {
&self.integrity_dir
}
pub fn logs_dir(&self) -> &Path { pub fn logs_dir(&self) -> &Path {
&self.logs_dir &self.logs_dir
} }
@@ -109,6 +120,10 @@ impl DataPaths {
&self.cache_openvpn_dir &self.cache_openvpn_dir
} }
pub fn figi_securities_dir(&self) -> &Path {
&self.figi_securities_dir
}
/// Get the economic events directory /// Get the economic events directory
pub fn economic_events_dir(&self) -> &Path { pub fn economic_events_dir(&self) -> &Path {
&self.economic_events_dir &self.economic_events_dir
@@ -119,19 +134,13 @@ impl DataPaths {
&self.economic_changes_dir &self.economic_changes_dir
} }
pub fn economic_currency_dir(&self) -> &Path {
&self.economic_currency_dir
}
/// Get the corporate events directory /// Get the corporate events directory
pub fn corporate_events_dir(&self) -> &Path { pub fn corporate_dir(&self) -> &Path {
&self.corporate_events_dir &self.corporate_dir
}
/// Get the corporate changes directory
pub fn corporate_changes_dir(&self) -> &Path {
&self.corporate_changes_dir
}
/// Get the corporate prices directory
pub fn corporate_prices_dir(&self) -> &Path {
&self.corporate_prices_dir
} }
/// Get a specific file path within data directory /// Get a specific file path within data directory
@@ -162,8 +171,5 @@ mod tests {
assert!(paths.logs_dir().exists()); assert!(paths.logs_dir().exists());
assert!(paths.economic_events_dir().exists()); assert!(paths.economic_events_dir().exists());
assert!(paths.economic_changes_dir().exists()); assert!(paths.economic_changes_dir().exists());
assert!(paths.corporate_events_dir().exists());
assert!(paths.corporate_changes_dir().exists());
assert!(paths.corporate_prices_dir().exists());
} }
} }

911
src/util/integrity.rs Normal file
View File

@@ -0,0 +1,911 @@
// src/util/integrity.rs
//! Content integrity and state lifecycle management module
//!
//! Features:
//! - File and directory hashing (SHA-256)
//! - Hash validation against content references
//! - State invalidation based on time or validation failures
//! - 3-stage data lifecycle: cache → data → storage
//! - Inline vs. external hash storage based on size
//! - Centralized dependency configuration (Single Source of Truth)
//! - Support for checkpoint groups and hierarchies
//! - Automatic transitive dependency resolution
//! - Cycle detection in dependency graph
use anyhow::{Context, Result, bail};
use chrono::{DateTime, Duration, Utc};
use serde::{Deserialize, Serialize};
use sha2::{Digest, Sha256};
use std::collections::{HashMap, HashSet};
use std::fs;
use std::io::{BufReader, Read};
use std::path::{Path, PathBuf};
use tokio::fs as async_fs;
use tokio::io::AsyncWriteExt;
// ============================================================================
// CONSTANTS
// ============================================================================
const INLINE_HASH_THRESHOLD: usize = 1024;
const HASH_STORAGE_DIR: &str = ".integrity_hashes";
const HASH_FILE_EXT: &str = ".hash";
const DEFAULT_DEPENDENCY_CONFIG: &str = "checkpoint_dependencies.toml";
// ============================================================================
// DEPENDENCY CONFIGURATION
// ============================================================================
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct DependencyConfig {
#[serde(default)]
pub checkpoints: HashMap<String, CheckpointConfig>,
#[serde(default)]
pub groups: HashMap<String, GroupConfig>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CheckpointConfig {
#[serde(default)]
pub description: String,
#[serde(default)]
pub depends_on: Vec<String>,
#[serde(default)]
pub group: Option<String>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GroupConfig {
#[serde(default)]
pub description: String,
pub members: Vec<String>,
#[serde(default)]
pub depends_on: Vec<String>,
}
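For reference, a minimal sketch of the kind of `checkpoint_dependencies.toml` these structs deserialize; the checkpoint and group names are illustrative, not taken from this repository:
fn example_config() -> anyhow::Result<()> {
    // Hypothetical config: two checkpoints and one group, exercising depends_on,
    // group membership, and transitive resolution.
    let toml_src = r#"
        [checkpoints.collect_raw]
        description = "Scrape raw listings"

        [checkpoints.enrich]
        description = "Enrich raw listings"
        depends_on = ["collect_raw"]
        group = "pipeline"

        [groups.pipeline]
        description = "Main enrichment pipeline"
        members = ["enrich"]
        depends_on = ["collect_raw"]
    "#;
    let config: DependencyConfig = toml::from_str(toml_src)?;
    config.validate()?;
    // "enrich" resolves to its group dependency plus its direct dependency (deduplicated)
    assert_eq!(config.get_all_dependencies("enrich")?, vec!["collect_raw".to_string()]);
    Ok(())
}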
impl DependencyConfig {
/// Load from file or return empty config
pub async fn from_file<P: AsRef<Path>>(path: P) -> Result<Self> {
let path = path.as_ref();
if !path.exists() {
return Ok(Self::default());
}
let content = async_fs::read_to_string(path).await
.with_context(|| format!("Failed to read: {}", path.display()))?;
let config: Self = toml::from_str(&content)
.context("Failed to parse dependency config")?;
config.validate()?;
Ok(config)
}
/// Validate configuration (checks for cycles and invalid references)
pub fn validate(&self) -> Result<()> {
// Check for cycles
for checkpoint in self.checkpoints.keys() {
self.detect_cycle(checkpoint)?;
}
// Validate group memberships
for (group_name, group) in &self.groups {
for member in &group.members {
if !self.checkpoints.contains_key(member) {
bail!("Group '{}' references unknown checkpoint: {}", group_name, member);
}
}
}
// Validate checkpoint group declarations
for (checkpoint_name, checkpoint) in &self.checkpoints {
if let Some(group_name) = &checkpoint.group {
let group = self.groups.get(group_name)
.ok_or_else(|| anyhow::anyhow!("Checkpoint '{}' references unknown group: {}", checkpoint_name, group_name))?;
if !group.members.contains(checkpoint_name) {
bail!("Checkpoint '{}' claims group '{}' but group doesn't list it",
checkpoint_name, group_name);
}
}
}
Ok(())
}
/// Detect cycles using DFS
fn detect_cycle(&self, start: &str) -> Result<()> {
let mut visited = HashSet::new();
let mut stack = HashSet::new();
self.dfs_cycle_check(start, &mut visited, &mut stack)
}
fn dfs_cycle_check(&self, node: &str, visited: &mut HashSet<String>, stack: &mut HashSet<String>) -> Result<()> {
if stack.contains(node) {
bail!("Cycle detected at checkpoint: {}", node);
}
if visited.contains(node) {
return Ok(());
}
visited.insert(node.to_string());
stack.insert(node.to_string());
if let Some(config) = self.checkpoints.get(node) {
for dep in &config.depends_on {
self.dfs_cycle_check(dep, visited, stack)?;
}
}
stack.remove(node);
Ok(())
}
/// Get all dependencies (including transitive and group dependencies)
pub fn get_all_dependencies(&self, checkpoint: &str) -> Result<Vec<String>> {
let mut deps = Vec::new();
let mut visited = HashSet::new();
self.collect_deps(checkpoint, &mut deps, &mut visited)?;
// Remove duplicates while preserving order
let mut seen = HashSet::new();
deps.retain(|d| seen.insert(d.clone()));
Ok(deps)
}
fn collect_deps(&self, node: &str, deps: &mut Vec<String>, visited: &mut HashSet<String>) -> Result<()> {
if visited.contains(node) {
return Ok(());
}
visited.insert(node.to_string());
let config = self.checkpoints.get(node)
.ok_or_else(|| anyhow::anyhow!("Unknown checkpoint: {}", node))?;
// Add group dependencies first
if let Some(group_name) = &config.group {
if let Some(group) = self.groups.get(group_name) {
for dep in &group.depends_on {
if !visited.contains(dep) {
deps.push(dep.clone());
self.collect_deps(dep, deps, visited)?;
}
}
}
}
// Add direct dependencies
for dep in &config.depends_on {
if !visited.contains(dep) {
deps.push(dep.clone());
self.collect_deps(dep, deps, visited)?;
}
}
Ok(())
}
/// Generate DOT format for visualization
pub fn to_dot(&self) -> String {
let mut dot = String::from("digraph Dependencies {\n rankdir=LR;\n node [shape=box];\n\n");
// Nodes
for (name, config) in &self.checkpoints {
let label = if config.description.is_empty() {
name.clone()
} else {
format!("{}\\n{}", name, config.description)
};
dot.push_str(&format!(" \"{}\" [label=\"{}\"];\n", name, label));
}
// Edges
dot.push_str("\n");
for (name, config) in &self.checkpoints {
// Group dependencies
if let Some(group_name) = &config.group {
if let Some(group) = self.groups.get(group_name) {
for dep in &group.depends_on {
dot.push_str(&format!(" \"{}\" -> \"{}\" [label=\"via {}\"];\n", name, dep, group_name));
}
}
}
// Direct dependencies
for dep in &config.depends_on {
dot.push_str(&format!(" \"{}\" -> \"{}\";\n", name, dep));
}
}
dot.push_str("}\n");
dot
}
}
// ============================================================================
// DATA STRUCTURES
// ============================================================================
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
#[serde(tag = "type", rename_all = "lowercase")]
pub enum ContentReference {
File { path: PathBuf },
Directory {
path: PathBuf,
include_patterns: Option<Vec<String>>,
exclude_patterns: Option<Vec<String>>,
},
Composite { references: Vec<ContentReference> },
}
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
#[serde(tag = "storage", rename_all = "lowercase")]
pub enum HashStorage {
Inline { hash: String },
External { hash_file: PathBuf },
}
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Hash)]
#[serde(rename_all = "lowercase")]
pub enum DataStage {
Cache,
Data,
Storage,
}
impl DataStage {
pub fn default_ttl(&self) -> Duration {
match self {
Self::Cache => Duration::hours(24),
Self::Data => Duration::days(7),
Self::Storage => Duration::days(365),
}
}
pub fn revalidation_interval(&self) -> Duration {
match self {
Self::Cache => Duration::hours(6),
Self::Data => Duration::days(1),
Self::Storage => Duration::days(30),
}
}
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StateEntry {
pub step_name: String,
pub completed: bool,
pub completed_at: Option<DateTime<Utc>>,
pub content_reference: Option<ContentReference>,
pub content_hash: Option<HashStorage>,
pub data_stage: Option<DataStage>,
pub ttl_override: Option<Duration>,
pub last_validated_at: Option<DateTime<Utc>>,
pub validation_status: ValidationStatus,
#[serde(default)]
pub dependencies: Vec<String>,
}
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
#[serde(rename_all = "lowercase")]
pub enum ValidationStatus {
Unknown,
Valid,
Invalid { reason: String },
Expired,
DependencyFailed { failed_dependency: String },
}
// ============================================================================
// HASH COMPUTATION
// ============================================================================
/// Hash a single file using SHA-256
pub fn hash_file<P: AsRef<Path>>(path: P) -> Result<String> {
let path = path.as_ref();
let file = fs::File::open(path)
.with_context(|| format!("Failed to open: {}", path.display()))?;
let mut reader = BufReader::new(file);
let mut hasher = Sha256::new();
let mut buffer = [0u8; 8192];
loop {
let bytes_read = reader.read(&mut buffer)?;
if bytes_read == 0 { break; }
hasher.update(&buffer[..bytes_read]);
}
Ok(format!("{:x}", hasher.finalize()))
}
/// Hash a directory recursively
pub fn hash_directory<P: AsRef<Path>>(
path: P,
include_patterns: Option<&[String]>,
exclude_patterns: Option<&[String]>,
) -> Result<String> {
let path = path.as_ref();
if !path.is_dir() {
bail!("Not a directory: {}", path.display());
}
let mut files = Vec::new();
collect_files_recursive(path, &mut files, include_patterns, exclude_patterns)?;
files.sort();
if files.is_empty() {
return Ok(String::from("d41d8cd98f00b204e9800998ecf8427e")); // Sentinel for an empty directory (the well-known MD5 of the empty string, used only as a placeholder)
}
let mut hasher = Sha256::new();
for file_path in files {
let rel_path = file_path.strip_prefix(path)
.unwrap_or(&file_path)
.to_string_lossy();
hasher.update(rel_path.as_bytes());
hasher.update(hash_file(&file_path)?.as_bytes());
}
Ok(format!("{:x}", hasher.finalize()))
}
fn collect_files_recursive(
dir: &Path,
files: &mut Vec<PathBuf>,
include: Option<&[String]>,
exclude: Option<&[String]>,
) -> Result<()> {
for entry in fs::read_dir(dir)? {
let path = entry?.path();
// Skip hidden files
if path.file_name()
.and_then(|n| n.to_str())
.map_or(false, |n| n.starts_with('.')) {
continue;
}
if path.is_dir() {
collect_files_recursive(&path, files, include, exclude)?;
} else if path.is_file() && should_include(&path, include, exclude) {
files.push(path);
}
}
Ok(())
}
fn should_include(path: &Path, include: Option<&[String]>, exclude: Option<&[String]>) -> bool {
let path_str = path.to_string_lossy();
// Check exclusions first
if let Some(patterns) = exclude {
if patterns.iter().any(|p| glob_match(&path_str, p)) {
return false;
}
}
// Check inclusions
match include {
Some(patterns) => patterns.iter().any(|p| glob_match(&path_str, p)),
None => true,
}
}
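/// Minimal glob support: a pattern without '*' is a plain suffix match; with exactly
/// one '*', the text before it must occur somewhere in the path and the text after it
/// must end the path. Patterns with more than one '*' never match.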
fn glob_match(path: &str, pattern: &str) -> bool {
if pattern.contains('*') {
let parts: Vec<&str> = pattern.split('*').collect();
if parts.len() == 2 {
path.contains(parts[0]) && path.ends_with(parts[1])
} else {
false
}
} else {
path.ends_with(pattern)
}
}
/// Hash content based on reference type
pub fn hash_content_reference(reference: &ContentReference) -> Result<String> {
match reference {
ContentReference::File { path } => hash_file(path),
ContentReference::Directory { path, include_patterns, exclude_patterns } => {
hash_directory(path, include_patterns.as_deref(), exclude_patterns.as_deref())
}
ContentReference::Composite { references } => {
let mut hasher = Sha256::new();
for ref_item in references {
hasher.update(hash_content_reference(ref_item)?.as_bytes());
}
Ok(format!("{:x}", hasher.finalize()))
}
}
}
// ============================================================================
// HASH STORAGE
// ============================================================================
fn determine_storage(hash: &str, base_dir: &Path) -> HashStorage {
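// Note: a SHA-256 hex digest is 64 characters, far below INLINE_HASH_THRESHOLD,
// so single hashes land in the inline branch; the external file path only applies
// to much larger values.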
if hash.len() > INLINE_HASH_THRESHOLD {
let hash_dir = base_dir.join(HASH_STORAGE_DIR);
let hash_file = hash_dir.join(format!("{}{}", &hash[..16], HASH_FILE_EXT));
HashStorage::External { hash_file }
} else {
HashStorage::Inline { hash: hash.to_string() }
}
}
async fn store_hash(hash: &str, storage: &HashStorage) -> Result<()> {
if let HashStorage::External { hash_file } = storage {
if let Some(parent) = hash_file.parent() {
async_fs::create_dir_all(parent).await?;
}
async_fs::write(hash_file, hash.as_bytes()).await?;
}
Ok(())
}
async fn load_hash(storage: &HashStorage) -> Result<String> {
match storage {
HashStorage::Inline { hash } => Ok(hash.clone()),
HashStorage::External { hash_file } => {
Ok(async_fs::read_to_string(hash_file).await?.trim().to_string())
}
}
}
// ============================================================================
// VALIDATION
// ============================================================================
/// Validate a single state entry
async fn validate_entry(entry: &StateEntry) -> Result<ValidationStatus> {
// Check if completed
if !entry.completed {
return Ok(ValidationStatus::Unknown);
}
// Get content reference and hash
let (content_ref, hash_storage) = match (&entry.content_reference, &entry.content_hash) {
(Some(r), Some(h)) => (r, h),
_ => return Ok(ValidationStatus::Unknown),
};
// Load stored hash
let stored_hash = load_hash(hash_storage).await?;
// Compute current hash
let current_hash = match hash_content_reference(content_ref) {
Ok(h) => h,
Err(e) => return Ok(ValidationStatus::Invalid {
reason: format!("Failed to compute hash: {}", e)
}),
};
// Check hash match
if stored_hash != current_hash {
return Ok(ValidationStatus::Invalid { reason: "Hash mismatch".to_string() });
}
// Check TTL
if let Some(stage) = entry.data_stage {
let ttl = entry.ttl_override.unwrap_or_else(|| stage.default_ttl());
if let Some(completed_at) = entry.completed_at {
if Utc::now() - completed_at > ttl {
return Ok(ValidationStatus::Expired);
}
}
}
Ok(ValidationStatus::Valid)
}
/// Validate all entries with cascade invalidation
async fn validate_all_entries(entries: &mut HashMap<String, StateEntry>) -> Result<ValidationReport> {
let mut report = ValidationReport::default();
// Validate each entry
for (name, entry) in entries.iter_mut() {
let status = validate_entry(entry).await?;
entry.validation_status = status.clone();
entry.last_validated_at = Some(Utc::now());
match status {
ValidationStatus::Valid => report.valid_count += 1,
ValidationStatus::Invalid { .. } => {
report.invalid_count += 1;
report.invalid_entries.push(name.clone());
}
ValidationStatus::Expired => {
report.expired_count += 1;
report.expired_entries.push(name.clone());
}
ValidationStatus::Unknown => report.unknown_count += 1,
ValidationStatus::DependencyFailed { .. } => {}
}
}
// Cascade invalidation
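// Repeatedly propagate failures to dependents until a fixed point is reached: any entry
// whose dependency was invalidated becomes DependencyFailed, which can in turn
// invalidate its own dependents.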
let mut invalidated: HashSet<String> = report.invalid_entries.iter().cloned().collect();
loop {
let mut newly_invalidated = Vec::new();
for (name, entry) in entries.iter() {
if invalidated.contains(name) {
continue;
}
// Check if any dependency is invalidated
if let Some(failed_dep) = entry.dependencies.iter().find(|d| invalidated.contains(*d)) {
newly_invalidated.push((name.clone(), failed_dep.clone()));
}
}
if newly_invalidated.is_empty() {
break;
}
for (name, failed_dep) in newly_invalidated {
invalidated.insert(name.clone());
report.cascaded_invalidations.push(name.clone());
if let Some(entry) = entries.get_mut(&name) {
entry.validation_status = ValidationStatus::DependencyFailed { failed_dependency: failed_dep };
}
}
}
Ok(report)
}
#[derive(Debug, Default)]
pub struct ValidationReport {
pub valid_count: usize,
pub invalid_count: usize,
pub expired_count: usize,
pub unknown_count: usize,
pub invalid_entries: Vec<String>,
pub expired_entries: Vec<String>,
pub cascaded_invalidations: Vec<String>,
}
impl ValidationReport {
pub fn print_summary(&self) {
println!("=== Validation Report ===");
println!("Valid: {}", self.valid_count);
println!("Invalid: {}", self.invalid_count);
println!("Expired: {}", self.expired_count);
println!("Unknown: {}", self.unknown_count);
if !self.invalid_entries.is_empty() {
println!("\nInvalid entries:");
for entry in &self.invalid_entries {
println!(" - {}", entry);
}
}
if !self.expired_entries.is_empty() {
println!("\nExpired entries:");
for entry in &self.expired_entries {
println!(" - {}", entry);
}
}
if !self.cascaded_invalidations.is_empty() {
println!("\nCascaded invalidations:");
for entry in &self.cascaded_invalidations {
println!(" - {}", entry);
}
}
}
}
// ============================================================================
// STATE MANAGEMENT
// ============================================================================
/// State manager with centralized dependency configuration
///
/// # Orchestration: Shutdown Flag + State Management
///
/// ## Happy Path (Normal Completion)
/// 1. Work completes successfully
/// 2. Call `mark_valid()` on the step's StateEntry
/// 3. StateEntry saved with timestamp and valid hash
/// 4. On next run: skips already-completed step
///
/// ## Shutdown Path (Interrupted Work)
/// 1. Shutdown flag is set via Ctrl+C handler
/// 2. Long-running code checks: `if shutdown_flag.load(Ordering::SeqCst) { break }`
/// 3. Before returning, call `mark_invalid()`
/// 4. StateEntry saved with `completed: false` and ValidationStatus::Invalid
/// 5. On next run: retries invalid step
///
/// ## Usage Pattern
///
/// ```rust
/// let manager = StateManager::new(&paths.integrity_dir()).await?;
/// let content_ref = directory_reference(&output_dir, None, None);
/// let entry = manager.create_entry(step_name.to_string(), content_ref, DataStage::Data).await?;
///
/// loop {
///     if shutdown_flag.load(Ordering::SeqCst) {
///         manager.mark_invalid(entry, "invalid due to shutdown".to_string()).await?;
///         return Ok(());
///     }
///     // Do work...
/// }
///
/// // Completed successfully
/// manager.mark_valid(entry).await?;
/// ```
pub struct StateManager {
base_dir: PathBuf,
dependency_config: DependencyConfig,
}
impl StateManager {
/// Create new state manager and load dependency configuration
pub async fn new<P: AsRef<Path>>(base_dir: P) -> Result<Self> {
let base_dir = base_dir.as_ref().to_path_buf();
let config_path = base_dir.join(DEFAULT_DEPENDENCY_CONFIG);
let dependency_config = DependencyConfig::from_file(config_path).await?;
Ok(Self { base_dir, dependency_config })
}
/// Create with explicit dependency configuration
pub fn with_config<P: AsRef<Path>>(base_dir: P, dependency_config: DependencyConfig) -> Result<Self> {
dependency_config.validate()?;
Ok(Self {
base_dir: base_dir.as_ref().to_path_buf(),
dependency_config,
})
}
/// Get the dependency configuration
pub fn get_dependency_config(&self) -> &DependencyConfig {
&self.dependency_config
}
/// Load all state entries from state.jsonl
pub async fn load_entries(&self) -> Result<HashMap<String, StateEntry>> {
let state_file = self.base_dir.join("state.jsonl");
if !state_file.exists() {
return Ok(HashMap::new());
}
let content = async_fs::read_to_string(&state_file).await?;
let mut entries = HashMap::new();
for line in content.lines() {
if line.trim().is_empty() {
continue;
}
if let Ok(entry) = serde_json::from_str::<StateEntry>(line) {
entries.insert(entry.step_name.clone(), entry);
}
}
Ok(entries)
}
/// Save all state entries to state.jsonl
pub async fn save_entries(&self, entries: &HashMap<String, StateEntry>) -> Result<()> {
// Ensure the state directory itself exists before writing state.jsonl
async_fs::create_dir_all(&self.base_dir).await?;
let mut file = async_fs::File::create(self.base_dir.join("state.jsonl")).await?;
for entry in entries.values() {
file.write_all((serde_json::to_string(&entry)? + "\n").as_bytes()).await?;
}
file.sync_all().await?;
Ok(())
}
/// Create an empty entry for a step (can be updated later)
///
/// Creates a placeholder entry that marks the step as incomplete and unknown,
/// allowing you to later mark it as valid or invalid via `mark_valid()` or `mark_invalid()`.
///
/// # Example
/// ```rust
/// let manager = StateManager::new(&paths.integrity_dir()).await?;
///
/// // Start tracking a long step
/// let content_ref = directory_reference(&output_dir, None, None);
/// let entry = manager.create_entry("long_operation".to_string(), content_ref, DataStage::Data).await?;
///
/// // Do work...
///
/// // Mark as valid when done
/// manager.mark_valid(entry).await?;
/// ```
pub async fn create_entry(&self, step_name: String, content_reference: ContentReference, data_stage: DataStage) -> Result<StateEntry> {
// Resolve dependencies from configuration
let dependencies = self.dependency_config
.get_all_dependencies(&step_name)
.unwrap_or_default();
// Create empty entry with Unknown status
let entry = StateEntry {
step_name: step_name.clone(),
completed: false,
completed_at: None,
content_reference: Some(content_reference),
content_hash: None,
data_stage: Some(data_stage),
ttl_override: None,
last_validated_at: Some(Utc::now()),
validation_status: ValidationStatus::Unknown,
dependencies,
};
// Update and save
let mut entries = self.load_entries().await?;
entries.insert(step_name, entry.clone());
self.save_entries(&entries).await?;
Ok(entry)
}
/// Mark a StateEntry as valid and save to disk
///
/// Updates the entry with:
/// - `completed: true`
/// - `completed_at: now`
/// - `validation_status: Valid`
/// - Computes and stores content hash
///
/// # Requires
/// - `entry.content_reference` must be `Some()`
/// - `entry.data_stage` must be `Some()`
pub async fn mark_valid(&self, mut entry: StateEntry) -> Result<StateEntry> {
// Get content reference and data stage (required)
let content_reference = entry.content_reference.as_ref()
.ok_or_else(|| anyhow::anyhow!("content_reference is required to mark entry valid"))?;
let data_stage = entry.data_stage
.ok_or_else(|| anyhow::anyhow!("data_stage is required to mark entry valid"))?;
// Compute and store hash
let hash = hash_content_reference(content_reference)?;
let storage = determine_storage(&hash, &self.base_dir);
store_hash(&hash, &storage).await?;
// Update entry
entry.completed = true;
entry.completed_at = Some(Utc::now());
entry.content_hash = Some(storage);
entry.data_stage = Some(data_stage);
entry.last_validated_at = Some(Utc::now());
entry.validation_status = ValidationStatus::Valid;
// Save
let mut entries = self.load_entries().await?;
entries.insert(entry.step_name.clone(), entry.clone());
self.save_entries(&entries).await?;
Ok(entry)
}
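A minimal sketch of the intended flow around `create_entry` and `mark_valid` (the step name and output path are hypothetical; `StateManager::new`, `file_reference`, and `DataStage::Data` are taken from this module):

```rust
// Hypothetical step: track one output file, then mark the step valid once it is written.
let manager = StateManager::new(&paths.integrity_dir()).await?;
let entry = manager
    .create_entry(
        "example_step".to_string(),                // hypothetical step name
        file_reference("data/example_step.jsonl"), // hypothetical output path
        DataStage::Data,
    )
    .await?;

// ... produce data/example_step.jsonl ...

// content_reference and data_stage are already Some(), so mark_valid can hash and persist.
let entry = manager.mark_valid(entry).await?;
assert!(entry.completed);
```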
/// Mark a StateEntry as invalid and save to disk
///
/// Updates the entry with:
/// - `completed: false`
/// - `completed_at: None`
/// - `validation_status: Invalid { reason }`
pub async fn mark_invalid(&self, mut entry: StateEntry, reason: String) -> Result<StateEntry> {
// Update entry
entry.completed = false;
entry.completed_at = None;
entry.last_validated_at = Some(Utc::now());
entry.validation_status = ValidationStatus::Invalid { reason };
// Save
let mut entries = self.load_entries().await?;
entries.insert(entry.step_name.clone(), entry.clone());
self.save_entries(&entries).await?;
Ok(entry)
}
/// Check if a step is valid and completed
pub async fn is_step_valid(&self, step_name: &str) -> Result<bool> {
let entries = self.load_entries().await?;
if let Some(entry) = entries.get(step_name) {
let status = validate_entry(entry).await?;
Ok(matches!(status, ValidationStatus::Valid))
} else {
Ok(false)
}
}
/// Run full validation on all entries
pub async fn validate_all(&self) -> Result<ValidationReport> {
let mut entries = self.load_entries().await?;
let report = validate_all_entries(&mut entries).await?;
self.save_entries(&entries).await?;
Ok(report)
}
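A short guard pattern a caller could build on these two methods (step name hypothetical):

```rust
// Skip a step that is already recorded and still validates; otherwise run it
// and re-validate the whole state file afterwards.
if manager.is_step_valid("example_step").await? {
    return Ok(());
}

// ... run the step and mark it valid or invalid as shown above ...

let _report = manager.validate_all().await?;
```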
/// Print dependency graph information
pub fn print_dependency_graph(&self) {
println!("=== Dependency Configuration ===");
println!("\nCheckpoints: {}", self.dependency_config.checkpoints.len());
println!("Groups: {}", self.dependency_config.groups.len());
println!("\n--- Checkpoints ---");
for (name, config) in &self.dependency_config.checkpoints {
println!("{}", name);
if !config.description.is_empty() {
println!(" Description: {}", config.description);
}
if let Some(group) = &config.group {
println!(" Group: {}", group);
}
if !config.depends_on.is_empty() {
println!(" Depends on: {}", config.depends_on.join(", "));
}
// Show resolved dependencies
if let Ok(all_deps) = self.dependency_config.get_all_dependencies(name) {
if !all_deps.is_empty() {
println!(" Resolved (including transitive): {}", all_deps.join(", "));
}
}
println!();
}
println!("\n--- Groups ---");
for (name, group) in &self.dependency_config.groups {
println!("{}", name);
if !group.description.is_empty() {
println!(" Description: {}", group.description);
}
println!(" Members: {}", group.members.join(", "));
if !group.depends_on.is_empty() {
println!(" Group dependencies: {}", group.depends_on.join(", "));
}
println!();
}
}
}
// ============================================================================
// HELPER FUNCTIONS
// ============================================================================
/// Create a simple file reference
pub fn file_reference<P: AsRef<Path>>(path: P) -> ContentReference {
ContentReference::File { path: path.as_ref().to_path_buf() }
}
/// Create a directory reference
pub fn directory_reference<P: AsRef<Path>>(
path: P,
include_patterns: Option<Vec<String>>,
exclude_patterns: Option<Vec<String>>,
) -> ContentReference {
ContentReference::Directory {
path: path.as_ref().to_path_buf(),
include_patterns,
exclude_patterns,
}
}
/// Create a composite reference
pub fn composite_reference(references: Vec<ContentReference>) -> ContentReference {
ContentReference::Composite { references }
}
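The three constructors compose; a small sketch with hypothetical paths and patterns:

```rust
// One composite reference covering a directory of JSONL files plus a summary file,
// so a single content hash tracks both.
let reference = composite_reference(vec![
    directory_reference(
        "data/securities",
        Some(vec!["*.jsonl".to_string()]), // include pattern (hypothetical)
        None,                              // no exclude patterns
    ),
    file_reference("data/securities/summary.json"),
]);
```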

View File

@@ -5,8 +5,6 @@ use tokio::sync::Mutex;
use std::fs::{self, OpenOptions};
use std::io::Write;
use std::path::PathBuf;
use std::sync::Arc;
use std::sync::atomic::{AtomicUsize, Ordering};
static LOGGER: Lazy<Mutex<Option<DebugLogger>>> = Lazy::new(|| Mutex::new(None));
@@ -78,83 +76,3 @@ pub async fn log_warn(msg: &str) {
pub async fn log_error(msg: &str) {
    log_detailed("ERROR", msg).await;
}
struct PoolLogger {
file: std::fs::File,
log_path: PathBuf,
}
impl PoolLogger {
fn new(log_dir: &std::path::Path) -> std::io::Result<Self> {
fs::create_dir_all(log_dir)?;
let filename = format!("webdriver_{}.log", Local::now().format("%Y%m%d_%H%M%S"));
let log_path = log_dir.join(&filename);
let file = OpenOptions::new()
.create(true)
.append(true)
.open(&log_path)?;
Ok(Self { file, log_path })
}
async fn log(&mut self, msg: &str) {
let line = format!("[{}] {}\n", Local::now().format("%H:%M:%S"), msg);
let _ = self.file.write_all(line.as_bytes());
let _ = self.file.flush();
println!("{}", line.trim_end());
}
}
pub struct PoolMetrics {
pub total_requests: Arc<AtomicUsize>,
pub successful_requests: Arc<AtomicUsize>,
pub failed_requests: Arc<AtomicUsize>,
pub session_renewals: Arc<AtomicUsize>,
pub rotation_events: Arc<AtomicUsize>,
pub retries: Arc<AtomicUsize>,
// IMPROVEMENT: New metrics for better monitoring
pub navigation_timeouts: Arc<AtomicUsize>,
pub bot_detection_hits: Arc<AtomicUsize>,
pub proxy_failures: Arc<AtomicUsize>,
}
impl PoolMetrics {
pub fn new() -> Self {
Self {
total_requests: Arc::new(AtomicUsize::new(0)),
successful_requests: Arc::new(AtomicUsize::new(0)),
failed_requests: Arc::new(AtomicUsize::new(0)),
session_renewals: Arc::new(AtomicUsize::new(0)),
rotation_events: Arc::new(AtomicUsize::new(0)),
retries: Arc::new(AtomicUsize::new(0)),
navigation_timeouts: Arc::new(AtomicUsize::new(0)),
bot_detection_hits: Arc::new(AtomicUsize::new(0)),
proxy_failures: Arc::new(AtomicUsize::new(0)),
}
}
pub async fn log_stats(&self) {
let total = self.total_requests.load(Ordering::Relaxed);
let success = self.successful_requests.load(Ordering::Relaxed);
// FIX: Prefix unused variable with underscore
let _failed = self.failed_requests.load(Ordering::Relaxed);
let renewals = self.session_renewals.load(Ordering::Relaxed);
let rotations = self.rotation_events.load(Ordering::Relaxed);
let retries = self.retries.load(Ordering::Relaxed);
let timeouts = self.navigation_timeouts.load(Ordering::Relaxed);
let bot_hits = self.bot_detection_hits.load(Ordering::Relaxed);
let proxy_fails = self.proxy_failures.load(Ordering::Relaxed);
let success_rate = if total > 0 {
(success as f64 / total as f64) * 100.0
} else {
0.0
};
crate::util::logger::log_info(&format!(
"Pool Metrics: {} total requests, {:.1}% success rate, {} renewals, {} rotations, {} retries, {} timeouts, {} bot detections, {} proxy failures",
total, success_rate, renewals, rotations, retries, timeouts, bot_hits, proxy_fails
)).await;
}
}

28
src/util/macros.rs Normal file
View File

@@ -0,0 +1,28 @@
// src/util/macros.rs
#[macro_export]
macro_rules! check_shutdown {
($shutdown_flag:expr) => {
if $shutdown_flag.load(std::sync::atomic::Ordering::SeqCst) {
logger::log_warn("Shutdown detected, stopping processes").await;
return Ok(());
}
};
}
/// Mark incomplete state on shutdown
/// Usage: mark_incomplete_on_shutdown!(&manager, "step_name", content_ref, DataStage::Data, &shutdown_flag);
#[macro_export]
macro_rules! mark_incomplete_on_shutdown {
($manager:expr, $step_name:expr, $content_ref:expr, $data_stage:expr, $shutdown_flag:expr) => {
if $shutdown_flag.load(std::sync::atomic::Ordering::SeqCst) {
$manager
.mark_incomplete(
$step_name.to_string(),
$content_ref,
$data_stage,
"Incomplete due to shutdown".to_string(),
)
.await?;
}
};
}
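A usage sketch for both macros inside a long-running async step (the surrounding function, step name, and content reference are hypothetical; `StateManager::mark_incomplete` is assumed to exist as referenced by the macro, and `logger` must be in scope because `check_shutdown!` calls `logger::log_warn`):

```rust
use std::sync::atomic::AtomicBool;

use crate::util::logger; // check_shutdown! expands to a call to logger::log_warn

async fn run_example_step(
    manager: &StateManager,
    shutdown_flag: &AtomicBool,
) -> anyhow::Result<()> {
    for _batch in 0..10 {
        // Returns Ok(()) early if a shutdown was requested.
        check_shutdown!(shutdown_flag);
        // ... process one batch ...
    }

    // Record an incomplete state if the flag was raised before the step could finish.
    mark_incomplete_on_shutdown!(
        manager,
        "example_step",                            // hypothetical step name
        file_reference("data/example_step.jsonl"), // hypothetical output
        DataStage::Data,
        shutdown_flag
    );
    Ok(())
}
```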

View File

@@ -2,3 +2,5 @@
pub mod logger;
pub mod directories;
pub mod opnv;
pub mod macros;
pub mod integrity;