Compare commits
5 Commits
d26e833d93
...
5e81959322
| Author | SHA1 | Date | |
|---|---|---|---|
| 5e81959322 | |||
| b366f366e6 | |||
| cd91de253b | |||
| c51b36c125 | |||
| 9c66f0d361 |
38
.env.example
38
.env.example
@@ -3,46 +3,38 @@
|
|||||||
# This file configures the behavior of the WebScraper application
|
# This file configures the behavior of the WebScraper application
|
||||||
# Copy to .env and adjust values as needed
|
# Copy to .env and adjust values as needed
|
||||||
|
|
||||||
# ===== ECONOMIC DATA =====
|
OPENFIGI_API_KEY=
|
||||||
# Start date for economic event scraping
|
|
||||||
|
# Economic calendar start (usually the earliest available on finanzen.net)
|
||||||
ECONOMIC_START_DATE=2007-02-13
|
ECONOMIC_START_DATE=2007-02-13
|
||||||
|
|
||||||
# How far into the future to look ahead for economic events (in months)
|
# Corporate earnings & price history start
|
||||||
ECONOMIC_LOOKAHEAD_MONTHS=3
|
|
||||||
|
|
||||||
# ===== CORPORATE DATA =====
|
|
||||||
# Start date for corporate earnings/data scraping
|
|
||||||
CORPORATE_START_DATE=2010-01-01
|
CORPORATE_START_DATE=2010-01-01
|
||||||
|
|
||||||
# ===== PERFORMANCE & CONCURRENCY =====
|
# How far into the future we scrape economic events (in months)
|
||||||
# Maximum number of parallel ChromeDriver instances
|
ECONOMIC_LOOKAHEAD_MONTHS=3
|
||||||
# Higher = more concurrent tasks, but higher resource usage
|
|
||||||
MAX_PARALLEL_INSTANCES=3
|
|
||||||
|
|
||||||
# Maximum tasks per ChromeDriver instance before recycling
|
# Maximum number of parallel scraping tasks (default: 10)
|
||||||
# 0 = unlimited (instance lives for entire application runtime)
|
MAX_PARALLEL_TASKS=10
|
||||||
MAX_TASKS_PER_INSTANCE=0
|
|
||||||
|
|
||||||
# ===== VPN ROTATION (ProtonVPN Integration) =====
|
# ===== VPN ROTATION (ProtonVPN Integration) =====
|
||||||
# Enable automatic VPN rotation between sessions?
|
# Enable automatic VPN rotation between sessions?
|
||||||
# If false, all traffic goes through system without VPN tunneling
|
# If false, all traffic goes through system without VPN tunneling
|
||||||
ENABLE_VPN_ROTATION=false
|
ENABLE_VPN_ROTATION=true
|
||||||
|
|
||||||
# Comma-separated list of ProtonVPN servers to rotate through
|
|
||||||
# Examples:
|
|
||||||
# "US-Free#1,US-Free#2,UK-Free#1"
|
|
||||||
# "US,UK,JP,DE,NL"
|
|
||||||
# NOTE: Must have ENABLE_VPN_ROTATION=true for this to take effect
|
|
||||||
VPN_SERVERS=
|
|
||||||
|
|
||||||
# Number of tasks per VPN session before rotating to new server/IP
|
# Number of tasks per VPN session before rotating to new server/IP
|
||||||
# 0 = rotate between economic and corporate phases (one phase = one IP)
|
# 0 = rotate between economic and corporate phases (one phase = one IP)
|
||||||
# 5 = rotate every 5 tasks
|
# 5 = rotate every 5 tasks
|
||||||
# NOTE: Must have ENABLE_VPN_ROTATION=true for this to take effect
|
# NOTE: Must have ENABLE_VPN_ROTATION=true for this to take effect
|
||||||
TASKS_PER_VPN_SESSION=0
|
TASKS_PER_VPN_SESSION=50
|
||||||
|
|
||||||
# ===== LOGGING =====
|
# ===== LOGGING =====
|
||||||
# Set via RUST_LOG environment variable:
|
# Set via RUST_LOG environment variable:
|
||||||
# RUST_LOG=info cargo run
|
# RUST_LOG=info cargo run
|
||||||
# RUST_LOG=debug cargo run
|
# RUST_LOG=debug cargo run
|
||||||
# Leave empty or unset for default logging level
|
# Leave empty or unset for default logging level
|
||||||
|
|
||||||
|
|
||||||
|
MAX_REQUESTS_PER_SESSION=25
|
||||||
|
MIN_REQUEST_INTERVAL_MS=300
|
||||||
|
MAX_RETRY_ATTEMPTS=3
|
||||||
212
Cargo.lock
generated
212
Cargo.lock
generated
@@ -110,6 +110,17 @@ dependencies = [
|
|||||||
"tokio",
|
"tokio",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "async-trait"
|
||||||
|
version = "0.1.89"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "9035ad2d096bed7955a320ee7e2230574d28fd3c3a0f186cbea1ff3c7eed5dbb"
|
||||||
|
dependencies = [
|
||||||
|
"proc-macro2",
|
||||||
|
"quote",
|
||||||
|
"syn 2.0.110",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "atomic-waker"
|
name = "atomic-waker"
|
||||||
version = "1.1.2"
|
version = "1.1.2"
|
||||||
@@ -122,6 +133,64 @@ version = "1.5.0"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8"
|
checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "axum"
|
||||||
|
version = "0.7.9"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "edca88bc138befd0323b20752846e6587272d3b03b0343c8ea28a6f819e6e71f"
|
||||||
|
dependencies = [
|
||||||
|
"async-trait",
|
||||||
|
"axum-core",
|
||||||
|
"base64 0.22.1",
|
||||||
|
"bytes",
|
||||||
|
"futures-util",
|
||||||
|
"http 1.3.1",
|
||||||
|
"http-body 1.0.1",
|
||||||
|
"http-body-util",
|
||||||
|
"hyper 1.8.1",
|
||||||
|
"hyper-util",
|
||||||
|
"itoa",
|
||||||
|
"matchit",
|
||||||
|
"memchr",
|
||||||
|
"mime",
|
||||||
|
"percent-encoding",
|
||||||
|
"pin-project-lite",
|
||||||
|
"rustversion",
|
||||||
|
"serde",
|
||||||
|
"serde_json",
|
||||||
|
"serde_path_to_error",
|
||||||
|
"serde_urlencoded",
|
||||||
|
"sha1",
|
||||||
|
"sync_wrapper",
|
||||||
|
"tokio",
|
||||||
|
"tokio-tungstenite 0.24.0",
|
||||||
|
"tower",
|
||||||
|
"tower-layer",
|
||||||
|
"tower-service",
|
||||||
|
"tracing",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "axum-core"
|
||||||
|
version = "0.4.5"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "09f2bd6146b97ae3359fa0cc6d6b376d9539582c7b4220f041a33ec24c226199"
|
||||||
|
dependencies = [
|
||||||
|
"async-trait",
|
||||||
|
"bytes",
|
||||||
|
"futures-util",
|
||||||
|
"http 1.3.1",
|
||||||
|
"http-body 1.0.1",
|
||||||
|
"http-body-util",
|
||||||
|
"mime",
|
||||||
|
"pin-project-lite",
|
||||||
|
"rustversion",
|
||||||
|
"sync_wrapper",
|
||||||
|
"tower-layer",
|
||||||
|
"tower-service",
|
||||||
|
"tracing",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "base64"
|
name = "base64"
|
||||||
version = "0.21.7"
|
version = "0.21.7"
|
||||||
@@ -660,34 +729,6 @@ dependencies = [
|
|||||||
"windows-sys 0.61.2",
|
"windows-sys 0.61.2",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "event_backtest_engine"
|
|
||||||
version = "0.1.0"
|
|
||||||
dependencies = [
|
|
||||||
"anyhow",
|
|
||||||
"chrono",
|
|
||||||
"csv",
|
|
||||||
"dotenvy",
|
|
||||||
"fantoccini",
|
|
||||||
"flate2",
|
|
||||||
"futures",
|
|
||||||
"once_cell",
|
|
||||||
"rand 0.9.2",
|
|
||||||
"rayon",
|
|
||||||
"regex",
|
|
||||||
"reqwest",
|
|
||||||
"scraper",
|
|
||||||
"serde",
|
|
||||||
"serde_json",
|
|
||||||
"tokio",
|
|
||||||
"tracing",
|
|
||||||
"tracing-subscriber",
|
|
||||||
"url",
|
|
||||||
"walkdir",
|
|
||||||
"yfinance-rs",
|
|
||||||
"zip",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "fantoccini"
|
name = "fantoccini"
|
||||||
version = "0.20.0"
|
version = "0.20.0"
|
||||||
@@ -1099,6 +1140,7 @@ dependencies = [
|
|||||||
"http 1.3.1",
|
"http 1.3.1",
|
||||||
"http-body 1.0.1",
|
"http-body 1.0.1",
|
||||||
"httparse",
|
"httparse",
|
||||||
|
"httpdate",
|
||||||
"itoa",
|
"itoa",
|
||||||
"pin-project-lite",
|
"pin-project-lite",
|
||||||
"pin-utils",
|
"pin-utils",
|
||||||
@@ -1522,6 +1564,12 @@ dependencies = [
|
|||||||
"regex-automata",
|
"regex-automata",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "matchit"
|
||||||
|
version = "0.7.3"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "0e7465ac9959cc2b1404e8e2367b43684a6d13790fe23056cc8c6c5a6b7bcb94"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "memchr"
|
name = "memchr"
|
||||||
version = "2.7.6"
|
version = "2.7.6"
|
||||||
@@ -2684,6 +2732,17 @@ dependencies = [
|
|||||||
"serde_core",
|
"serde_core",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "serde_path_to_error"
|
||||||
|
version = "0.1.20"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "10a9ff822e371bb5403e391ecd83e182e0e77ba7f6fe0160b795797109d1b457"
|
||||||
|
dependencies = [
|
||||||
|
"itoa",
|
||||||
|
"serde",
|
||||||
|
"serde_core",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "serde_urlencoded"
|
name = "serde_urlencoded"
|
||||||
version = "0.7.1"
|
version = "0.7.1"
|
||||||
@@ -3100,6 +3159,30 @@ dependencies = [
|
|||||||
"tokio",
|
"tokio",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "tokio-tungstenite"
|
||||||
|
version = "0.21.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "c83b561d025642014097b66e6c1bb422783339e0909e4429cde4749d1990bc38"
|
||||||
|
dependencies = [
|
||||||
|
"futures-util",
|
||||||
|
"log",
|
||||||
|
"tokio",
|
||||||
|
"tungstenite 0.21.0",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "tokio-tungstenite"
|
||||||
|
version = "0.24.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "edc5f74e248dc973e0dbb7b74c7e0d6fcc301c694ff50049504004ef4d0cdcd9"
|
||||||
|
dependencies = [
|
||||||
|
"futures-util",
|
||||||
|
"log",
|
||||||
|
"tokio",
|
||||||
|
"tungstenite 0.24.0",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "tokio-tungstenite"
|
name = "tokio-tungstenite"
|
||||||
version = "0.28.0"
|
version = "0.28.0"
|
||||||
@@ -3113,7 +3196,7 @@ dependencies = [
|
|||||||
"rustls-pki-types",
|
"rustls-pki-types",
|
||||||
"tokio",
|
"tokio",
|
||||||
"tokio-rustls 0.26.4",
|
"tokio-rustls 0.26.4",
|
||||||
"tungstenite",
|
"tungstenite 0.28.0",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@@ -3172,6 +3255,7 @@ dependencies = [
|
|||||||
"tokio",
|
"tokio",
|
||||||
"tower-layer",
|
"tower-layer",
|
||||||
"tower-service",
|
"tower-service",
|
||||||
|
"tracing",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@@ -3210,6 +3294,7 @@ version = "0.1.41"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "784e0ac535deb450455cbfa28a6f0df145ea1bb7ae51b821cf5e7927fdcfbdd0"
|
checksum = "784e0ac535deb450455cbfa28a6f0df145ea1bb7ae51b821cf5e7927fdcfbdd0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
|
"log",
|
||||||
"pin-project-lite",
|
"pin-project-lite",
|
||||||
"tracing-attributes",
|
"tracing-attributes",
|
||||||
"tracing-core",
|
"tracing-core",
|
||||||
@@ -3271,6 +3356,43 @@ version = "0.2.5"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b"
|
checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "tungstenite"
|
||||||
|
version = "0.21.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "9ef1a641ea34f399a848dea702823bbecfb4c486f911735368f1f137cb8257e1"
|
||||||
|
dependencies = [
|
||||||
|
"byteorder",
|
||||||
|
"bytes",
|
||||||
|
"data-encoding",
|
||||||
|
"http 1.3.1",
|
||||||
|
"httparse",
|
||||||
|
"log",
|
||||||
|
"rand 0.8.5",
|
||||||
|
"sha1",
|
||||||
|
"thiserror 1.0.69",
|
||||||
|
"url",
|
||||||
|
"utf-8",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "tungstenite"
|
||||||
|
version = "0.24.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "18e5b8366ee7a95b16d32197d0b2604b43a0be89dc5fac9f8e96ccafbaedda8a"
|
||||||
|
dependencies = [
|
||||||
|
"byteorder",
|
||||||
|
"bytes",
|
||||||
|
"data-encoding",
|
||||||
|
"http 1.3.1",
|
||||||
|
"httparse",
|
||||||
|
"log",
|
||||||
|
"rand 0.8.5",
|
||||||
|
"sha1",
|
||||||
|
"thiserror 1.0.69",
|
||||||
|
"utf-8",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "tungstenite"
|
name = "tungstenite"
|
||||||
version = "0.28.0"
|
version = "0.28.0"
|
||||||
@@ -3484,6 +3606,36 @@ dependencies = [
|
|||||||
"wasm-bindgen",
|
"wasm-bindgen",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "web_scraper"
|
||||||
|
version = "0.1.0"
|
||||||
|
dependencies = [
|
||||||
|
"anyhow",
|
||||||
|
"axum",
|
||||||
|
"chrono",
|
||||||
|
"csv",
|
||||||
|
"dotenvy",
|
||||||
|
"fantoccini",
|
||||||
|
"flate2",
|
||||||
|
"futures",
|
||||||
|
"once_cell",
|
||||||
|
"rand 0.9.2",
|
||||||
|
"rayon",
|
||||||
|
"regex",
|
||||||
|
"reqwest",
|
||||||
|
"scraper",
|
||||||
|
"serde",
|
||||||
|
"serde_json",
|
||||||
|
"tokio",
|
||||||
|
"tokio-tungstenite 0.21.0",
|
||||||
|
"tracing",
|
||||||
|
"tracing-subscriber",
|
||||||
|
"url",
|
||||||
|
"walkdir",
|
||||||
|
"yfinance-rs",
|
||||||
|
"zip",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "webdriver"
|
name = "webdriver"
|
||||||
version = "0.50.0"
|
version = "0.50.0"
|
||||||
@@ -3798,7 +3950,7 @@ dependencies = [
|
|||||||
"serde_json",
|
"serde_json",
|
||||||
"thiserror 2.0.17",
|
"thiserror 2.0.17",
|
||||||
"tokio",
|
"tokio",
|
||||||
"tokio-tungstenite",
|
"tokio-tungstenite 0.28.0",
|
||||||
"url",
|
"url",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
[package]
|
[package]
|
||||||
name = "event_backtest_engine"
|
name = "web_scraper"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
edition = "2024"
|
edition = "2024"
|
||||||
authors = ["Your Name <you@example.com>"]
|
authors = ["Your Name <you@example.com>"]
|
||||||
@@ -54,3 +54,7 @@ once_cell = "1.21.3"
|
|||||||
# Parallel processing (for batch tickers)
|
# Parallel processing (for batch tickers)
|
||||||
futures = "0.3"
|
futures = "0.3"
|
||||||
rayon = "1.10" # optional: for parallel price downloads
|
rayon = "1.10" # optional: for parallel price downloads
|
||||||
|
|
||||||
|
# Web server for dashboard
|
||||||
|
axum = { version = "0.7", features = ["ws"] }
|
||||||
|
tokio-tungstenite = "0.21" # For WebSocket support
|
||||||
@@ -1,32 +1,52 @@
|
|||||||
|
// src/config.rs - FIXED VERSION
|
||||||
|
|
||||||
use anyhow::{Context, Result};
|
use anyhow::{Context, Result};
|
||||||
use chrono::{self};
|
use chrono::{self};
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
|
|
||||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
pub struct Config {
|
pub struct Config {
|
||||||
// Economic calendar start (usually the earliest available on finanzen.net)
|
pub economic_start_date: String,
|
||||||
pub economic_start_date: String, // e.g. "2007-02-13"
|
pub corporate_start_date: String,
|
||||||
// Corporate earnings & price history start
|
pub economic_lookahead_months: u32,
|
||||||
pub corporate_start_date: String, // e.g. "2000-01-01" or "2010-01-01"
|
|
||||||
// How far into the future we scrape economic events
|
|
||||||
pub economic_lookahead_months: u32, // default: 3
|
|
||||||
/// Maximum number of parallel scraping tasks (default: 10).
|
|
||||||
/// This limits concurrency to protect system load and prevent website spamming.
|
|
||||||
#[serde(default = "default_max_parallel_instances")]
|
#[serde(default = "default_max_parallel_instances")]
|
||||||
pub max_parallel_instances: usize,
|
pub max_parallel_instances: usize,
|
||||||
|
|
||||||
pub max_tasks_per_instance: usize,
|
pub max_tasks_per_instance: usize,
|
||||||
|
|
||||||
/// VPN rotation configuration
|
#[serde(default = "default_enable_vpn_rotation")]
|
||||||
/// If set to "true", enables automatic VPN rotation between sessions
|
|
||||||
#[serde(default)]
|
|
||||||
pub enable_vpn_rotation: bool,
|
pub enable_vpn_rotation: bool,
|
||||||
|
|
||||||
|
// IMPROVEMENT: Reduzierte Defaults für weniger aggressive Scraping
|
||||||
|
#[serde(default = "default_max_requests_per_session")]
|
||||||
|
pub max_requests_per_session: usize,
|
||||||
|
|
||||||
|
#[serde(default = "default_min_request_interval_ms")]
|
||||||
|
pub min_request_interval_ms: u64,
|
||||||
|
|
||||||
|
#[serde(default = "default_max_retry_attempts")]
|
||||||
|
pub max_retry_attempts: u32,
|
||||||
|
}
|
||||||
|
|
||||||
|
fn default_enable_vpn_rotation() -> bool {
|
||||||
|
false
|
||||||
}
|
}
|
||||||
|
|
||||||
fn default_max_parallel_instances() -> usize {
|
fn default_max_parallel_instances() -> usize {
|
||||||
|
4
|
||||||
|
}
|
||||||
|
|
||||||
|
fn default_max_requests_per_session() -> usize {
|
||||||
10
|
10
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn default_min_request_interval_ms() -> u64 {
|
||||||
|
1200
|
||||||
|
}
|
||||||
|
|
||||||
|
fn default_max_retry_attempts() -> u32 { 3 }
|
||||||
|
|
||||||
impl Default for Config {
|
impl Default for Config {
|
||||||
fn default() -> Self {
|
fn default() -> Self {
|
||||||
Self {
|
Self {
|
||||||
@@ -35,26 +55,19 @@ impl Default for Config {
|
|||||||
economic_lookahead_months: 3,
|
economic_lookahead_months: 3,
|
||||||
max_parallel_instances: default_max_parallel_instances(),
|
max_parallel_instances: default_max_parallel_instances(),
|
||||||
max_tasks_per_instance: 0,
|
max_tasks_per_instance: 0,
|
||||||
|
max_requests_per_session: default_max_requests_per_session(),
|
||||||
|
min_request_interval_ms: default_min_request_interval_ms(),
|
||||||
|
max_retry_attempts: default_max_retry_attempts(),
|
||||||
enable_vpn_rotation: false,
|
enable_vpn_rotation: false,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
impl Config {
|
impl Config {
|
||||||
/// Loads the configuration from environment variables using dotenvy.
|
/// Loads configuration from environment variables using dotenvy.
|
||||||
///
|
|
||||||
/// This function loads a `.env` file if present (via `dotenvy::dotenv()`),
|
|
||||||
/// then retrieves each configuration value from environment variables.
|
|
||||||
/// If a variable is missing, it falls back to the default value.
|
|
||||||
/// Variable names are uppercase with underscores (e.g., ECONOMIC_START_DATE).
|
|
||||||
///
|
|
||||||
/// # Returns
|
|
||||||
/// The loaded Config on success.
|
|
||||||
///
|
|
||||||
/// # Errors
|
|
||||||
/// Returns an error if parsing fails (e.g., invalid integer for lookahead months).
|
|
||||||
pub fn load() -> Result<Self> {
|
pub fn load() -> Result<Self> {
|
||||||
// Load .env file if it exists; ignore if not found (dotenvy::dotenv returns Ok if no file)
|
|
||||||
let _ = dotenvy::dotenv().context("Failed to load .env file (optional)")?;
|
let _ = dotenvy::dotenv().context("Failed to load .env file (optional)")?;
|
||||||
|
|
||||||
let economic_start_date = dotenvy::var("ECONOMIC_START_DATE")
|
let economic_start_date = dotenvy::var("ECONOMIC_START_DATE")
|
||||||
@@ -68,13 +81,14 @@ impl Config {
|
|||||||
.parse()
|
.parse()
|
||||||
.context("Failed to parse ECONOMIC_LOOKAHEAD_MONTHS as u32")?;
|
.context("Failed to parse ECONOMIC_LOOKAHEAD_MONTHS as u32")?;
|
||||||
|
|
||||||
|
// IMPROVEMENT: Reduzierte Defaults
|
||||||
let max_parallel_instances: usize = dotenvy::var("MAX_PARALLEL_INSTANCES")
|
let max_parallel_instances: usize = dotenvy::var("MAX_PARALLEL_INSTANCES")
|
||||||
.unwrap_or_else(|_| "10".to_string())
|
.unwrap_or_else(|_| "4".to_string()) // Geändert von 10
|
||||||
.parse()
|
.parse()
|
||||||
.context("Failed to parse MAX_PARALLEL_INSTANCES as usize")?;
|
.context("Failed to parse MAX_PARALLEL_INSTANCES as usize")?;
|
||||||
|
|
||||||
let max_tasks_per_instance: usize = dotenvy::var("MAX_TASKS_PER_INSTANCE")
|
let max_tasks_per_instance: usize = dotenvy::var("MAX_TASKS_PER_INSTANCE")
|
||||||
.unwrap_or_else(|_| "0".to_string())
|
.unwrap_or_else(|_| "5".to_string()) // Geändert von 0
|
||||||
.parse()
|
.parse()
|
||||||
.context("Failed to parse MAX_TASKS_PER_INSTANCE as usize")?;
|
.context("Failed to parse MAX_TASKS_PER_INSTANCE as usize")?;
|
||||||
|
|
||||||
@@ -83,6 +97,21 @@ impl Config {
|
|||||||
.parse::<bool>()
|
.parse::<bool>()
|
||||||
.context("Failed to parse ENABLE_VPN_ROTATION as bool")?;
|
.context("Failed to parse ENABLE_VPN_ROTATION as bool")?;
|
||||||
|
|
||||||
|
let max_requests_per_session: usize = dotenvy::var("MAX_REQUESTS_PER_SESSION")
|
||||||
|
.unwrap_or_else(|_| "10".to_string()) // Geändert von 25
|
||||||
|
.parse()
|
||||||
|
.context("Failed to parse MAX_REQUESTS_PER_SESSION as usize")?;
|
||||||
|
|
||||||
|
let min_request_interval_ms: u64 = dotenvy::var("MIN_REQUEST_INTERVAL_MS")
|
||||||
|
.unwrap_or_else(|_| "1200".to_string()) // Geändert von 300
|
||||||
|
.parse()
|
||||||
|
.context("Failed to parse MIN_REQUEST_INTERVAL_MS as u64")?;
|
||||||
|
|
||||||
|
let max_retry_attempts: u32 = dotenvy::var("MAX_RETRY_ATTEMPTS")
|
||||||
|
.unwrap_or_else(|_| "3".to_string())
|
||||||
|
.parse()
|
||||||
|
.context("Failed to parse MAX_RETRY_ATTEMPTS as u32")?;
|
||||||
|
|
||||||
Ok(Self {
|
Ok(Self {
|
||||||
economic_start_date,
|
economic_start_date,
|
||||||
corporate_start_date,
|
corporate_start_date,
|
||||||
@@ -90,6 +119,9 @@ impl Config {
|
|||||||
max_parallel_instances,
|
max_parallel_instances,
|
||||||
max_tasks_per_instance,
|
max_tasks_per_instance,
|
||||||
enable_vpn_rotation,
|
enable_vpn_rotation,
|
||||||
|
max_requests_per_session,
|
||||||
|
min_request_interval_ms,
|
||||||
|
max_retry_attempts,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
346
src/corporate/atomic_writer.rs
Normal file
346
src/corporate/atomic_writer.rs
Normal file
@@ -0,0 +1,346 @@
|
|||||||
|
// src/corporate/atomic_writer.rs
|
||||||
|
//
|
||||||
|
// Atomic JSONL writer that prevents partial/corrupted results from being written
|
||||||
|
|
||||||
|
use anyhow::Result;
|
||||||
|
use serde::Serialize;
|
||||||
|
use std::collections::HashMap;
|
||||||
|
use std::path::PathBuf;
|
||||||
|
use std::sync::atomic::{AtomicBool, Ordering};
|
||||||
|
use std::sync::Arc;
|
||||||
|
use tokio::fs::{File, OpenOptions};
|
||||||
|
use tokio::io::AsyncWriteExt;
|
||||||
|
use tokio::sync::mpsc;
|
||||||
|
|
||||||
|
/// Command to write or validate data
|
||||||
|
#[derive(Debug)]
|
||||||
|
pub enum WriteCommand<T> {
|
||||||
|
/// Stage a result for writing (held in memory until committed)
|
||||||
|
Stage { id: String, data: T },
|
||||||
|
|
||||||
|
/// Commit staged result to disk (atomic write)
|
||||||
|
Commit { id: String },
|
||||||
|
|
||||||
|
/// Rollback staged result (discard without writing)
|
||||||
|
Rollback { id: String },
|
||||||
|
|
||||||
|
/// Commit all pending staged results and flush
|
||||||
|
CommitAll,
|
||||||
|
|
||||||
|
/// Shutdown writer gracefully (only commits valid staged results)
|
||||||
|
Shutdown,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Result of a write operation
|
||||||
|
#[derive(Debug)]
|
||||||
|
pub struct WriteResult {
|
||||||
|
pub id: String,
|
||||||
|
pub success: bool,
|
||||||
|
pub error: Option<String>,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Atomic writer that prevents partial results from being written
|
||||||
|
pub struct AtomicJsonlWriter<T> {
|
||||||
|
file: File,
|
||||||
|
staged: HashMap<String, T>,
|
||||||
|
committed_count: usize,
|
||||||
|
rollback_count: usize,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<T: Serialize + Clone> AtomicJsonlWriter<T> {
|
||||||
|
pub async fn new(path: PathBuf) -> Result<Self> {
|
||||||
|
// Ensure parent directory exists
|
||||||
|
if let Some(parent) = path.parent() {
|
||||||
|
tokio::fs::create_dir_all(parent).await?;
|
||||||
|
}
|
||||||
|
|
||||||
|
let file = OpenOptions::new()
|
||||||
|
.create(true)
|
||||||
|
.append(true)
|
||||||
|
.open(&path)
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
crate::util::logger::log_info(&format!(
|
||||||
|
"Atomic writer initialized: {:?}",
|
||||||
|
path
|
||||||
|
)).await;
|
||||||
|
|
||||||
|
Ok(Self {
|
||||||
|
file,
|
||||||
|
staged: HashMap::new(),
|
||||||
|
committed_count: 0,
|
||||||
|
rollback_count: 0,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Stage data for writing (held in memory, not yet written)
|
||||||
|
pub async fn stage(&mut self, id: String, data: T) {
|
||||||
|
crate::util::logger::log_info(&format!(
|
||||||
|
"Staging result for: {} (total staged: {})",
|
||||||
|
id,
|
||||||
|
self.staged.len() + 1
|
||||||
|
)).await;
|
||||||
|
|
||||||
|
self.staged.insert(id, data);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Commit a staged result to disk (atomic write)
|
||||||
|
pub async fn commit(&mut self, id: &str) -> Result<()> {
|
||||||
|
if let Some(data) = self.staged.remove(id) {
|
||||||
|
// Serialize to JSON
|
||||||
|
let json_line = serde_json::to_string(&data)?;
|
||||||
|
|
||||||
|
// Write atomically (single syscall)
|
||||||
|
self.file.write_all(json_line.as_bytes()).await?;
|
||||||
|
self.file.write_all(b"\n").await?;
|
||||||
|
self.file.flush().await?;
|
||||||
|
|
||||||
|
self.committed_count += 1;
|
||||||
|
|
||||||
|
crate::util::logger::log_info(&format!(
|
||||||
|
"✓ Committed result for: {} (total committed: {})",
|
||||||
|
id, self.committed_count
|
||||||
|
)).await;
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
} else {
|
||||||
|
Err(anyhow::anyhow!("No staged result found for id: {}", id))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Rollback a staged result (discard without writing)
|
||||||
|
pub async fn rollback(&mut self, id: &str) {
|
||||||
|
if self.staged.remove(id).is_some() {
|
||||||
|
self.rollback_count += 1;
|
||||||
|
|
||||||
|
crate::util::logger::log_warn(&format!(
|
||||||
|
"⚠ Rolled back result for: {} (total rollbacks: {})",
|
||||||
|
id, self.rollback_count
|
||||||
|
)).await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Commit all staged results
|
||||||
|
pub async fn commit_all(&mut self) -> Result<usize> {
|
||||||
|
let ids: Vec<String> = self.staged.keys().cloned().collect();
|
||||||
|
let mut committed = 0;
|
||||||
|
|
||||||
|
for id in ids {
|
||||||
|
if let Ok(()) = self.commit(&id).await {
|
||||||
|
committed += 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(committed)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Rollback all staged results (discard everything)
|
||||||
|
pub async fn rollback_all(&mut self) -> usize {
|
||||||
|
let count = self.staged.len();
|
||||||
|
self.staged.clear();
|
||||||
|
self.rollback_count += count;
|
||||||
|
|
||||||
|
crate::util::logger::log_warn(&format!(
|
||||||
|
"⚠ Rolled back all {} staged results",
|
||||||
|
count
|
||||||
|
)).await;
|
||||||
|
|
||||||
|
count
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get statistics
|
||||||
|
pub fn stats(&self) -> WriterStats {
|
||||||
|
WriterStats {
|
||||||
|
staged_count: self.staged.len(),
|
||||||
|
committed_count: self.committed_count,
|
||||||
|
rollback_count: self.rollback_count,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone)]
|
||||||
|
pub struct WriterStats {
|
||||||
|
pub staged_count: usize,
|
||||||
|
pub committed_count: usize,
|
||||||
|
pub rollback_count: usize,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Managed writer service that runs in its own task
|
||||||
|
pub struct AtomicWriterService<T> {
|
||||||
|
rx: mpsc::UnboundedReceiver<WriteCommand<T>>,
|
||||||
|
writer: AtomicJsonlWriter<T>,
|
||||||
|
shutdown_flag: Arc<AtomicBool>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<T: Serialize + Clone> AtomicWriterService<T> {
|
||||||
|
pub async fn new(
|
||||||
|
path: PathBuf,
|
||||||
|
rx: mpsc::UnboundedReceiver<WriteCommand<T>>,
|
||||||
|
shutdown_flag: Arc<AtomicBool>,
|
||||||
|
) -> Result<Self> {
|
||||||
|
let writer = AtomicJsonlWriter::new(path).await?;
|
||||||
|
|
||||||
|
Ok(Self {
|
||||||
|
rx,
|
||||||
|
writer,
|
||||||
|
shutdown_flag,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Main service loop
|
||||||
|
pub async fn run(mut self) {
|
||||||
|
crate::util::logger::log_info("Atomic writer service started").await;
|
||||||
|
|
||||||
|
while let Some(cmd) = self.rx.recv().await {
|
||||||
|
// Check for shutdown flag
|
||||||
|
if self.shutdown_flag.load(Ordering::SeqCst) {
|
||||||
|
crate::util::logger::log_warn(
|
||||||
|
"Shutdown detected - processing only Commit/Rollback commands"
|
||||||
|
).await;
|
||||||
|
|
||||||
|
// Only process commit/rollback commands during shutdown
|
||||||
|
match cmd {
|
||||||
|
WriteCommand::Commit { id } => {
|
||||||
|
if let Err(e) = self.writer.commit(&id).await {
|
||||||
|
crate::util::logger::log_error(&format!(
|
||||||
|
"Failed to commit {}: {}",
|
||||||
|
id, e
|
||||||
|
)).await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
WriteCommand::Rollback { id } => {
|
||||||
|
self.writer.rollback(&id).await;
|
||||||
|
}
|
||||||
|
WriteCommand::CommitAll => {
|
||||||
|
match self.writer.commit_all().await {
|
||||||
|
Ok(count) => {
|
||||||
|
crate::util::logger::log_info(&format!(
|
||||||
|
"Committed {} results during shutdown",
|
||||||
|
count
|
||||||
|
)).await;
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
crate::util::logger::log_error(&format!(
|
||||||
|
"Failed to commit all: {}",
|
||||||
|
e
|
||||||
|
)).await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
WriteCommand::Shutdown => break,
|
||||||
|
_ => {
|
||||||
|
// Ignore Stage commands during shutdown
|
||||||
|
crate::util::logger::log_warn(
|
||||||
|
"Ignoring new Stage command during shutdown"
|
||||||
|
).await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Normal operation
|
||||||
|
match cmd {
|
||||||
|
WriteCommand::Stage { id, data } => {
|
||||||
|
self.writer.stage(id, data).await;
|
||||||
|
}
|
||||||
|
WriteCommand::Commit { id } => {
|
||||||
|
if let Err(e) = self.writer.commit(&id).await {
|
||||||
|
crate::util::logger::log_error(&format!(
|
||||||
|
"Failed to commit {}: {}",
|
||||||
|
id, e
|
||||||
|
)).await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
WriteCommand::Rollback { id } => {
|
||||||
|
self.writer.rollback(&id).await;
|
||||||
|
}
|
||||||
|
WriteCommand::CommitAll => {
|
||||||
|
match self.writer.commit_all().await {
|
||||||
|
Ok(count) => {
|
||||||
|
crate::util::logger::log_info(&format!(
|
||||||
|
"Committed all {} staged results",
|
||||||
|
count
|
||||||
|
)).await;
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
crate::util::logger::log_error(&format!(
|
||||||
|
"Failed to commit all: {}",
|
||||||
|
e
|
||||||
|
)).await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
WriteCommand::Shutdown => break,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Final shutdown - rollback any remaining staged items
|
||||||
|
let stats = self.writer.stats();
|
||||||
|
if stats.staged_count > 0 {
|
||||||
|
crate::util::logger::log_warn(&format!(
|
||||||
|
"⚠ Shutdown with {} uncommitted results - rolling back",
|
||||||
|
stats.staged_count
|
||||||
|
)).await;
|
||||||
|
|
||||||
|
self.writer.rollback_all().await;
|
||||||
|
}
|
||||||
|
|
||||||
|
crate::util::logger::log_info(&format!(
|
||||||
|
"Atomic writer service stopped. Final stats: {} committed, {} rolled back",
|
||||||
|
stats.committed_count,
|
||||||
|
stats.rollback_count
|
||||||
|
)).await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Handle for sending write commands
|
||||||
|
#[derive(Clone)]
|
||||||
|
pub struct AtomicWriterHandle<T> {
|
||||||
|
tx: mpsc::UnboundedSender<WriteCommand<T>>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<T> AtomicWriterHandle<T> {
|
||||||
|
pub fn new(tx: mpsc::UnboundedSender<WriteCommand<T>>) -> Self {
|
||||||
|
Self { tx }
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Stage data for writing (does not write immediately)
|
||||||
|
pub fn stage(&self, id: String, data: T) {
|
||||||
|
let _ = self.tx.send(WriteCommand::Stage { id, data });
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Commit staged data to disk
|
||||||
|
pub fn commit(&self, id: String) {
|
||||||
|
let _ = self.tx.send(WriteCommand::Commit { id });
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Rollback staged data (discard)
|
||||||
|
pub fn rollback(&self, id: String) {
|
||||||
|
let _ = self.tx.send(WriteCommand::Rollback { id });
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Commit all staged data
|
||||||
|
pub fn commit_all(&self) {
|
||||||
|
let _ = self.tx.send(WriteCommand::CommitAll);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Shutdown writer gracefully
|
||||||
|
pub fn shutdown(&self) {
|
||||||
|
let _ = self.tx.send(WriteCommand::Shutdown);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Create atomic writer service
|
||||||
|
pub async fn create_atomic_writer<T: Serialize + Clone + Send + 'static>(
|
||||||
|
path: PathBuf,
|
||||||
|
shutdown_flag: Arc<AtomicBool>,
|
||||||
|
) -> Result<(AtomicWriterHandle<T>, tokio::task::JoinHandle<()>)> {
|
||||||
|
let (tx, rx) = mpsc::unbounded_channel();
|
||||||
|
|
||||||
|
let service = AtomicWriterService::new(path, rx, shutdown_flag).await?;
|
||||||
|
let handle = tokio::spawn(async move {
|
||||||
|
service.run().await;
|
||||||
|
});
|
||||||
|
|
||||||
|
Ok((AtomicWriterHandle::new(tx), handle))
|
||||||
|
}
|
||||||
@@ -2,6 +2,8 @@
|
|||||||
use super::types::*;
|
use super::types::*;
|
||||||
use chrono::{Local, NaiveDate};
|
use chrono::{Local, NaiveDate};
|
||||||
use std::collections::{HashMap, HashSet};
|
use std::collections::{HashMap, HashSet};
|
||||||
|
use rand::rngs::StdRng;
|
||||||
|
use rand::prelude::{Rng, SeedableRng, IndexedRandom};
|
||||||
|
|
||||||
pub fn event_key(e: &CompanyEvent) -> String {
|
pub fn event_key(e: &CompanyEvent) -> String {
|
||||||
format!("{}|{}|{}", e.ticker, e.date, e.time)
|
format!("{}|{}|{}", e.ticker, e.date, e.time)
|
||||||
@@ -68,3 +70,15 @@ pub fn parse_yahoo_date(s: &str) -> anyhow::Result<NaiveDate> {
|
|||||||
.or_else(|_| NaiveDate::parse_from_str(s, "%b %d, %Y"))
|
.or_else(|_| NaiveDate::parse_from_str(s, "%b %d, %Y"))
|
||||||
.map_err(|_| anyhow::anyhow!("Bad date: {s}"))
|
.map_err(|_| anyhow::anyhow!("Bad date: {s}"))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Send-safe random range
|
||||||
|
pub fn random_range(min: u64, max: u64) -> u64 {
|
||||||
|
let mut rng = StdRng::from_rng(&mut rand::rng());
|
||||||
|
rng.gen_range(min..max)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Send-safe random choice
|
||||||
|
pub fn choose_random<T: Clone>(items: &[T]) -> T {
|
||||||
|
let mut rng = StdRng::from_rng(&mut rand::rng());
|
||||||
|
items.choose(&mut rng).unwrap().clone()
|
||||||
|
}
|
||||||
@@ -8,5 +8,8 @@ pub mod aggregation;
|
|||||||
pub mod fx;
|
pub mod fx;
|
||||||
pub mod openfigi;
|
pub mod openfigi;
|
||||||
pub mod yahoo;
|
pub mod yahoo;
|
||||||
|
pub mod update_parallel;
|
||||||
|
pub mod page_validation;
|
||||||
|
pub mod atomic_writer;
|
||||||
|
|
||||||
pub use update::run_full_update;
|
pub use update::run_full_update;
|
||||||
180
src/corporate/page_validation.rs
Normal file
180
src/corporate/page_validation.rs
Normal file
@@ -0,0 +1,180 @@
|
|||||||
|
// src/corporate/page_validation.rs
|
||||||
|
//
|
||||||
|
// Utilities to ensure page state is correct before extraction
|
||||||
|
|
||||||
|
use anyhow::{anyhow, Result};
|
||||||
|
use fantoccini::Client;
|
||||||
|
use tokio::time::{sleep, Duration};
|
||||||
|
|
||||||
|
/// Validates that the browser navigated to the expected URL
|
||||||
|
///
|
||||||
|
/// This prevents extracting data from a stale page when navigation fails silently
|
||||||
|
pub async fn verify_navigation(
|
||||||
|
client: &Client,
|
||||||
|
expected_url_fragment: &str,
|
||||||
|
max_attempts: u32,
|
||||||
|
) -> Result<()> {
|
||||||
|
for attempt in 1..=max_attempts {
|
||||||
|
let current_url = client.current_url().await?;
|
||||||
|
let current = current_url.as_str();
|
||||||
|
|
||||||
|
if current.contains(expected_url_fragment) {
|
||||||
|
crate::util::logger::log_info(&format!(
|
||||||
|
"✓ Navigation verified: {} (attempt {})",
|
||||||
|
current, attempt
|
||||||
|
)).await;
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
|
||||||
|
if attempt < max_attempts {
|
||||||
|
crate::util::logger::log_warn(&format!(
|
||||||
|
"Navigation mismatch (attempt {}): expected '{}', got '{}'. Retrying...",
|
||||||
|
attempt, expected_url_fragment, current
|
||||||
|
)).await;
|
||||||
|
sleep(Duration::from_millis(500)).await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let current_url = client.current_url().await?;
|
||||||
|
Err(anyhow!(
|
||||||
|
"Navigation verification failed: expected URL containing '{}', but got '{}'",
|
||||||
|
expected_url_fragment,
|
||||||
|
current_url.as_str()
|
||||||
|
))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Clears browser state by navigating to a blank page
|
||||||
|
///
|
||||||
|
/// Use this when a navigation fails or times out to ensure clean slate
|
||||||
|
pub async fn clear_browser_state(client: &Client) -> Result<()> {
|
||||||
|
crate::util::logger::log_info("Clearing browser state with about:blank").await;
|
||||||
|
|
||||||
|
// Navigate to blank page to clear any stale content
|
||||||
|
client.goto("about:blank").await?;
|
||||||
|
|
||||||
|
// Brief wait to ensure page clears
|
||||||
|
sleep(Duration::from_millis(200)).await;
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Validates that expected content exists on the page before extraction
|
||||||
|
///
|
||||||
|
/// This adds an extra safety check that the page actually loaded
|
||||||
|
pub async fn verify_page_content(
|
||||||
|
client: &Client,
|
||||||
|
content_checks: Vec<ContentCheck>,
|
||||||
|
) -> Result<()> {
|
||||||
|
for check in content_checks {
|
||||||
|
match check {
|
||||||
|
ContentCheck::ElementExists(selector) => {
|
||||||
|
let exists: bool = client
|
||||||
|
.execute(
|
||||||
|
&format!(
|
||||||
|
"return !!document.querySelector('{}');",
|
||||||
|
selector.replace("'", "\\'")
|
||||||
|
),
|
||||||
|
vec![],
|
||||||
|
)
|
||||||
|
.await?
|
||||||
|
.as_bool()
|
||||||
|
.unwrap_or(false);
|
||||||
|
|
||||||
|
if !exists {
|
||||||
|
return Err(anyhow!(
|
||||||
|
"Expected element '{}' not found on page",
|
||||||
|
selector
|
||||||
|
));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
ContentCheck::TextContains(text) => {
|
||||||
|
let page_text: String = client
|
||||||
|
.execute("return document.body.innerText;", vec![])
|
||||||
|
.await?
|
||||||
|
.as_str()
|
||||||
|
.unwrap_or("")
|
||||||
|
.to_string();
|
||||||
|
|
||||||
|
if !page_text.contains(&text) {
|
||||||
|
return Err(anyhow!(
|
||||||
|
"Expected text '{}' not found on page",
|
||||||
|
text
|
||||||
|
));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone)]
|
||||||
|
pub enum ContentCheck {
|
||||||
|
/// Verify that a CSS selector exists
|
||||||
|
ElementExists(String),
|
||||||
|
/// Verify that page body contains text
|
||||||
|
TextContains(String),
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Safe navigation wrapper that validates and clears state on failure
|
||||||
|
pub async fn navigate_with_validation(
|
||||||
|
client: &Client,
|
||||||
|
url: &str,
|
||||||
|
expected_url_fragment: &str,
|
||||||
|
timeout_secs: u64,
|
||||||
|
) -> Result<()> {
|
||||||
|
use tokio::time::timeout;
|
||||||
|
|
||||||
|
// Attempt navigation with timeout
|
||||||
|
let nav_result = timeout(
|
||||||
|
Duration::from_secs(timeout_secs),
|
||||||
|
client.goto(url)
|
||||||
|
).await;
|
||||||
|
|
||||||
|
match nav_result {
|
||||||
|
Ok(Ok(_)) => {
|
||||||
|
// Navigation succeeded, verify we're on correct page
|
||||||
|
verify_navigation(client, expected_url_fragment, 3).await?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
Ok(Err(e)) => {
|
||||||
|
// Navigation failed - clear state before returning error
|
||||||
|
crate::util::logger::log_error(&format!(
|
||||||
|
"Navigation failed: {}. Clearing browser state...",
|
||||||
|
e
|
||||||
|
)).await;
|
||||||
|
clear_browser_state(client).await.ok(); // Best effort
|
||||||
|
Err(anyhow!("Navigation failed: {}", e))
|
||||||
|
}
|
||||||
|
Err(_) => {
|
||||||
|
// Navigation timed out - clear state before returning error
|
||||||
|
crate::util::logger::log_error(&format!(
|
||||||
|
"Navigation timeout after {}s. Clearing browser state...",
|
||||||
|
timeout_secs
|
||||||
|
)).await;
|
||||||
|
clear_browser_state(client).await.ok(); // Best effort
|
||||||
|
Err(anyhow!("Navigation timeout"))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
mod tests {
|
||||||
|
use super::*;
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_content_check_variants() {
|
||||||
|
let check1 = ContentCheck::ElementExists("table".to_string());
|
||||||
|
let check2 = ContentCheck::TextContains("Yahoo Finance".to_string());
|
||||||
|
|
||||||
|
match check1 {
|
||||||
|
ContentCheck::ElementExists(sel) => assert_eq!(sel, "table"),
|
||||||
|
_ => panic!("Wrong variant"),
|
||||||
|
}
|
||||||
|
|
||||||
|
match check2 {
|
||||||
|
ContentCheck::TextContains(text) => assert_eq!(text, "Yahoo Finance"),
|
||||||
|
_ => panic!("Wrong variant"),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -1,7 +1,7 @@
|
|||||||
// src/corporate/update.rs - ABORT-SAFE VERSION WITH JSONL LOG
|
// src/corporate/update.rs - UPDATED WITH DATA INTEGRITY FIXES
|
||||||
|
|
||||||
use super::{scraper::*, storage::*, helpers::*, types::*, openfigi::*, yahoo::*};
|
use super::{scraper::*, storage::*, helpers::*, types::*, openfigi::*, yahoo::*};
|
||||||
use crate::config::Config;
|
use crate::config::Config;
|
||||||
|
use crate::corporate::update_parallel::build_companies_jsonl_streaming_parallel;
|
||||||
use crate::util::directories::DataPaths;
|
use crate::util::directories::DataPaths;
|
||||||
use crate::util::logger;
|
use crate::util::logger;
|
||||||
use crate::scraper::webdriver::ChromeDriverPool;
|
use crate::scraper::webdriver::ChromeDriverPool;
|
||||||
@@ -11,12 +11,13 @@ use std::collections::HashMap;
|
|||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
use std::sync::atomic::{AtomicBool, Ordering};
|
use std::sync::atomic::{AtomicBool, Ordering};
|
||||||
|
|
||||||
|
/// UPDATED: Main corporate update entry point with shutdown awareness
|
||||||
pub async fn run_full_update(
|
pub async fn run_full_update(
|
||||||
_config: &Config,
|
_config: &Config,
|
||||||
pool: &Arc<ChromeDriverPool>,
|
pool: &Arc<ChromeDriverPool>,
|
||||||
shutdown_flag: &Arc<AtomicBool>,
|
shutdown_flag: &Arc<AtomicBool>,
|
||||||
) -> anyhow::Result<()> {
|
) -> anyhow::Result<()> {
|
||||||
logger::log_info("=== Corporate Update (STREAMING MODE) ===").await;
|
logger::log_info("=== Corporate Update (STREAMING MODE WITH DATA INTEGRITY) ===").await;
|
||||||
|
|
||||||
let paths = DataPaths::new(".")?;
|
let paths = DataPaths::new(".")?;
|
||||||
|
|
||||||
@@ -33,6 +34,7 @@ pub async fn run_full_update(
|
|||||||
};
|
};
|
||||||
|
|
||||||
if shutdown_flag.load(Ordering::SeqCst) {
|
if shutdown_flag.load(Ordering::SeqCst) {
|
||||||
|
logger::log_warn("Shutdown detected after GLEIF download").await;
|
||||||
return Ok(());
|
return Ok(());
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -41,6 +43,7 @@ pub async fn run_full_update(
|
|||||||
logger::log_info(" ✓ OpenFIGI metadata loaded").await;
|
logger::log_info(" ✓ OpenFIGI metadata loaded").await;
|
||||||
|
|
||||||
if shutdown_flag.load(Ordering::SeqCst) {
|
if shutdown_flag.load(Ordering::SeqCst) {
|
||||||
|
logger::log_warn("Shutdown detected after OpenFIGI load").await;
|
||||||
return Ok(());
|
return Ok(());
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -54,6 +57,7 @@ pub async fn run_full_update(
|
|||||||
}
|
}
|
||||||
|
|
||||||
if shutdown_flag.load(Ordering::SeqCst) {
|
if shutdown_flag.load(Ordering::SeqCst) {
|
||||||
|
logger::log_warn("Shutdown detected after LEI-FIGI mapping").await;
|
||||||
return Ok(());
|
return Ok(());
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -69,36 +73,45 @@ pub async fn run_full_update(
|
|||||||
}
|
}
|
||||||
|
|
||||||
if shutdown_flag.load(Ordering::SeqCst) {
|
if shutdown_flag.load(Ordering::SeqCst) {
|
||||||
|
logger::log_warn("Shutdown detected after securities map build").await;
|
||||||
return Ok(());
|
return Ok(());
|
||||||
}
|
}
|
||||||
|
|
||||||
logger::log_info("Step 5: Building companies.jsonl (streaming with abort-safe persistence)...").await;
|
logger::log_info("Step 5: Building companies.jsonl with parallel processing and validation...").await;
|
||||||
let count = build_companies_jsonl_streaming(&paths, pool, shutdown_flag).await?;
|
let count = build_companies_jsonl_streaming_parallel(&paths, pool, shutdown_flag).await?;
|
||||||
logger::log_info(&format!(" ✓ Saved {} companies", count)).await;
|
logger::log_info(&format!(" ✓ Saved {} companies", count)).await;
|
||||||
|
|
||||||
if !shutdown_flag.load(Ordering::SeqCst) {
|
if !shutdown_flag.load(Ordering::SeqCst) {
|
||||||
logger::log_info("Step 6: Processing events (using index)...").await;
|
logger::log_info("Step 6: Processing events (using index)...").await;
|
||||||
let _event_index = build_event_index(&paths).await?;
|
let _event_index = build_event_index(&paths).await?;
|
||||||
logger::log_info(" ✓ Event index built").await;
|
logger::log_info(" ✓ Event index built").await;
|
||||||
|
} else {
|
||||||
|
logger::log_warn("Shutdown detected, skipping event index build").await;
|
||||||
}
|
}
|
||||||
|
|
||||||
logger::log_info("✓ Corporate update complete").await;
|
logger::log_info("✓ Corporate update complete").await;
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Abort-safe incremental JSONL persistence with atomic checkpoints
|
/// UPDATED: Serial version with validation (kept for compatibility/debugging)
|
||||||
///
|
///
|
||||||
/// Implements the data_updating_rule.md specification:
|
/// This is the non-parallel version that processes companies sequentially.
|
||||||
/// - Append-only JSONL log for all updates
|
/// Updated with same validation and shutdown checks as parallel version.
|
||||||
/// - fsync after each write batch
|
///
|
||||||
/// - Atomic checkpoints via temp file + rename
|
/// Use this for:
|
||||||
/// - Crash recovery by loading checkpoint + replaying log
|
/// - Debugging issues with specific companies
|
||||||
/// - Partial lines ignored during recovery
|
/// - Environments where parallel processing isn't desired
|
||||||
async fn build_companies_jsonl_streaming(
|
/// - Testing validation logic without concurrency complexity
|
||||||
|
async fn build_companies_jsonl_streaming_serial(
|
||||||
paths: &DataPaths,
|
paths: &DataPaths,
|
||||||
pool: &Arc<ChromeDriverPool>,
|
pool: &Arc<ChromeDriverPool>,
|
||||||
shutdown_flag: &Arc<AtomicBool>,
|
shutdown_flag: &Arc<AtomicBool>,
|
||||||
) -> anyhow::Result<usize> {
|
) -> anyhow::Result<usize> {
|
||||||
|
// Configuration constants
|
||||||
|
const CHECKPOINT_INTERVAL: usize = 50;
|
||||||
|
const FSYNC_BATCH_SIZE: usize = 10;
|
||||||
|
const FSYNC_INTERVAL_SECS: u64 = 10;
|
||||||
|
|
||||||
let path = DataPaths::new(".")?;
|
let path = DataPaths::new(".")?;
|
||||||
let corporate_path = path.data_dir().join("corporate").join("by_name");
|
let corporate_path = path.data_dir().join("corporate").join("by_name");
|
||||||
let securities_path = corporate_path.join("common_stocks.json");
|
let securities_path = corporate_path.join("common_stocks.json");
|
||||||
@@ -118,51 +131,42 @@ async fn build_companies_jsonl_streaming(
|
|||||||
tokio::fs::create_dir_all(parent).await?;
|
tokio::fs::create_dir_all(parent).await?;
|
||||||
}
|
}
|
||||||
|
|
||||||
// === RECOVERY PHASE 1: Load last checkpoint ===
|
// === RECOVERY PHASE: Load checkpoint + replay log ===
|
||||||
let mut existing_companies: HashMap<String, CompanyCrossPlatformInfo> = HashMap::new();
|
let mut existing_companies: HashMap<String, CompanyCrossPlatformInfo> = HashMap::new();
|
||||||
let mut processed_names: std::collections::HashSet<String> = std::collections::HashSet::new();
|
let mut processed_names: std::collections::HashSet<String> = std::collections::HashSet::new();
|
||||||
|
|
||||||
if companies_path.exists() {
|
if companies_path.exists() {
|
||||||
logger::log_info("Loading checkpoint from companies.jsonl...").await;
|
logger::log_info("Loading checkpoint from companies.jsonl...").await;
|
||||||
let existing_content = tokio::fs::read_to_string(&companies_path).await?;
|
let existing_content = tokio::fs::read_to_string(&companies_path).await?;
|
||||||
|
|
||||||
for line in existing_content.lines() {
|
for line in existing_content.lines() {
|
||||||
if line.trim().is_empty() {
|
if line.trim().is_empty() {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
// Only process complete lines (ending with proper JSON closing brace)
|
|
||||||
// This ensures we don't process partial writes from crashed processes
|
|
||||||
if !line.ends_with('}') {
|
|
||||||
logger::log_warn(&format!("Skipping incomplete checkpoint line: {}", &line[..line.len().min(50)])).await;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
match serde_json::from_str::<CompanyCrossPlatformInfo>(line) {
|
match serde_json::from_str::<CompanyCrossPlatformInfo>(line) {
|
||||||
Ok(company) => {
|
Ok(company) => {
|
||||||
processed_names.insert(company.name.clone());
|
processed_names.insert(company.name.clone());
|
||||||
existing_companies.insert(company.name.clone(), company);
|
existing_companies.insert(company.name.clone(), company);
|
||||||
}
|
}
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
logger::log_warn(&format!("Failed to parse checkpoint line: {}", e)).await;
|
logger::log_warn(&format!("Skipping invalid checkpoint line: {}", e)).await;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
logger::log_info(&format!("Loaded checkpoint with {} companies", existing_companies.len())).await;
|
logger::log_info(&format!("Loaded checkpoint with {} companies", existing_companies.len())).await;
|
||||||
}
|
}
|
||||||
|
|
||||||
// === RECOVERY PHASE 2: Replay log after checkpoint ===
|
|
||||||
if log_path.exists() {
|
if log_path.exists() {
|
||||||
logger::log_info("Replaying update log...").await;
|
logger::log_info("Replaying update log...").await;
|
||||||
let log_content = tokio::fs::read_to_string(&log_path).await?;
|
let log_content = tokio::fs::read_to_string(&log_path).await?;
|
||||||
let mut replayed = 0;
|
let mut replayed = 0;
|
||||||
|
|
||||||
for line in log_content.lines() {
|
for line in log_content.lines() {
|
||||||
if line.trim().is_empty() {
|
if line.trim().is_empty() {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
// Only replay complete lines (crash-safe: incomplete lines are ignored)
|
|
||||||
// A line is considered complete only if it ends with '\n' and valid JSON
|
|
||||||
if !line.ends_with('}') {
|
|
||||||
logger::log_warn(&format!("Skipping incomplete log line: {}", &line[..line.len().min(50)])).await;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
match serde_json::from_str::<CompanyCrossPlatformInfo>(line) {
|
match serde_json::from_str::<CompanyCrossPlatformInfo>(line) {
|
||||||
Ok(company) => {
|
Ok(company) => {
|
||||||
processed_names.insert(company.name.clone());
|
processed_names.insert(company.name.clone());
|
||||||
@@ -170,7 +174,7 @@ async fn build_companies_jsonl_streaming(
|
|||||||
replayed += 1;
|
replayed += 1;
|
||||||
}
|
}
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
logger::log_warn(&format!("Failed to parse log line: {}", e)).await;
|
logger::log_warn(&format!("Skipping invalid log line: {}", e)).await;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -179,135 +183,67 @@ async fn build_companies_jsonl_streaming(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// === APPEND-ONLY LOG: Open in append mode with O_APPEND semantics ===
|
// === OPEN LOG FILE ===
|
||||||
use tokio::fs::OpenOptions;
|
use tokio::fs::OpenOptions;
|
||||||
|
use tokio::io::AsyncWriteExt;
|
||||||
|
|
||||||
let mut log_file = OpenOptions::new()
|
let mut log_file = OpenOptions::new()
|
||||||
.create(true)
|
.create(true)
|
||||||
.append(true) // O_APPEND - atomic append operations
|
.append(true)
|
||||||
.open(&log_path)
|
.open(&log_path)
|
||||||
.await?;
|
.await?;
|
||||||
|
|
||||||
let mut count = existing_companies.len();
|
let mut writes_since_fsync = 0;
|
||||||
let mut updated_count = 0;
|
let mut last_fsync = std::time::Instant::now();
|
||||||
let mut new_count = 0;
|
|
||||||
let checkpoint_interval = 50; // Create atomic checkpoint every 50 updates
|
|
||||||
let mut updates_since_checkpoint = 0;
|
let mut updates_since_checkpoint = 0;
|
||||||
|
let mut count = 0;
|
||||||
|
let mut new_count = 0;
|
||||||
|
let mut updated_count = 0;
|
||||||
|
|
||||||
use tokio::io::AsyncWriteExt;
|
logger::log_info(&format!("Processing {} companies sequentially...", securities.len())).await;
|
||||||
|
|
||||||
for (name, company_info) in securities.iter() {
|
// === PROCESS COMPANIES SEQUENTIALLY ===
|
||||||
|
for (name, company_info) in securities.clone() {
|
||||||
|
// Check shutdown before each company
|
||||||
if shutdown_flag.load(Ordering::SeqCst) {
|
if shutdown_flag.load(Ordering::SeqCst) {
|
||||||
logger::log_info("Shutdown requested - stopping company processing").await;
|
logger::log_warn(&format!(
|
||||||
|
"Shutdown detected at company: {} (progress: {}/{})",
|
||||||
|
name, count, count + securities.len()
|
||||||
|
)).await;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Skip if already processed (from checkpoint or log replay)
|
let existing_entry = existing_companies.get(&name).cloned();
|
||||||
if processed_names.contains(name) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
let existing_entry = existing_companies.get(name).cloned();
|
|
||||||
let is_update = existing_entry.is_some();
|
let is_update = existing_entry.is_some();
|
||||||
|
|
||||||
let mut isin_tickers_map: HashMap<String, Vec<String>> =
|
// Process company with validation
|
||||||
existing_entry
|
match process_single_company_serial(
|
||||||
.as_ref()
|
name.clone(),
|
||||||
.map(|e| e.isin_tickers_map.clone())
|
company_info,
|
||||||
.unwrap_or_default();
|
existing_entry,
|
||||||
|
pool,
|
||||||
let mut sector = existing_entry.as_ref().and_then(|e| e.sector.clone());
|
shutdown_flag,
|
||||||
let mut exchange = existing_entry.as_ref().and_then(|e| e.exchange.clone());
|
).await {
|
||||||
|
Ok(Some(company_entry)) => {
|
||||||
let mut unique_isin_ticker_pairs: HashMap<String, Vec<String>> = HashMap::new();
|
// Write to log
|
||||||
|
|
||||||
for figi_infos in company_info.securities.values() {
|
|
||||||
for figi_info in figi_infos {
|
|
||||||
if !figi_info.isin.is_empty() {
|
|
||||||
let tickers = unique_isin_ticker_pairs
|
|
||||||
.entry(figi_info.isin.clone())
|
|
||||||
.or_insert_with(Vec::new);
|
|
||||||
|
|
||||||
if !figi_info.ticker.is_empty() && !tickers.contains(&figi_info.ticker) {
|
|
||||||
tickers.push(figi_info.ticker.clone());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
for (isin, figi_tickers) in unique_isin_ticker_pairs {
|
|
||||||
if shutdown_flag.load(Ordering::SeqCst) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
let tickers = isin_tickers_map
|
|
||||||
.entry(isin.clone())
|
|
||||||
.or_insert_with(Vec::new);
|
|
||||||
|
|
||||||
for figi_ticker in figi_tickers {
|
|
||||||
if !tickers.contains(&figi_ticker) {
|
|
||||||
tickers.push(figi_ticker);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
let has_yahoo_ticker = tickers.iter().any(|t| t.starts_with("YAHOO:"));
|
|
||||||
|
|
||||||
if !has_yahoo_ticker && !shutdown_flag.load(Ordering::SeqCst) {
|
|
||||||
logger::log_info(&format!("Fetching Yahoo details for {} (ISIN: {})", name, isin)).await;
|
|
||||||
|
|
||||||
match scrape_company_details_by_isin(pool, &isin).await {
|
|
||||||
Ok(Some(details)) => {
|
|
||||||
logger::log_info(&format!("✓ Found Yahoo ticker {} for ISIN {}", details.ticker, isin)).await;
|
|
||||||
|
|
||||||
tickers.push(format!("YAHOO:{}", details.ticker));
|
|
||||||
|
|
||||||
if sector.is_none() && details.sector.is_some() {
|
|
||||||
sector = details.sector.clone();
|
|
||||||
logger::log_info(&format!(" Sector: {}", details.sector.as_ref().unwrap())).await;
|
|
||||||
}
|
|
||||||
|
|
||||||
if exchange.is_none() && details.exchange.is_some() {
|
|
||||||
exchange = details.exchange.clone();
|
|
||||||
logger::log_info(&format!(" Exchange: {}", details.exchange.as_ref().unwrap())).await;
|
|
||||||
}
|
|
||||||
},
|
|
||||||
Ok(None) => {
|
|
||||||
logger::log_warn(&format!("◯ No search results for ISIN {}", isin)).await;
|
|
||||||
tickers.push("YAHOO:NO_RESULTS".to_string());
|
|
||||||
},
|
|
||||||
Err(e) => {
|
|
||||||
if shutdown_flag.load(Ordering::SeqCst) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
logger::log_warn(&format!("✗ Yahoo lookup error for ISIN {}: {}", isin, e)).await;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if shutdown_flag.load(Ordering::SeqCst) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
if !isin_tickers_map.is_empty() {
|
|
||||||
let company_entry = CompanyCrossPlatformInfo {
|
|
||||||
name: name.clone(),
|
|
||||||
isin_tickers_map,
|
|
||||||
sector,
|
|
||||||
exchange,
|
|
||||||
};
|
|
||||||
|
|
||||||
// === APPEND-ONLY: Write single-line JSON with fsync ===
|
|
||||||
// This guarantees the line is either fully written or not at all
|
|
||||||
let line = serde_json::to_string(&company_entry)?;
|
let line = serde_json::to_string(&company_entry)?;
|
||||||
log_file.write_all(line.as_bytes()).await?;
|
log_file.write_all(line.as_bytes()).await?;
|
||||||
log_file.write_all(b"\n").await?;
|
log_file.write_all(b"\n").await?;
|
||||||
|
|
||||||
|
writes_since_fsync += 1;
|
||||||
|
|
||||||
|
// Batched + time-based fsync
|
||||||
|
let should_fsync = writes_since_fsync >= FSYNC_BATCH_SIZE
|
||||||
|
|| last_fsync.elapsed().as_secs() >= FSYNC_INTERVAL_SECS;
|
||||||
|
|
||||||
|
if should_fsync {
|
||||||
log_file.flush().await?;
|
log_file.flush().await?;
|
||||||
|
|
||||||
// Critical: fsync to ensure durability before considering write successful
|
|
||||||
// This prevents data loss on power failure or kernel panic
|
|
||||||
log_file.sync_data().await?;
|
log_file.sync_data().await?;
|
||||||
|
writes_since_fsync = 0;
|
||||||
|
last_fsync = std::time::Instant::now();
|
||||||
|
}
|
||||||
|
|
||||||
// Update in-memory state ONLY after successful fsync
|
// Update in-memory state
|
||||||
processed_names.insert(name.clone());
|
processed_names.insert(name.clone());
|
||||||
existing_companies.insert(name.clone(), company_entry);
|
existing_companies.insert(name.clone(), company_entry);
|
||||||
|
|
||||||
@@ -320,15 +256,20 @@ async fn build_companies_jsonl_streaming(
|
|||||||
new_count += 1;
|
new_count += 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
// === ATOMIC CHECKPOINT: Periodically create checkpoint ===
|
// Periodic checkpoint
|
||||||
// This reduces recovery time by snapshotting current state
|
if updates_since_checkpoint >= CHECKPOINT_INTERVAL {
|
||||||
if updates_since_checkpoint >= checkpoint_interval {
|
if writes_since_fsync > 0 {
|
||||||
|
log_file.flush().await?;
|
||||||
|
log_file.sync_data().await?;
|
||||||
|
writes_since_fsync = 0;
|
||||||
|
last_fsync = std::time::Instant::now();
|
||||||
|
}
|
||||||
|
|
||||||
logger::log_info(&format!("Creating checkpoint at {} companies...", count)).await;
|
logger::log_info(&format!("Creating checkpoint at {} companies...", count)).await;
|
||||||
|
|
||||||
let checkpoint_tmp = companies_path.with_extension("jsonl.tmp");
|
let checkpoint_tmp = companies_path.with_extension("jsonl.tmp");
|
||||||
let mut checkpoint_file = tokio::fs::File::create(&checkpoint_tmp).await?;
|
let mut checkpoint_file = tokio::fs::File::create(&checkpoint_tmp).await?;
|
||||||
|
|
||||||
// Write all current state to temporary checkpoint file
|
|
||||||
for company in existing_companies.values() {
|
for company in existing_companies.values() {
|
||||||
let line = serde_json::to_string(company)?;
|
let line = serde_json::to_string(company)?;
|
||||||
checkpoint_file.write_all(line.as_bytes()).await?;
|
checkpoint_file.write_all(line.as_bytes()).await?;
|
||||||
@@ -339,12 +280,8 @@ async fn build_companies_jsonl_streaming(
|
|||||||
checkpoint_file.sync_all().await?;
|
checkpoint_file.sync_all().await?;
|
||||||
drop(checkpoint_file);
|
drop(checkpoint_file);
|
||||||
|
|
||||||
// Atomic rename - this is the commit point
|
|
||||||
// After this succeeds, the checkpoint is visible
|
|
||||||
tokio::fs::rename(&checkpoint_tmp, &companies_path).await?;
|
tokio::fs::rename(&checkpoint_tmp, &companies_path).await?;
|
||||||
|
|
||||||
// Clear log after successful checkpoint
|
|
||||||
// Any entries before this point are now captured in the checkpoint
|
|
||||||
drop(log_file);
|
drop(log_file);
|
||||||
tokio::fs::remove_file(&log_path).await.ok();
|
tokio::fs::remove_file(&log_path).await.ok();
|
||||||
log_file = OpenOptions::new()
|
log_file = OpenOptions::new()
|
||||||
@@ -358,14 +295,39 @@ async fn build_companies_jsonl_streaming(
|
|||||||
}
|
}
|
||||||
|
|
||||||
if count % 10 == 0 {
|
if count % 10 == 0 {
|
||||||
logger::log_info(&format!("Progress: {} companies ({} new, {} updated)", count, new_count, updated_count)).await;
|
logger::log_info(&format!(
|
||||||
tokio::task::yield_now().await;
|
"Progress: {} companies ({} new, {} updated)",
|
||||||
|
count, new_count, updated_count
|
||||||
|
)).await;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Ok(None) => {
|
||||||
|
// Company had no ISINs or was skipped
|
||||||
|
logger::log_info(&format!("Skipped company: {} (no ISINs)", name)).await;
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
logger::log_warn(&format!("Error processing company {}: {}", name, e)).await;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// === FINAL CHECKPOINT: Write complete final state ===
|
// Time-based fsync
|
||||||
// This ensures we don't need to replay the log on next startup
|
if writes_since_fsync > 0 && last_fsync.elapsed().as_secs() >= FSYNC_INTERVAL_SECS {
|
||||||
|
log_file.flush().await?;
|
||||||
|
log_file.sync_data().await?;
|
||||||
|
writes_since_fsync = 0;
|
||||||
|
last_fsync = std::time::Instant::now();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// === FSYNC PENDING WRITES ===
|
||||||
|
if writes_since_fsync > 0 {
|
||||||
|
logger::log_info(&format!("Fsyncing {} pending writes...", writes_since_fsync)).await;
|
||||||
|
log_file.flush().await?;
|
||||||
|
log_file.sync_data().await?;
|
||||||
|
logger::log_info("✓ Pending writes saved").await;
|
||||||
|
}
|
||||||
|
|
||||||
|
// === FINAL CHECKPOINT ===
|
||||||
if !shutdown_flag.load(Ordering::SeqCst) && updates_since_checkpoint > 0 {
|
if !shutdown_flag.load(Ordering::SeqCst) && updates_since_checkpoint > 0 {
|
||||||
logger::log_info("Creating final checkpoint...").await;
|
logger::log_info("Creating final checkpoint...").await;
|
||||||
|
|
||||||
@@ -382,21 +344,172 @@ async fn build_companies_jsonl_streaming(
|
|||||||
checkpoint_file.sync_all().await?;
|
checkpoint_file.sync_all().await?;
|
||||||
drop(checkpoint_file);
|
drop(checkpoint_file);
|
||||||
|
|
||||||
// Atomic rename makes final checkpoint visible
|
|
||||||
tokio::fs::rename(&checkpoint_tmp, &companies_path).await?;
|
tokio::fs::rename(&checkpoint_tmp, &companies_path).await?;
|
||||||
|
|
||||||
// Clean up log
|
|
||||||
drop(log_file);
|
drop(log_file);
|
||||||
tokio::fs::remove_file(&log_path).await.ok();
|
tokio::fs::remove_file(&log_path).await.ok();
|
||||||
|
|
||||||
logger::log_info("✓ Final checkpoint created").await;
|
logger::log_info("✓ Final checkpoint created").await;
|
||||||
}
|
}
|
||||||
|
|
||||||
logger::log_info(&format!("Completed: {} total companies ({} new, {} updated)", count, new_count, updated_count)).await;
|
logger::log_info(&format!(
|
||||||
|
"Completed: {} total companies ({} new, {} updated)",
|
||||||
|
count, new_count, updated_count
|
||||||
|
)).await;
|
||||||
|
|
||||||
Ok(count)
|
Ok(count)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// UPDATED: Process single company serially with validation
|
||||||
|
async fn process_single_company_serial(
|
||||||
|
name: String,
|
||||||
|
company_info: CompanyInfo,
|
||||||
|
existing_entry: Option<CompanyCrossPlatformInfo>,
|
||||||
|
pool: &Arc<ChromeDriverPool>,
|
||||||
|
shutdown_flag: &Arc<AtomicBool>,
|
||||||
|
) -> anyhow::Result<Option<CompanyCrossPlatformInfo>> {
|
||||||
|
// Check shutdown at start
|
||||||
|
if shutdown_flag.load(Ordering::SeqCst) {
|
||||||
|
return Ok(None);
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut isin_tickers_map: HashMap<String, Vec<String>> =
|
||||||
|
existing_entry
|
||||||
|
.as_ref()
|
||||||
|
.map(|e| e.isin_tickers_map.clone())
|
||||||
|
.unwrap_or_default();
|
||||||
|
|
||||||
|
let mut sector = existing_entry.as_ref().and_then(|e| e.sector.clone());
|
||||||
|
let mut exchange = existing_entry.as_ref().and_then(|e| e.exchange.clone());
|
||||||
|
|
||||||
|
// Collect unique ISIN-ticker pairs
|
||||||
|
let mut unique_isin_ticker_pairs: HashMap<String, Vec<String>> = HashMap::new();
|
||||||
|
|
||||||
|
for figi_infos in company_info.securities.values() {
|
||||||
|
for figi_info in figi_infos {
|
||||||
|
if !figi_info.isin.is_empty() {
|
||||||
|
let tickers = unique_isin_ticker_pairs
|
||||||
|
.entry(figi_info.isin.clone())
|
||||||
|
.or_insert_with(Vec::new);
|
||||||
|
|
||||||
|
if !figi_info.ticker.is_empty() && !tickers.contains(&figi_info.ticker) {
|
||||||
|
tickers.push(figi_info.ticker.clone());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Process each ISIN with validation
|
||||||
|
for (isin, figi_tickers) in unique_isin_ticker_pairs {
|
||||||
|
// Check shutdown before each ISIN
|
||||||
|
if shutdown_flag.load(Ordering::SeqCst) {
|
||||||
|
return Ok(None);
|
||||||
|
}
|
||||||
|
|
||||||
|
let tickers = isin_tickers_map
|
||||||
|
.entry(isin.clone())
|
||||||
|
.or_insert_with(Vec::new);
|
||||||
|
|
||||||
|
for figi_ticker in figi_tickers {
|
||||||
|
if !tickers.contains(&figi_ticker) {
|
||||||
|
tickers.push(figi_ticker);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let has_yahoo_ticker = tickers.iter().any(|t| t.starts_with("YAHOO:"));
|
||||||
|
|
||||||
|
if !has_yahoo_ticker {
|
||||||
|
logger::log_info(&format!("Fetching Yahoo details for {} (ISIN: {})", name, isin)).await;
|
||||||
|
|
||||||
|
// Use validated scraping with retry
|
||||||
|
match scrape_with_retry_serial(pool, &isin, 3, shutdown_flag).await {
|
||||||
|
Ok(Some(details)) => {
|
||||||
|
logger::log_info(&format!(
|
||||||
|
"✓ Found Yahoo ticker {} for ISIN {} (company: {})",
|
||||||
|
details.ticker, isin, name
|
||||||
|
)).await;
|
||||||
|
|
||||||
|
tickers.push(format!("YAHOO:{}", details.ticker));
|
||||||
|
|
||||||
|
if sector.is_none() && details.sector.is_some() {
|
||||||
|
sector = details.sector.clone();
|
||||||
|
}
|
||||||
|
|
||||||
|
if exchange.is_none() && details.exchange.is_some() {
|
||||||
|
exchange = details.exchange.clone();
|
||||||
|
}
|
||||||
|
},
|
||||||
|
Ok(None) => {
|
||||||
|
logger::log_warn(&format!("◯ No search results for ISIN {} (company: {})", isin, name)).await;
|
||||||
|
tickers.push("YAHOO:NO_RESULTS".to_string());
|
||||||
|
},
|
||||||
|
Err(e) => {
|
||||||
|
if shutdown_flag.load(Ordering::SeqCst) {
|
||||||
|
return Ok(None);
|
||||||
|
}
|
||||||
|
logger::log_warn(&format!(
|
||||||
|
"✗ Yahoo lookup error for ISIN {} (company: {}): {}",
|
||||||
|
isin, name, e
|
||||||
|
)).await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Final shutdown check
|
||||||
|
if shutdown_flag.load(Ordering::SeqCst) {
|
||||||
|
return Ok(None);
|
||||||
|
}
|
||||||
|
|
||||||
|
if !isin_tickers_map.is_empty() {
|
||||||
|
Ok(Some(CompanyCrossPlatformInfo {
|
||||||
|
name,
|
||||||
|
isin_tickers_map,
|
||||||
|
sector,
|
||||||
|
exchange,
|
||||||
|
}))
|
||||||
|
} else {
|
||||||
|
Ok(None)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// UPDATED: Scrape with retry for serial processing
|
||||||
|
async fn scrape_with_retry_serial(
|
||||||
|
pool: &Arc<ChromeDriverPool>,
|
||||||
|
isin: &str,
|
||||||
|
max_retries: u32,
|
||||||
|
shutdown_flag: &Arc<AtomicBool>,
|
||||||
|
) -> anyhow::Result<Option<YahooCompanyDetails>> {
|
||||||
|
let mut retries = 0;
|
||||||
|
|
||||||
|
loop {
|
||||||
|
if shutdown_flag.load(Ordering::SeqCst) {
|
||||||
|
return Err(anyhow::anyhow!("Aborted due to shutdown"));
|
||||||
|
}
|
||||||
|
|
||||||
|
match scrape_company_details_by_isin(pool, isin, shutdown_flag).await {
|
||||||
|
Ok(result) => return Ok(result),
|
||||||
|
Err(e) => {
|
||||||
|
if retries >= max_retries {
|
||||||
|
return Err(e);
|
||||||
|
}
|
||||||
|
|
||||||
|
let backoff_ms = 1000 * 2u64.pow(retries);
|
||||||
|
let jitter_ms = random_range(0, 500);
|
||||||
|
let total_delay = backoff_ms + jitter_ms;
|
||||||
|
|
||||||
|
logger::log_warn(&format!(
|
||||||
|
"Retry {}/{} for ISIN {} after {}ms: {}",
|
||||||
|
retries + 1, max_retries, isin, total_delay, e
|
||||||
|
)).await;
|
||||||
|
|
||||||
|
tokio::time::sleep(tokio::time::Duration::from_millis(total_delay)).await;
|
||||||
|
retries += 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
async fn find_most_recent_figi_date_dir(paths: &DataPaths) -> anyhow::Result<Option<std::path::PathBuf>> {
|
async fn find_most_recent_figi_date_dir(paths: &DataPaths) -> anyhow::Result<Option<std::path::PathBuf>> {
|
||||||
let map_cache_dir = paths.cache_gleif_openfigi_map_dir();
|
let map_cache_dir = paths.cache_gleif_openfigi_map_dir();
|
||||||
|
|
||||||
|
|||||||
578
src/corporate/update_parallel.rs
Normal file
578
src/corporate/update_parallel.rs
Normal file
@@ -0,0 +1,578 @@
|
|||||||
|
// src/corporate/update_parallel.rs - UPDATED WITH DATA INTEGRITY FIXES
|
||||||
|
// PARALLELIZED VERSION with atomic commits and validation
|
||||||
|
//
|
||||||
|
// Key improvements over original:
|
||||||
|
// - Page validation to prevent stale content extraction
|
||||||
|
// - Shutdown-aware task processing
|
||||||
|
// - Better error recovery with browser state cleanup
|
||||||
|
// - All original fsync and checkpoint logic preserved
|
||||||
|
|
||||||
|
use super::{types::*, yahoo::*, helpers::*};
|
||||||
|
use crate::util::directories::DataPaths;
|
||||||
|
use crate::util::logger;
|
||||||
|
use crate::scraper::webdriver::ChromeDriverPool;
|
||||||
|
|
||||||
|
use rand::Rng;
|
||||||
|
use tokio::sync::mpsc;
|
||||||
|
use tokio::io::AsyncWriteExt;
|
||||||
|
use tokio::fs::OpenOptions;
|
||||||
|
use tokio::time::sleep;
|
||||||
|
use std::collections::HashMap;
|
||||||
|
use std::sync::Arc;
|
||||||
|
use std::sync::atomic::{AtomicBool, Ordering};
|
||||||
|
use std::time::Duration;
|
||||||
|
use futures::stream::{FuturesUnordered, StreamExt};
|
||||||
|
use anyhow::{anyhow, Context, Result};
|
||||||
|
|
||||||
|
/// Represents a write command to be serialized through the log writer
|
||||||
|
enum LogCommand {
|
||||||
|
Write(CompanyCrossPlatformInfo),
|
||||||
|
Checkpoint,
|
||||||
|
Shutdown,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Result from processing a single company
|
||||||
|
struct CompanyProcessResult {
|
||||||
|
company: CompanyCrossPlatformInfo,
|
||||||
|
is_update: bool,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// UPDATED: Abort-safe incremental JSONL persistence with validation
|
||||||
|
///
|
||||||
|
/// New safety features:
|
||||||
|
/// - Page validation before extraction
|
||||||
|
/// - Shutdown checks at all critical points
|
||||||
|
/// - Browser state cleanup on errors
|
||||||
|
/// - All writes still atomic with fsync
|
||||||
|
pub async fn build_companies_jsonl_streaming_parallel(
|
||||||
|
paths: &DataPaths,
|
||||||
|
pool: &Arc<ChromeDriverPool>,
|
||||||
|
shutdown_flag: &Arc<AtomicBool>,
|
||||||
|
) -> anyhow::Result<usize> {
|
||||||
|
// Configuration constants
|
||||||
|
const CHECKPOINT_INTERVAL: usize = 50;
|
||||||
|
const FSYNC_BATCH_SIZE: usize = 10;
|
||||||
|
const FSYNC_INTERVAL_SECS: u64 = 10;
|
||||||
|
const CONCURRENCY_LIMIT: usize = 100;
|
||||||
|
|
||||||
|
let path = DataPaths::new(".")?;
|
||||||
|
let corporate_path = path.data_dir().join("corporate").join("by_name");
|
||||||
|
let securities_path = corporate_path.join("common_stocks.json");
|
||||||
|
|
||||||
|
if !securities_path.exists() {
|
||||||
|
logger::log_warn("No common_stocks.json found").await;
|
||||||
|
return Ok(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
let content = tokio::fs::read_to_string(securities_path).await?;
|
||||||
|
let securities: HashMap<String, CompanyInfo> = serde_json::from_str(&content)?;
|
||||||
|
|
||||||
|
let companies_path = paths.data_dir().join("companies.jsonl");
|
||||||
|
let log_path = paths.data_dir().join("companies_updates.log");
|
||||||
|
|
||||||
|
if let Some(parent) = companies_path.parent() {
|
||||||
|
tokio::fs::create_dir_all(parent).await?;
|
||||||
|
}
|
||||||
|
|
||||||
|
// === RECOVERY PHASE: Load checkpoint + replay log ===
|
||||||
|
let mut existing_companies: HashMap<String, CompanyCrossPlatformInfo> = HashMap::new();
|
||||||
|
let mut processed_names: std::collections::HashSet<String> = std::collections::HashSet::new();
|
||||||
|
|
||||||
|
if companies_path.exists() {
|
||||||
|
logger::log_info("Loading checkpoint from companies.jsonl...").await;
|
||||||
|
let existing_content = tokio::fs::read_to_string(&companies_path).await?;
|
||||||
|
|
||||||
|
for line in existing_content.lines() {
|
||||||
|
if line.trim().is_empty() {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
match serde_json::from_str::<CompanyCrossPlatformInfo>(line) {
|
||||||
|
Ok(company) => {
|
||||||
|
processed_names.insert(company.name.clone());
|
||||||
|
existing_companies.insert(company.name.clone(), company);
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
logger::log_warn(&format!("Skipping invalid checkpoint line: {}", e)).await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
logger::log_info(&format!("Loaded checkpoint with {} companies", existing_companies.len())).await;
|
||||||
|
}
|
||||||
|
|
||||||
|
if log_path.exists() {
|
||||||
|
logger::log_info("Replaying update log...").await;
|
||||||
|
let log_content = tokio::fs::read_to_string(&log_path).await?;
|
||||||
|
let mut replayed = 0;
|
||||||
|
|
||||||
|
for line in log_content.lines() {
|
||||||
|
if line.trim().is_empty() {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
match serde_json::from_str::<CompanyCrossPlatformInfo>(line) {
|
||||||
|
Ok(company) => {
|
||||||
|
processed_names.insert(company.name.clone());
|
||||||
|
existing_companies.insert(company.name.clone(), company);
|
||||||
|
replayed += 1;
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
logger::log_warn(&format!("Skipping invalid log line: {}", e)).await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if replayed > 0 {
|
||||||
|
logger::log_info(&format!("Replayed {} updates from log", replayed)).await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// === SETUP LOG WRITER TASK ===
|
||||||
|
let (write_tx, mut write_rx) = mpsc::channel::<LogCommand>(1000);
|
||||||
|
|
||||||
|
let log_file_init = OpenOptions::new()
|
||||||
|
.create(true)
|
||||||
|
.append(true)
|
||||||
|
.open(&log_path)
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
let companies_path_clone = companies_path.clone();
|
||||||
|
let log_path_clone = log_path.clone();
|
||||||
|
let existing_companies_writer = Arc::new(tokio::sync::Mutex::new(existing_companies.clone()));
|
||||||
|
|
||||||
|
let write_tx_for_writer = write_tx.clone();
|
||||||
|
|
||||||
|
let writer_task = tokio::spawn(async move {
|
||||||
|
let mut log_file = log_file_init;
|
||||||
|
let mut writes_since_fsync = 0;
|
||||||
|
let mut last_fsync = std::time::Instant::now();
|
||||||
|
let mut updates_since_checkpoint = 0;
|
||||||
|
let mut count = 0;
|
||||||
|
let mut new_count = 0;
|
||||||
|
let mut updated_count = 0;
|
||||||
|
|
||||||
|
while let Some(cmd) = write_rx.recv().await {
|
||||||
|
match cmd {
|
||||||
|
LogCommand::Write(company) => {
|
||||||
|
// Write to log
|
||||||
|
let line = serde_json::to_string(&company).unwrap();
|
||||||
|
if let Err(e) = log_file.write_all(line.as_bytes()).await {
|
||||||
|
logger::log_error(&format!("Failed to write to log: {}", e)).await;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if let Err(e) = log_file.write_all(b"\n").await {
|
||||||
|
logger::log_error(&format!("Failed to write newline: {}", e)).await;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
writes_since_fsync += 1;
|
||||||
|
updates_since_checkpoint += 1;
|
||||||
|
count += 1;
|
||||||
|
|
||||||
|
// Update in-memory state
|
||||||
|
let mut existing_companies = existing_companies_writer.lock().await;
|
||||||
|
let is_update = existing_companies.contains_key(&company.name);
|
||||||
|
existing_companies.insert(company.name.clone(), company);
|
||||||
|
drop(existing_companies);
|
||||||
|
|
||||||
|
if is_update {
|
||||||
|
updated_count += 1;
|
||||||
|
} else {
|
||||||
|
new_count += 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Batched + time-based fsync
|
||||||
|
let should_fsync = writes_since_fsync >= FSYNC_BATCH_SIZE
|
||||||
|
|| last_fsync.elapsed().as_secs() >= FSYNC_INTERVAL_SECS;
|
||||||
|
|
||||||
|
if should_fsync {
|
||||||
|
if let Err(e) = log_file.flush().await {
|
||||||
|
logger::log_error(&format!("Failed to flush: {}", e)).await;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if let Err(e) = log_file.sync_data().await {
|
||||||
|
logger::log_error(&format!("Failed to fsync: {}", e)).await;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
writes_since_fsync = 0;
|
||||||
|
last_fsync = std::time::Instant::now();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
LogCommand::Checkpoint => {
|
||||||
|
if let Err(e) = log_file.flush().await {
|
||||||
|
logger::log_error(&format!("Failed to flush before checkpoint: {}", e)).await;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if let Err(e) = log_file.sync_data().await {
|
||||||
|
logger::log_error(&format!("Failed to fsync before checkpoint: {}", e)).await;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
let existing_companies = existing_companies_writer.lock().await;
|
||||||
|
let companies_vec: Vec<_> = existing_companies.values().cloned().collect();
|
||||||
|
drop(existing_companies);
|
||||||
|
|
||||||
|
let temp_path = companies_path_clone.with_extension("tmp");
|
||||||
|
match tokio::fs::File::create(&temp_path).await {
|
||||||
|
Ok(mut temp_file) => {
|
||||||
|
let mut checkpoint_ok = true;
|
||||||
|
for company in &companies_vec {
|
||||||
|
if let Ok(line) = serde_json::to_string(company) {
|
||||||
|
if temp_file.write_all(line.as_bytes()).await.is_err() ||
|
||||||
|
temp_file.write_all(b"\n").await.is_err() {
|
||||||
|
checkpoint_ok = false;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if checkpoint_ok {
|
||||||
|
if temp_file.flush().await.is_ok() &&
|
||||||
|
temp_file.sync_data().await.is_ok() {
|
||||||
|
drop(temp_file);
|
||||||
|
|
||||||
|
if tokio::fs::rename(&temp_path, &companies_path_clone).await.is_ok() {
|
||||||
|
if tokio::fs::remove_file(&log_path_clone).await.is_ok() {
|
||||||
|
logger::log_info(&format!(
|
||||||
|
"✓ Checkpoint created ({} companies), log cleared",
|
||||||
|
companies_vec.len()
|
||||||
|
)).await;
|
||||||
|
|
||||||
|
if let Ok(new_log) = OpenOptions::new()
|
||||||
|
.create(true)
|
||||||
|
.append(true)
|
||||||
|
.open(&log_path_clone)
|
||||||
|
.await {
|
||||||
|
log_file = new_log;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
logger::log_error(&format!("Failed to create checkpoint temp file: {}", e)).await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
updates_since_checkpoint = 0;
|
||||||
|
}
|
||||||
|
LogCommand::Shutdown => {
|
||||||
|
logger::log_info("Writer shutting down...").await;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Periodic checkpoint trigger
|
||||||
|
if updates_since_checkpoint >= CHECKPOINT_INTERVAL {
|
||||||
|
let _ = write_tx.send(LogCommand::Checkpoint).await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Final fsync
|
||||||
|
let _ = log_file.flush().await;
|
||||||
|
let _ = log_file.sync_data().await;
|
||||||
|
|
||||||
|
logger::log_info(&format!(
|
||||||
|
"Writer finished: {} total ({} new, {} updated)",
|
||||||
|
count, new_count, updated_count
|
||||||
|
)).await;
|
||||||
|
|
||||||
|
(count, new_count, updated_count)
|
||||||
|
});
|
||||||
|
|
||||||
|
// === PARALLEL PROCESSING PHASE ===
|
||||||
|
logger::log_info(&format!(
|
||||||
|
"Starting parallel processing of {} companies (concurrency limit: {})",
|
||||||
|
securities.len(),
|
||||||
|
CONCURRENCY_LIMIT
|
||||||
|
)).await;
|
||||||
|
|
||||||
|
let mut processing_tasks = FuturesUnordered::new();
|
||||||
|
let mut processed = 0;
|
||||||
|
let total = securities.len();
|
||||||
|
|
||||||
|
for (name, company_info) in securities.into_iter() {
|
||||||
|
// Check shutdown before creating new tasks
|
||||||
|
if shutdown_flag.load(Ordering::SeqCst) {
|
||||||
|
logger::log_warn("Shutdown detected, stopping task creation").await;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Wait if we hit concurrency limit
|
||||||
|
while processing_tasks.len() >= CONCURRENCY_LIMIT {
|
||||||
|
if let Some(result) = processing_tasks.next().await {
|
||||||
|
match result {
|
||||||
|
Ok(Ok(Some(company_result))) => {
|
||||||
|
let company_result: CompanyProcessResult = company_result;
|
||||||
|
let _ = write_tx_for_writer.send(LogCommand::Write(company_result.company)).await?;
|
||||||
|
processed += 1;
|
||||||
|
}
|
||||||
|
Ok(Ok(None)) => {
|
||||||
|
processed += 1;
|
||||||
|
}
|
||||||
|
Ok(Err(e)) => {
|
||||||
|
logger::log_warn(&format!("Company processing error: {}", e)).await;
|
||||||
|
processed += 1;
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
logger::log_error(&format!("Task panic: {}", e)).await;
|
||||||
|
processed += 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if shutdown_flag.load(Ordering::SeqCst) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if shutdown_flag.load(Ordering::SeqCst) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Spawn new task
|
||||||
|
let pool = pool.clone();
|
||||||
|
let shutdown_flag = shutdown_flag.clone();
|
||||||
|
let existing_entry = existing_companies.get(&name).cloned();
|
||||||
|
|
||||||
|
let task = tokio::spawn(async move {
|
||||||
|
process_single_company_validated(
|
||||||
|
name,
|
||||||
|
company_info,
|
||||||
|
existing_entry,
|
||||||
|
&pool,
|
||||||
|
&shutdown_flag
|
||||||
|
).await
|
||||||
|
});
|
||||||
|
|
||||||
|
processing_tasks.push(task);
|
||||||
|
|
||||||
|
if processed % 10 == 0 && processed > 0 {
|
||||||
|
logger::log_info(&format!("Progress: {}/{} companies processed", processed, total)).await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Wait for remaining tasks
|
||||||
|
logger::log_info(&format!(
|
||||||
|
"Waiting for {} remaining tasks to complete...",
|
||||||
|
processing_tasks.len()
|
||||||
|
)).await;
|
||||||
|
|
||||||
|
while let Some(result) = processing_tasks.next().await {
|
||||||
|
if shutdown_flag.load(Ordering::SeqCst) {
|
||||||
|
logger::log_warn("Shutdown detected during final task wait").await;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
match result {
|
||||||
|
Ok(Ok(Some(company_result))) => {
|
||||||
|
if write_tx_for_writer.send(LogCommand::Write(company_result.company)).await.is_err() {
|
||||||
|
logger::log_error("Writer task died").await;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
processed += 1;
|
||||||
|
}
|
||||||
|
Ok(Ok(None)) => {
|
||||||
|
processed += 1;
|
||||||
|
}
|
||||||
|
Ok(Err(e)) => {
|
||||||
|
logger::log_warn(&format!("Company processing error: {}", e)).await;
|
||||||
|
processed += 1;
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
logger::log_error(&format!("Task panic: {}", e)).await;
|
||||||
|
processed += 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Signal writer to finish
|
||||||
|
let _ = write_tx_for_writer.send(LogCommand::Checkpoint).await;
|
||||||
|
let _ = write_tx_for_writer.send(LogCommand::Shutdown).await;
|
||||||
|
drop(write_tx_for_writer);
|
||||||
|
|
||||||
|
// Wait for writer to finish
|
||||||
|
let (final_count, final_new, final_updated) = writer_task.await
|
||||||
|
.unwrap_or((0, 0, 0));
|
||||||
|
|
||||||
|
logger::log_info(&format!(
|
||||||
|
"Completed: {} total companies ({} new, {} updated)",
|
||||||
|
final_count, final_new, final_updated
|
||||||
|
)).await;
|
||||||
|
|
||||||
|
Ok(final_count)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Scrape with retry, validation, and shutdown awareness
|
||||||
|
async fn scrape_with_retry(
|
||||||
|
pool: &Arc<ChromeDriverPool>,
|
||||||
|
isin: &str,
|
||||||
|
max_retries: u32,
|
||||||
|
shutdown_flag: &Arc<AtomicBool>,
|
||||||
|
) -> Result<Option<YahooCompanyDetails>> {
|
||||||
|
let mut retries = 0;
|
||||||
|
|
||||||
|
loop {
|
||||||
|
// Check shutdown before each attempt
|
||||||
|
if shutdown_flag.load(Ordering::SeqCst) {
|
||||||
|
return Err(anyhow!("Aborted due to shutdown"));
|
||||||
|
}
|
||||||
|
|
||||||
|
match scrape_company_details_by_isin(pool, isin, shutdown_flag).await {
|
||||||
|
Ok(result) => return Ok(result),
|
||||||
|
Err(e) => {
|
||||||
|
if retries >= max_retries {
|
||||||
|
logger::log_error(&format!(
|
||||||
|
"All {} retries exhausted for ISIN {}: {}",
|
||||||
|
max_retries, isin, e
|
||||||
|
)).await;
|
||||||
|
return Err(e);
|
||||||
|
}
|
||||||
|
|
||||||
|
let backoff_ms = 1000 * 2u64.pow(retries);
|
||||||
|
let jitter_ms = random_range(0, 500);
|
||||||
|
let total_delay = backoff_ms + jitter_ms;
|
||||||
|
|
||||||
|
logger::log_warn(&format!(
|
||||||
|
"Retry {}/{} for ISIN {} after {}ms: {}",
|
||||||
|
retries + 1, max_retries, isin, total_delay, e
|
||||||
|
)).await;
|
||||||
|
|
||||||
|
sleep(Duration::from_millis(total_delay)).await;
|
||||||
|
retries += 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// UPDATED: Process single company with validation and shutdown checks
|
||||||
|
async fn process_single_company_validated(
|
||||||
|
name: String,
|
||||||
|
company_info: CompanyInfo,
|
||||||
|
existing_entry: Option<CompanyCrossPlatformInfo>,
|
||||||
|
pool: &Arc<ChromeDriverPool>,
|
||||||
|
shutdown_flag: &Arc<AtomicBool>,
|
||||||
|
) -> anyhow::Result<Option<CompanyProcessResult>> {
|
||||||
|
// Check shutdown at start
|
||||||
|
if shutdown_flag.load(Ordering::SeqCst) {
|
||||||
|
logger::log_warn(&format!("Shutdown detected, skipping company: {}", name)).await;
|
||||||
|
return Ok(None);
|
||||||
|
}
|
||||||
|
|
||||||
|
let is_update = existing_entry.is_some();
|
||||||
|
|
||||||
|
let mut isin_tickers_map: HashMap<String, Vec<String>> =
|
||||||
|
existing_entry
|
||||||
|
.as_ref()
|
||||||
|
.map(|e| e.isin_tickers_map.clone())
|
||||||
|
.unwrap_or_default();
|
||||||
|
|
||||||
|
let mut sector = existing_entry.as_ref().and_then(|e| e.sector.clone());
|
||||||
|
let mut exchange = existing_entry.as_ref().and_then(|e| e.exchange.clone());
|
||||||
|
|
||||||
|
// Collect unique ISIN-ticker pairs
|
||||||
|
let mut unique_isin_ticker_pairs: HashMap<String, Vec<String>> = HashMap::new();
|
||||||
|
|
||||||
|
for figi_infos in company_info.securities.values() {
|
||||||
|
for figi_info in figi_infos {
|
||||||
|
if !figi_info.isin.is_empty() {
|
||||||
|
let tickers = unique_isin_ticker_pairs
|
||||||
|
.entry(figi_info.isin.clone())
|
||||||
|
.or_insert_with(Vec::new);
|
||||||
|
|
||||||
|
if !figi_info.ticker.is_empty() && !tickers.contains(&figi_info.ticker) {
|
||||||
|
tickers.push(figi_info.ticker.clone());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Process each ISIN with validation
|
||||||
|
for (isin, figi_tickers) in unique_isin_ticker_pairs {
|
||||||
|
// Check shutdown before each ISIN
|
||||||
|
if shutdown_flag.load(Ordering::SeqCst) {
|
||||||
|
logger::log_warn(&format!(
|
||||||
|
"Shutdown detected while processing company: {}",
|
||||||
|
name
|
||||||
|
)).await;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
let tickers = isin_tickers_map
|
||||||
|
.entry(isin.clone())
|
||||||
|
.or_insert_with(Vec::new);
|
||||||
|
|
||||||
|
for figi_ticker in figi_tickers {
|
||||||
|
if !tickers.contains(&figi_ticker) {
|
||||||
|
tickers.push(figi_ticker);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let has_yahoo_ticker = tickers.iter().any(|t| t.starts_with("YAHOO:"));
|
||||||
|
|
||||||
|
if !has_yahoo_ticker {
|
||||||
|
logger::log_info(&format!("Fetching Yahoo details for {} (ISIN: {})", name, isin)).await;
|
||||||
|
|
||||||
|
match scrape_with_retry(pool, &isin, 3, shutdown_flag).await {
|
||||||
|
Ok(Some(details)) => {
|
||||||
|
logger::log_info(&format!(
|
||||||
|
"✓ Found Yahoo ticker {} for ISIN {} (company: {})",
|
||||||
|
details.ticker, isin, name
|
||||||
|
)).await;
|
||||||
|
|
||||||
|
tickers.push(format!("YAHOO:{}", details.ticker));
|
||||||
|
|
||||||
|
if sector.is_none() && details.sector.is_some() {
|
||||||
|
sector = details.sector.clone();
|
||||||
|
logger::log_info(&format!(" Sector: {}", details.sector.as_ref().unwrap())).await;
|
||||||
|
}
|
||||||
|
|
||||||
|
if exchange.is_none() && details.exchange.is_some() {
|
||||||
|
exchange = details.exchange.clone();
|
||||||
|
logger::log_info(&format!(" Exchange: {}", details.exchange.as_ref().unwrap())).await;
|
||||||
|
}
|
||||||
|
},
|
||||||
|
Ok(None) => {
|
||||||
|
logger::log_warn(&format!("◯ No search results for ISIN {} (company: {})", isin, name)).await;
|
||||||
|
tickers.push("YAHOO:NO_RESULTS".to_string());
|
||||||
|
},
|
||||||
|
Err(e) => {
|
||||||
|
if shutdown_flag.load(Ordering::SeqCst) {
|
||||||
|
logger::log_warn(&format!("Shutdown during scrape for ISIN {}", isin)).await;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
logger::log_warn(&format!(
|
||||||
|
"✗ Yahoo lookup error for ISIN {} (company: {}): {}",
|
||||||
|
isin, name, e
|
||||||
|
)).await;
|
||||||
|
// Continue with next ISIN
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Final shutdown check before returning result
|
||||||
|
if shutdown_flag.load(Ordering::SeqCst) {
|
||||||
|
logger::log_warn(&format!(
|
||||||
|
"Shutdown detected, discarding incomplete result for: {}",
|
||||||
|
name
|
||||||
|
)).await;
|
||||||
|
return Ok(None);
|
||||||
|
}
|
||||||
|
|
||||||
|
if !isin_tickers_map.is_empty() {
|
||||||
|
let company_entry = CompanyCrossPlatformInfo {
|
||||||
|
name: name.clone(),
|
||||||
|
isin_tickers_map,
|
||||||
|
sector,
|
||||||
|
exchange,
|
||||||
|
};
|
||||||
|
|
||||||
|
Ok(Some(CompanyProcessResult {
|
||||||
|
company: company_entry,
|
||||||
|
is_update,
|
||||||
|
}))
|
||||||
|
} else {
|
||||||
|
logger::log_warn(&format!("No ISINs found for company: {}", name)).await;
|
||||||
|
Ok(None)
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -1,17 +1,15 @@
|
|||||||
// src/corporate/yahoo.rs
|
// src/corporate/yahoo.rs - UPDATED WITH DATA INTEGRITY FIXES
|
||||||
use super::{types::*, helpers::*};
|
use super::{types::*, helpers::*, page_validation::*};
|
||||||
use crate::{scraper::webdriver::*, util::{directories::DataPaths}};
|
use crate::{scraper::webdriver::*, util::{directories::DataPaths}};
|
||||||
use event_backtest_engine::logger;
|
use crate::logger;
|
||||||
use fantoccini::{Client, Locator};
|
use fantoccini::{Client, Locator};
|
||||||
|
use rand::Rng;
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
use tokio::{time::{Duration as TokioDuration, sleep}};
|
use tokio::time::{Duration as TokioDuration, sleep, timeout};
|
||||||
use std::{sync::Arc};
|
use std::{sync::Arc, sync::atomic::{AtomicBool, Ordering}};
|
||||||
use anyhow::{anyhow, Result};
|
use anyhow::{anyhow, Result};
|
||||||
|
|
||||||
const YAHOO_COMPANY_EXTRACTION_JS: &str = include_str!("yahoo_company_extraction.js");
|
const YAHOO_COMPANY_EXTRACTION_JS: &str = include_str!("yahoo_company_extraction.js");
|
||||||
/// Mapping existing
|
|
||||||
|
|
||||||
/// getting historical stock price data daily (xxxx - 2025) and hourly (last 30 days)
|
|
||||||
|
|
||||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
pub enum YahooTickerResult {
|
pub enum YahooTickerResult {
|
||||||
@@ -21,6 +19,16 @@ pub enum YahooTickerResult {
|
|||||||
AmbiguousResults,
|
AmbiguousResults,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Deserialize)]
|
||||||
|
pub struct ExtractionMetadata {
|
||||||
|
#[serde(rename = "selectedRowIndex")]
|
||||||
|
pub selected_row_index: usize,
|
||||||
|
#[serde(rename = "validFieldCount")]
|
||||||
|
pub valid_field_count: usize,
|
||||||
|
#[serde(rename = "totalRows")]
|
||||||
|
pub total_rows: usize,
|
||||||
|
}
|
||||||
|
|
||||||
#[derive(Debug, Deserialize)]
|
#[derive(Debug, Deserialize)]
|
||||||
pub struct ExtractionResult {
|
pub struct ExtractionResult {
|
||||||
status: String,
|
status: String,
|
||||||
@@ -29,6 +37,8 @@ pub struct ExtractionResult {
|
|||||||
exchange: Option<String>,
|
exchange: Option<String>,
|
||||||
#[serde(default)]
|
#[serde(default)]
|
||||||
error_message: Option<String>,
|
error_message: Option<String>,
|
||||||
|
#[serde(default)]
|
||||||
|
metadata: Option<ExtractionMetadata>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl YahooTickerResult {
|
impl YahooTickerResult {
|
||||||
@@ -53,47 +63,227 @@ impl YahooTickerResult {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// UPDATED: Scrape company details with full validation and shutdown support
|
||||||
pub async fn scrape_company_details_by_isin(
|
pub async fn scrape_company_details_by_isin(
|
||||||
pool: &Arc<ChromeDriverPool>,
|
pool: &Arc<ChromeDriverPool>,
|
||||||
isin: &str,
|
isin: &str,
|
||||||
|
shutdown_flag: &Arc<AtomicBool>,
|
||||||
) -> anyhow::Result<Option<YahooCompanyDetails>> {
|
) -> anyhow::Result<Option<YahooCompanyDetails>> {
|
||||||
let isin = isin.to_string();
|
// Check shutdown before starting
|
||||||
pool.execute(format!("https://finance.yahoo.com/lookup/?s={}", isin), move |client| {
|
if shutdown_flag.load(Ordering::SeqCst) {
|
||||||
let isin = isin.clone();
|
logger::log_warn(&format!("Shutdown detected, skipping ISIN: {}", isin)).await;
|
||||||
|
return Ok(None);
|
||||||
|
}
|
||||||
|
|
||||||
|
let isin_owned = isin.to_string();
|
||||||
|
let shutdown_clone = Arc::clone(shutdown_flag);
|
||||||
|
let url = format!("https://finance.yahoo.com/lookup/?s={}", isin);
|
||||||
|
|
||||||
|
pool.execute(url.clone(), move |client| {
|
||||||
|
let isin = isin_owned.clone();
|
||||||
|
let shutdown = shutdown_clone.clone();
|
||||||
|
|
||||||
Box::pin(async move {
|
Box::pin(async move {
|
||||||
sleep(TokioDuration::from_millis(1000)).await;
|
// Check shutdown during task execution
|
||||||
|
if shutdown.load(Ordering::SeqCst) {
|
||||||
|
return Err(anyhow!("Task aborted due to shutdown"));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Random delay
|
||||||
|
let delay = rand::rng().random_range(800..1500);
|
||||||
|
sleep(TokioDuration::from_millis(delay)).await;
|
||||||
|
|
||||||
|
// Reject cookies
|
||||||
reject_yahoo_cookies(&client).await?;
|
reject_yahoo_cookies(&client).await?;
|
||||||
sleep(TokioDuration::from_millis(1000)).await;
|
|
||||||
extract_company_details(&client, &isin).await
|
// Check shutdown again
|
||||||
|
if shutdown.load(Ordering::SeqCst) {
|
||||||
|
return Err(anyhow!("Task aborted due to shutdown"));
|
||||||
|
}
|
||||||
|
|
||||||
|
// CRITICAL: Validate navigation succeeded
|
||||||
|
let expected_fragment = format!("lookup/?s={}", isin);
|
||||||
|
match verify_navigation(&client, &expected_fragment, 5).await {
|
||||||
|
Ok(_) => {
|
||||||
|
logger::log_info(&format!("✓ Navigation validated for ISIN: {}", isin)).await;
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
logger::log_error(&format!(
|
||||||
|
"Navigation verification failed for ISIN {}: {}",
|
||||||
|
isin, e
|
||||||
|
)).await;
|
||||||
|
// Clear browser state before returning error
|
||||||
|
clear_browser_state(&client).await.ok();
|
||||||
|
return Err(e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Additional content validation
|
||||||
|
let page_ready: bool = client
|
||||||
|
.execute(
|
||||||
|
r#"
|
||||||
|
const table = document.querySelector('#main-content-wrapper > section > section.container.yf-1omxedn > div.tableContainer.yf-1omxedn > div > table');
|
||||||
|
const noData = document.querySelector('#main-content-wrapper > section > div.noData.yf-1omxedn');
|
||||||
|
return !!(table || noData);
|
||||||
|
"#,
|
||||||
|
vec![],
|
||||||
|
)
|
||||||
|
.await?
|
||||||
|
.as_bool()
|
||||||
|
.unwrap_or(false);
|
||||||
|
|
||||||
|
if !page_ready {
|
||||||
|
logger::log_error(&format!(
|
||||||
|
"Page content not ready for ISIN {} - neither table nor no-data element found",
|
||||||
|
isin
|
||||||
|
)).await;
|
||||||
|
clear_browser_state(&client).await.ok();
|
||||||
|
return Err(anyhow!("Page content not ready"));
|
||||||
|
}
|
||||||
|
|
||||||
|
logger::log_info(&format!("✓ Page content validated for ISIN: {}", isin)).await;
|
||||||
|
|
||||||
|
// Check shutdown before extraction
|
||||||
|
if shutdown.load(Ordering::SeqCst) {
|
||||||
|
return Err(anyhow!("Task aborted due to shutdown"));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Random delay before extraction
|
||||||
|
let delay = rand::rng().random_range(800..1500);
|
||||||
|
sleep(TokioDuration::from_millis(delay)).await;
|
||||||
|
|
||||||
|
// Now safe to extract
|
||||||
|
extract_company_details_validated(&client, &isin).await
|
||||||
})
|
})
|
||||||
}).await
|
}).await
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// UPDATED: Extract with additional URL validation
|
||||||
|
async fn extract_company_details_validated(
|
||||||
|
client: &Client,
|
||||||
|
isin: &str,
|
||||||
|
) -> Result<Option<YahooCompanyDetails>> {
|
||||||
|
// Double-check URL is still correct before extraction
|
||||||
|
let current_url = client.current_url().await?;
|
||||||
|
if !current_url.as_str().contains(isin) {
|
||||||
|
logger::log_error(&format!(
|
||||||
|
"URL mismatch before extraction: expected ISIN '{}' in URL, got '{}'",
|
||||||
|
isin,
|
||||||
|
current_url.as_str()
|
||||||
|
)).await;
|
||||||
|
clear_browser_state(client).await.ok();
|
||||||
|
return Err(anyhow!("URL mismatch - possible stale page"));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Run extraction
|
||||||
|
let result = extract_company_details(client, isin).await?;
|
||||||
|
|
||||||
|
// Validate extraction result
|
||||||
|
if let Some(ref details) = result {
|
||||||
|
logger::log_info(&format!(
|
||||||
|
"✓ Extracted ticker '{}' for ISIN {} (sector: {:?}, exchange: {:?})",
|
||||||
|
details.ticker, isin, details.sector, details.exchange
|
||||||
|
)).await;
|
||||||
|
} else {
|
||||||
|
logger::log_info(&format!(
|
||||||
|
"No ticker found for ISIN {} (legitimately not found)",
|
||||||
|
isin
|
||||||
|
)).await;
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(result)
|
||||||
|
}
|
||||||
|
|
||||||
pub async fn extract_company_details(
|
pub async fn extract_company_details(
|
||||||
client: &Client,
|
client: &Client,
|
||||||
_isin: &str,
|
_isin: &str,
|
||||||
) -> Result<Option<YahooCompanyDetails>> {
|
) -> Result<Option<YahooCompanyDetails>> {
|
||||||
|
// Wait for page to load - look for either the table or the no-data element
|
||||||
|
let wait_result: Result<Result<bool, anyhow::Error>> = timeout(
|
||||||
|
TokioDuration::from_secs(30),
|
||||||
|
async {
|
||||||
|
for _ in 0..60 {
|
||||||
|
let has_content: bool = client
|
||||||
|
.execute(
|
||||||
|
r#"
|
||||||
|
const table = document.querySelector('#main-content-wrapper > section > section.container.yf-1omxedn > div.tableContainer.yf-1omxedn > div > table');
|
||||||
|
const noData = document.querySelector('#main-content-wrapper > section > div.noData.yf-1omxedn');
|
||||||
|
return !!(table || noData);
|
||||||
|
"#,
|
||||||
|
vec![],
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
.map_err(|e| anyhow!("Execute error: {}", e))?
|
||||||
|
.as_bool()
|
||||||
|
.unwrap_or(false);
|
||||||
|
|
||||||
|
if has_content {
|
||||||
|
return Ok(true);
|
||||||
|
}
|
||||||
|
|
||||||
|
sleep(TokioDuration::from_millis(500)).await;
|
||||||
|
}
|
||||||
|
Ok(false)
|
||||||
|
},
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
.map_err(|_| anyhow!("Timeout waiting for Yahoo Finance page to load"));
|
||||||
|
|
||||||
|
match wait_result {
|
||||||
|
Err(_) => {
|
||||||
|
return Err(anyhow!("Timeout waiting for Yahoo Finance page to load"));
|
||||||
|
},
|
||||||
|
Ok(Err(e)) => {
|
||||||
|
return Err(anyhow!("Error checking page content: {}", e));
|
||||||
|
},
|
||||||
|
Ok(Ok(false)) => {
|
||||||
|
logger::log_warn("Page content not found after waiting, attempting extraction anyway").await;
|
||||||
|
},
|
||||||
|
Ok(Ok(true)) => {
|
||||||
|
logger::log_info("Page content detected, proceeding with extraction").await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Execute the JavaScript extraction script
|
// Execute the JavaScript extraction script
|
||||||
let result = client.execute(YAHOO_COMPANY_EXTRACTION_JS, vec![]).await?;
|
let result = client.execute(YAHOO_COMPANY_EXTRACTION_JS, vec![]).await?;
|
||||||
|
|
||||||
|
// Log the raw result for debugging
|
||||||
|
logger::log_info(&format!("JavaScript extraction raw result: {:?}", result)).await;
|
||||||
|
|
||||||
|
// Check if result is null
|
||||||
|
if result.is_null() {
|
||||||
|
return Err(anyhow!("JavaScript returned null - page may not be fully loaded or script failed"));
|
||||||
|
}
|
||||||
|
|
||||||
// Parse the JSON result
|
// Parse the JSON result
|
||||||
let extraction: ExtractionResult = serde_json::from_value(result)
|
let extraction: ExtractionResult = serde_json::from_value(result.clone())
|
||||||
.map_err(|e| anyhow!("Failed to parse extraction result: {}", e))?;
|
.map_err(|e| {
|
||||||
|
let result_str = serde_json::to_string_pretty(&result).unwrap_or_else(|_| format!("{:?}", result));
|
||||||
|
anyhow!("Failed to parse extraction result: {}. Raw result: {}", e, result_str)
|
||||||
|
})?;
|
||||||
|
|
||||||
match extraction.status.as_str() {
|
match extraction.status.as_str() {
|
||||||
"found" => {
|
"found" => {
|
||||||
if let Some(ticker) = extraction.ticker {
|
if let Some(ticker) = extraction.ticker {
|
||||||
|
if let Some(ref metadata) = extraction.metadata {
|
||||||
|
logger::log_info(&format!(
|
||||||
|
"Selected row {} with {} valid fields out of {} total rows",
|
||||||
|
metadata.selected_row_index,
|
||||||
|
metadata.valid_field_count,
|
||||||
|
metadata.total_rows
|
||||||
|
)).await;
|
||||||
|
}
|
||||||
|
|
||||||
Ok(Some(YahooCompanyDetails {
|
Ok(Some(YahooCompanyDetails {
|
||||||
ticker,
|
ticker,
|
||||||
sector: extraction.sector,
|
sector: extraction.sector,
|
||||||
exchange: extraction.exchange,
|
exchange: extraction.exchange,
|
||||||
}))
|
}))
|
||||||
} else {
|
} else {
|
||||||
Ok(None)
|
Err(anyhow!("Status 'found' but no ticker present"))
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"no_results" => Ok(None),
|
"no_results" => Ok(None),
|
||||||
"not_found" => Ok(None),
|
|
||||||
"error" => {
|
"error" => {
|
||||||
let error_msg = extraction.error_message.unwrap_or_else(|| "Unknown error".to_string());
|
let error_msg = extraction.error_message.unwrap_or_else(|| "Unknown error".to_string());
|
||||||
Err(anyhow!("JavaScript extraction error: {}", error_msg))
|
Err(anyhow!("JavaScript extraction error: {}", error_msg))
|
||||||
@@ -116,19 +306,6 @@ pub async fn get_all_tickers_from_companies_jsonl(paths: &DataPaths) -> anyhow::
|
|||||||
Ok(tickers)
|
Ok(tickers)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Fetches earnings events for a ticker using a dedicated ScrapeTask.
|
|
||||||
///
|
|
||||||
/// This function creates and executes a ScrapeTask to navigate to the Yahoo Finance earnings calendar,
|
|
||||||
/// reject cookies, and extract the events.
|
|
||||||
///
|
|
||||||
/// # Arguments
|
|
||||||
/// * `ticker` - The stock ticker symbol.
|
|
||||||
///
|
|
||||||
/// # Returns
|
|
||||||
/// A vector of CompanyEvent structs on success.
|
|
||||||
///
|
|
||||||
/// # Errors
|
|
||||||
/// Returns an error if the task execution fails, e.g., chromedriver spawn or navigation issues.
|
|
||||||
pub async fn fetch_earnings_with_pool(
|
pub async fn fetch_earnings_with_pool(
|
||||||
pool: &Arc<ChromeDriverPool>,
|
pool: &Arc<ChromeDriverPool>,
|
||||||
ticker: &str,
|
ticker: &str,
|
||||||
@@ -147,40 +324,6 @@ pub async fn fetch_earnings_with_pool(
|
|||||||
}).await
|
}).await
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Extracts earnings events from the currently loaded Yahoo Finance earnings calendar page.
|
|
||||||
///
|
|
||||||
/// This function assumes the client is already navigated to the correct URL (e.g.,
|
|
||||||
/// https://finance.yahoo.com/calendar/earnings?symbol={ticker}) and cookies are handled.
|
|
||||||
///
|
|
||||||
/// It waits for the earnings table, extracts rows, parses cells into CompanyEvent structs,
|
|
||||||
/// and handles date parsing, float parsing, and optional fields.
|
|
||||||
///
|
|
||||||
/// # Arguments
|
|
||||||
/// * `client` - The fantoccini Client with the page loaded.
|
|
||||||
/// * `ticker` - The stock ticker symbol for the events.
|
|
||||||
///
|
|
||||||
/// # Returns
|
|
||||||
/// A vector of CompanyEvent on success.
|
|
||||||
///
|
|
||||||
/// # Errors
|
|
||||||
/// Returns an error if:
|
|
||||||
/// - Table or elements not found.
|
|
||||||
/// - Date or float parsing fails.
|
|
||||||
/// - WebDriver operations fail.
|
|
||||||
///
|
|
||||||
/// # Examples
|
|
||||||
///
|
|
||||||
/// ```no_run
|
|
||||||
/// use fantoccini::Client;
|
|
||||||
/// use crate::corporate::scraper::extract_earnings;
|
|
||||||
///
|
|
||||||
/// #[tokio::main]
|
|
||||||
/// async fn main() -> Result<()> {
|
|
||||||
/// // Assume client is set up and navigated
|
|
||||||
/// let events = extract_earnings(&client, "AAPL").await?;
|
|
||||||
/// Ok(())
|
|
||||||
/// }
|
|
||||||
/// ```
|
|
||||||
pub async fn extract_earnings_events(client: &Client, ticker: &str) -> Result<Vec<CompanyEvent>> {
|
pub async fn extract_earnings_events(client: &Client, ticker: &str) -> Result<Vec<CompanyEvent>> {
|
||||||
// Wait for the table to load
|
// Wait for the table to load
|
||||||
let table = client
|
let table = client
|
||||||
|
|||||||
@@ -1,61 +1,214 @@
|
|||||||
// yahoo_company_extraction.js
|
// yahoo_company_extraction.js
|
||||||
// JavaScript extraction script for Yahoo Finance company details
|
// JavaScript extraction script for Yahoo Finance company details
|
||||||
// Used to extract ticker, sector, and exchange from Yahoo Finance search results
|
// Used to extract ticker, sector, and exchange from Yahoo Finance search results
|
||||||
|
// Only ticker is mandatory - sector and exchange are optional fields
|
||||||
|
|
||||||
(function() {
|
// Example selectors:
|
||||||
|
// with results:
|
||||||
|
// document.querySelector("#main-content-wrapper > section > section.container.yf-1omxedn > div.tableContainer.yf-1omxedn > div > table")
|
||||||
|
// document.querySelector("#\\30 > td:nth-child(1) > span > div > a")
|
||||||
|
// document.querySelector("#\\30 > td:nth-child(2) > span > div")
|
||||||
|
// document.querySelector("#\\30 > td:nth-child(3) > span > div")
|
||||||
|
// document.querySelector("#\\30 > td:nth-child(4) > span > div > a")
|
||||||
|
// document.querySelector("#\\30 > td:nth-child(5) > span > div")
|
||||||
|
// document.querySelector("#\\30 > td:nth-child(6) > span > div")
|
||||||
|
// row with no result:
|
||||||
|
// document.querySelector("#\\32 > td:nth-child(4) > span > p")
|
||||||
|
// no results:
|
||||||
|
// document.querySelector("#main-content-wrapper > section > div.noData.yf-1omxedn")
|
||||||
|
|
||||||
|
// Using a wrapper to ensure the result is properly captured
|
||||||
|
var extractionResult = (function() {
|
||||||
try {
|
try {
|
||||||
// Check for "No results found" message
|
// Check for "No results found" message using exact selector
|
||||||
const noDataElement = document.querySelector('.noData');
|
const noDataElement = document.querySelector('#main-content-wrapper > section > div.noData.yf-1omxedn');
|
||||||
if (noDataElement) {
|
if (noDataElement) {
|
||||||
return { status: 'no_results', ticker: null, sector: null, exchange: null };
|
return { status: 'no_results', ticker: null, sector: null, exchange: null };
|
||||||
}
|
}
|
||||||
|
|
||||||
// Find the results table
|
// Find the results table using exact selector
|
||||||
const table = document.querySelector('table.markets-table');
|
const table = document.querySelector('#main-content-wrapper > section > section.container.yf-1omxedn > div.tableContainer.yf-1omxedn > div > table');
|
||||||
if (!table) {
|
if (!table) {
|
||||||
return { status: 'no_results', ticker: null, sector: null, exchange: null };
|
return { status: 'no_results', ticker: null, sector: null, exchange: null };
|
||||||
}
|
}
|
||||||
|
|
||||||
// Find the first row in tbody
|
// Find all rows in tbody
|
||||||
const firstRow = table.querySelector('tbody tr');
|
const allRows = table.querySelectorAll('tbody tr');
|
||||||
if (!firstRow) {
|
if (!allRows || allRows.length === 0) {
|
||||||
return { status: 'no_results', ticker: null, sector: null, exchange: null };
|
return { status: 'no_results', ticker: null, sector: null, exchange: null };
|
||||||
}
|
}
|
||||||
|
|
||||||
// Extract ticker from first column (td:nth-child(1))
|
// Helper function to safely extract text content
|
||||||
const tickerCell = firstRow.querySelector('td:nth-child(1)');
|
function extractText(element) {
|
||||||
const ticker = tickerCell ? tickerCell.textContent.trim() : '';
|
if (!element) return '';
|
||||||
|
const text = element.textContent.trim();
|
||||||
if (!ticker) {
|
return text;
|
||||||
return { status: 'not_found', ticker: null, sector: null, exchange: null };
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Extract sector from column 4 (td:nth-child(4) > span > div > a)
|
// Helper function to check if a cell actually contains data
|
||||||
const sectorCell = firstRow.querySelector('td:nth-child(4) span div a');
|
// Multiple indicators are used to determine if data is present
|
||||||
let sector = sectorCell ? sectorCell.textContent.trim() : '';
|
function hasValidData(cellElement) {
|
||||||
|
if (!cellElement) return false;
|
||||||
|
|
||||||
// Normalize empty/invalid values to null
|
// Indicator 1: Check if the cell contains a <p> tag (Yahoo uses this for "no data")
|
||||||
if (!sector || sector === '-' || sector === 'N/A') {
|
const pTag = cellElement.querySelector('p');
|
||||||
sector = null;
|
if (pTag) return false;
|
||||||
|
|
||||||
|
// Indicator 2: Check the direct child structure
|
||||||
|
// Valid data cells have: td > span > div or td > span > div > a
|
||||||
|
// Invalid data cells have: td > span > p
|
||||||
|
const span = cellElement.querySelector('span');
|
||||||
|
if (span) {
|
||||||
|
const directChildren = Array.from(span.children);
|
||||||
|
// If the only or first child is a <p>, it's likely "no data"
|
||||||
|
if (directChildren.length > 0 && directChildren[0].tagName === 'P') {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Extract exchange from column 6 (td:nth-child(6) > span)
|
// Indicator 3: Check text content
|
||||||
const exchangeCell = firstRow.querySelector('td:nth-child(6) span');
|
const text = extractText(cellElement);
|
||||||
let exchange = exchangeCell ? exchangeCell.textContent.trim() : '';
|
if (!text) return false;
|
||||||
|
const normalized = text.toLowerCase().trim();
|
||||||
|
|
||||||
// Normalize empty/invalid values to null
|
// Common "no data" indicators
|
||||||
if (!exchange || exchange === '-' || exchange === 'N/A') {
|
const noDataIndicators = [
|
||||||
exchange = null;
|
'-',
|
||||||
|
'n/a',
|
||||||
|
'na',
|
||||||
|
'none',
|
||||||
|
'not available',
|
||||||
|
'no data',
|
||||||
|
'--',
|
||||||
|
'—', // em dash
|
||||||
|
'–', // en dash
|
||||||
|
];
|
||||||
|
|
||||||
|
if (noDataIndicators.includes(normalized)) {
|
||||||
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Indicator 4: Check for common CSS classes that indicate empty state
|
||||||
|
const classIndicators = ['empty', 'no-data', 'na', 'null', 'undefined'];
|
||||||
|
const classList = cellElement.className || '';
|
||||||
|
for (const indicator of classIndicators) {
|
||||||
|
if (classList.includes(indicator)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Indicator 5: Check if cell has an anchor tag (usually indicates real data)
|
||||||
|
const hasLink = cellElement.querySelector('a') !== null;
|
||||||
|
|
||||||
|
// Indicator 6: Check if there's actual substantial content
|
||||||
|
// If text is very short (1-2 chars) and not alphanumeric, it's likely not real data
|
||||||
|
if (text.length <= 2 && !/[a-zA-Z0-9]/.test(text)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// If we passed all checks, consider it valid data
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Helper function to extract and normalize data from a cell
|
||||||
|
function extractCellData(cellElement) {
|
||||||
|
if (!cellElement) return null;
|
||||||
|
if (!hasValidData(cellElement)) return null;
|
||||||
|
|
||||||
|
const text = extractText(cellElement);
|
||||||
|
return text || null;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Helper function to extract and normalize data from a row
|
||||||
|
function extractRowData(row) {
|
||||||
|
// Extract ticker from column 1 (td:nth-child(1))
|
||||||
|
const tickerCell = row.querySelector('td:nth-child(1)');
|
||||||
|
const ticker = extractCellData(tickerCell);
|
||||||
|
|
||||||
|
// Extract sector from column 4 (td:nth-child(4))
|
||||||
|
const sectorCell = row.querySelector('td:nth-child(4)');
|
||||||
|
const sector = extractCellData(sectorCell);
|
||||||
|
|
||||||
|
// Extract exchange from column 6 (td:nth-child(6))
|
||||||
|
const exchangeCell = row.querySelector('td:nth-child(6)');
|
||||||
|
const exchange = extractCellData(exchangeCell);
|
||||||
|
|
||||||
|
return { ticker, sector, exchange };
|
||||||
|
}
|
||||||
|
|
||||||
|
// Helper function to count non-null fields (data completeness counter)
|
||||||
|
function countValidFields(data) {
|
||||||
|
let count = 0;
|
||||||
|
if (data.ticker) count++;
|
||||||
|
if (data.sector) count++;
|
||||||
|
if (data.exchange) count++;
|
||||||
|
return count;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Helper function to score a row (prioritize rows with more complete data)
|
||||||
|
function scoreRow(data) {
|
||||||
|
let score = 0;
|
||||||
|
|
||||||
|
// Ticker is mandatory and gets highest weight
|
||||||
|
if (data.ticker) score += 100;
|
||||||
|
|
||||||
|
// Sector and exchange are nice-to-have
|
||||||
|
if (data.sector) score += 10;
|
||||||
|
if (data.exchange) score += 10;
|
||||||
|
|
||||||
|
return score;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Extract data from all rows and find the one with most complete data
|
||||||
|
let bestRow = null;
|
||||||
|
let maxScore = -1;
|
||||||
|
let rowIndex = 0;
|
||||||
|
|
||||||
|
for (const row of allRows) {
|
||||||
|
const data = extractRowData(row);
|
||||||
|
const score = scoreRow(data);
|
||||||
|
|
||||||
|
// Select row with highest score (most complete data)
|
||||||
|
// If tied, first row wins
|
||||||
|
if (score > maxScore) {
|
||||||
|
bestRow = data;
|
||||||
|
maxScore = score;
|
||||||
|
bestRow.rowIndex = rowIndex;
|
||||||
|
bestRow.validFieldCount = countValidFields(data);
|
||||||
|
bestRow.score = score;
|
||||||
|
}
|
||||||
|
|
||||||
|
rowIndex++;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Ticker is mandatory - return error status if not found
|
||||||
|
if (!bestRow || !bestRow.ticker) {
|
||||||
|
return {
|
||||||
|
status: 'error',
|
||||||
|
error_message: 'No ticker found in any row',
|
||||||
|
ticker: null,
|
||||||
|
sector: null,
|
||||||
|
exchange: null
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
// Return success with ticker (mandatory) and optional sector/exchange
|
||||||
|
// Include metadata about which row was selected and how many valid fields it had
|
||||||
return {
|
return {
|
||||||
status: 'found',
|
status: 'found',
|
||||||
ticker: ticker,
|
ticker: bestRow.ticker,
|
||||||
sector: sector,
|
sector: bestRow.sector,
|
||||||
exchange: exchange
|
exchange: bestRow.exchange,
|
||||||
|
metadata: {
|
||||||
|
selectedRowIndex: bestRow.rowIndex,
|
||||||
|
validFieldCount: bestRow.validFieldCount,
|
||||||
|
score: bestRow.score,
|
||||||
|
totalRows: allRows.length
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
|
// Only catch unexpected errors during extraction
|
||||||
return {
|
return {
|
||||||
status: 'error',
|
status: 'error',
|
||||||
error_message: error.toString(),
|
error_message: error.toString(),
|
||||||
@@ -65,3 +218,6 @@
|
|||||||
};
|
};
|
||||||
}
|
}
|
||||||
})();
|
})();
|
||||||
|
|
||||||
|
// Return the result explicitly
|
||||||
|
return extractionResult;
|
||||||
@@ -1,6 +1,6 @@
|
|||||||
// src/economic/scraper.rs
|
// src/economic/scraper.rs
|
||||||
use super::types::{EconomicEvent};
|
use super::types::{EconomicEvent};
|
||||||
use event_backtest_engine::logger;
|
use crate::logger;
|
||||||
use fantoccini::Client;
|
use fantoccini::Client;
|
||||||
use tokio::time::{sleep, Duration};
|
use tokio::time::{sleep, Duration};
|
||||||
|
|
||||||
|
|||||||
@@ -84,7 +84,7 @@ pub async fn load_events_in_batches(
|
|||||||
Ok(all_events.into_iter())
|
Ok(all_events.into_iter())
|
||||||
}
|
}
|
||||||
|
|
||||||
/// NEW: Build a lightweight index instead of loading all events
|
/// Build a lightweight index instead of loading all events
|
||||||
#[derive(Debug, Clone)]
|
#[derive(Debug, Clone)]
|
||||||
pub struct EventIndex {
|
pub struct EventIndex {
|
||||||
pub key: String,
|
pub key: String,
|
||||||
|
|||||||
@@ -6,10 +6,12 @@
|
|||||||
pub mod config;
|
pub mod config;
|
||||||
pub mod scraper;
|
pub mod scraper;
|
||||||
pub mod util;
|
pub mod util;
|
||||||
|
pub mod monitoring;
|
||||||
|
pub mod economic;
|
||||||
|
pub mod corporate;
|
||||||
|
|
||||||
// Re-export commonly used types for convenience
|
// Re-export commonly used types for convenience
|
||||||
|
pub use monitoring::{init_monitoring, ConfigSnapshot, MonitoringEvent};
|
||||||
pub use config::Config;
|
pub use config::Config;
|
||||||
pub use scraper::webdriver::{ChromeDriverPool, ChromeInstance, ScrapeTask};
|
pub use scraper::webdriver::{ChromeDriverPool, ChromeInstance, ScrapeTask};
|
||||||
pub use util::directories::DataPaths;
|
|
||||||
pub use util::logger;
|
pub use util::logger;
|
||||||
pub use util::opnv;
|
|
||||||
77
src/main.rs
77
src/main.rs
@@ -1,30 +1,70 @@
|
|||||||
// src/main.rs
|
// src/main.rs
|
||||||
|
|
||||||
mod config;
|
use web_scraper::{*, scraper, economic, corporate};
|
||||||
mod corporate;
|
|
||||||
mod economic;
|
|
||||||
mod util;
|
|
||||||
mod scraper;
|
|
||||||
|
|
||||||
use anyhow::Result;
|
use anyhow::Result;
|
||||||
use config::Config;
|
use web_scraper::config::Config;
|
||||||
use scraper::docker_vpn_proxy::{DockerVpnProxyPool, cleanup_all_proxy_containers};
|
use scraper::docker_vpn_proxy::{DockerVpnProxyPool, cleanup_all_proxy_containers};
|
||||||
use scraper::webdriver::ChromeDriverPool;
|
use scraper::webdriver::ChromeDriverPool;
|
||||||
use util::directories::DataPaths;
|
use util::directories::DataPaths;
|
||||||
use util::{logger, opnv};
|
use util::{logger, opnv};
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
use std::sync::atomic::{AtomicBool, Ordering};
|
use std::sync::atomic::{AtomicBool, Ordering};
|
||||||
|
use std::process::Command;
|
||||||
|
|
||||||
#[tokio::main]
|
#[tokio::main]
|
||||||
async fn main() -> Result<()> {
|
async fn main() -> Result<()> {
|
||||||
|
let output = if cfg!(target_os = "windows") {
|
||||||
|
Command::new("cmd")
|
||||||
|
.args(["/C", "docker desktop start"])
|
||||||
|
.output()
|
||||||
|
.expect("failed to execute process")
|
||||||
|
} else {
|
||||||
|
Command::new("sh")
|
||||||
|
.arg("-c")
|
||||||
|
.arg("echo hello")
|
||||||
|
.output()
|
||||||
|
.expect("failed to execute process")
|
||||||
|
};
|
||||||
|
let _start_docker_desktop = output.stdout;
|
||||||
|
|
||||||
cleanup_all_proxy_containers().await.ok();
|
cleanup_all_proxy_containers().await.ok();
|
||||||
|
|
||||||
let config = Config::load().map_err(|err| {
|
let config = match Config::load() {
|
||||||
eprintln!("Failed to load config: {}", err);
|
Ok(cfg) => cfg,
|
||||||
err
|
Err(_) => {
|
||||||
})?;
|
eprintln!("Using default configuration");
|
||||||
|
Config::default()
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
let paths = DataPaths::new(".")?;
|
let paths = DataPaths::new(".")?;
|
||||||
|
|
||||||
|
// Initialize monitoring system
|
||||||
|
let config_snapshot = ConfigSnapshot {
|
||||||
|
max_parallel_instances: config.max_parallel_instances,
|
||||||
|
max_tasks_per_instance: config.max_tasks_per_instance,
|
||||||
|
enable_vpn_rotation: config.enable_vpn_rotation,
|
||||||
|
max_requests_per_session: config.max_requests_per_session,
|
||||||
|
min_request_interval_ms: config.min_request_interval_ms,
|
||||||
|
max_retry_attempts: config.max_retry_attempts,
|
||||||
|
};
|
||||||
|
|
||||||
|
let (monitoring_handle, _monitoring_task) = init_monitoring(
|
||||||
|
config_snapshot,
|
||||||
|
paths.logs_dir().to_path_buf(),
|
||||||
|
3030, // Dashboard port
|
||||||
|
).await?;
|
||||||
|
|
||||||
|
// Emit pool initialization event
|
||||||
|
monitoring_handle.emit(monitoring::MonitoringEvent::PoolInitialized {
|
||||||
|
pool_size: config.max_parallel_instances,
|
||||||
|
with_proxy: config.enable_vpn_rotation,
|
||||||
|
with_rotation: config.max_tasks_per_instance > 0,
|
||||||
|
});
|
||||||
|
|
||||||
|
logger::log_info("Monitoring dashboard available at http://localhost:3030").await;
|
||||||
|
|
||||||
logger::init_debug_logger(paths.logs_dir()).await.ok();
|
logger::init_debug_logger(paths.logs_dir()).await.ok();
|
||||||
logger::log_info("=== Event Backtest Engine Started ===").await;
|
logger::log_info("=== Event Backtest Engine Started ===").await;
|
||||||
logger::log_info(&format!(
|
logger::log_info(&format!(
|
||||||
@@ -40,7 +80,8 @@ async fn main() -> Result<()> {
|
|||||||
// === Step 1: Fetch VPNBook configs ===
|
// === Step 1: Fetch VPNBook configs ===
|
||||||
let proxy_pool: Option<Arc<DockerVpnProxyPool>> = if config.enable_vpn_rotation {
|
let proxy_pool: Option<Arc<DockerVpnProxyPool>> = if config.enable_vpn_rotation {
|
||||||
logger::log_info("VPN Rotation Enabled – Fetching latest VPNBook configs").await;
|
logger::log_info("VPN Rotation Enabled – Fetching latest VPNBook configs").await;
|
||||||
let temp_pool = Arc::new(ChromeDriverPool::new_with_proxy_and_task_limit(config.max_parallel_instances, None, config.max_tasks_per_instance).await?);
|
let temp_pool = Arc::new(ChromeDriverPool::new_with_proxy_and_task_limit(None, &config, Some(monitoring_handle.clone())).await?);
|
||||||
|
|
||||||
let (username, password, _files) = opnv::fetch_vpnbook_configs(&temp_pool, paths.cache_dir()).await?;
|
let (username, password, _files) = opnv::fetch_vpnbook_configs(&temp_pool, paths.cache_dir()).await?;
|
||||||
logger::log_info(&format!("VPNBook credentials → User: {}", username)).await;
|
logger::log_info(&format!("VPNBook credentials → User: {}", username)).await;
|
||||||
|
|
||||||
@@ -56,6 +97,16 @@ async fn main() -> Result<()> {
|
|||||||
let pp = Arc::new(DockerVpnProxyPool::new(paths.cache_openvpn_dir(), username, password).await?);
|
let pp = Arc::new(DockerVpnProxyPool::new(paths.cache_openvpn_dir(), username, password).await?);
|
||||||
|
|
||||||
logger::log_info(&format!("All {} Docker proxy containers started and ready", pp.num_proxies())).await;
|
logger::log_info(&format!("All {} Docker proxy containers started and ready", pp.num_proxies())).await;
|
||||||
|
for i in 0..pp.num_proxies() {
|
||||||
|
if let Some(proxy_info) = pp.get_proxy_info(i) {
|
||||||
|
monitoring_handle.emit(monitoring::MonitoringEvent::ProxyConnected {
|
||||||
|
container_name: proxy_info.container_name.clone(),
|
||||||
|
ip_address: proxy_info.ip_address.clone(),
|
||||||
|
port: proxy_info.port,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
Some(pp)
|
Some(pp)
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
@@ -71,9 +122,9 @@ async fn main() -> Result<()> {
|
|||||||
|
|
||||||
let pool = Arc::new(
|
let pool = Arc::new(
|
||||||
if task_limit > 0 {
|
if task_limit > 0 {
|
||||||
ChromeDriverPool::new_with_proxy_and_task_limit(pool_size, proxy_pool.clone(), task_limit).await?
|
ChromeDriverPool::new_with_proxy_and_task_limit(proxy_pool.clone(), &config, Some(monitoring_handle.clone())).await?
|
||||||
} else {
|
} else {
|
||||||
ChromeDriverPool::new_with_proxy(pool_size, proxy_pool.clone()).await?
|
ChromeDriverPool::new_with_proxy_and_task_limit(proxy_pool.clone(), &config, Some(monitoring_handle.clone())).await?
|
||||||
}
|
}
|
||||||
);
|
);
|
||||||
|
|
||||||
|
|||||||
644
src/monitoring/dashboard.html
Normal file
644
src/monitoring/dashboard.html
Normal file
@@ -0,0 +1,644 @@
|
|||||||
|
<!DOCTYPE html>
|
||||||
|
<html lang="en">
|
||||||
|
<head>
|
||||||
|
<meta charset="UTF-8">
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||||
|
<title>Scraper Monitoring Dashboard</title>
|
||||||
|
<style>
|
||||||
|
* {
|
||||||
|
margin: 0;
|
||||||
|
padding: 0;
|
||||||
|
box-sizing: border-box;
|
||||||
|
}
|
||||||
|
|
||||||
|
body {
|
||||||
|
font-family: 'Courier New', monospace;
|
||||||
|
background: #1a1a1a;
|
||||||
|
color: #f0f0f0;
|
||||||
|
padding: 20px;
|
||||||
|
font-size: 13px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.header {
|
||||||
|
text-align: center;
|
||||||
|
padding: 20px;
|
||||||
|
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
|
||||||
|
border-radius: 8px;
|
||||||
|
margin-bottom: 20px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.header h1 {
|
||||||
|
font-size: 28px;
|
||||||
|
margin-bottom: 5px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.header .uptime {
|
||||||
|
font-size: 14px;
|
||||||
|
opacity: 0.9;
|
||||||
|
}
|
||||||
|
|
||||||
|
.section {
|
||||||
|
background: #2a2a2a;
|
||||||
|
border: 2px solid #444;
|
||||||
|
padding: 15px;
|
||||||
|
margin-bottom: 20px;
|
||||||
|
border-radius: 5px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.section-title {
|
||||||
|
font-size: 16px;
|
||||||
|
font-weight: bold;
|
||||||
|
margin-bottom: 12px;
|
||||||
|
padding-bottom: 8px;
|
||||||
|
border-bottom: 2px solid #667eea;
|
||||||
|
display: flex;
|
||||||
|
align-items: center;
|
||||||
|
gap: 8px;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Config Section */
|
||||||
|
.config-grid {
|
||||||
|
display: grid;
|
||||||
|
grid-template-columns: repeat(3, 1fr);
|
||||||
|
gap: 15px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.config-item {
|
||||||
|
background: #333;
|
||||||
|
padding: 12px;
|
||||||
|
border-radius: 4px;
|
||||||
|
border-left: 3px solid #667eea;
|
||||||
|
}
|
||||||
|
|
||||||
|
.config-label {
|
||||||
|
color: #888;
|
||||||
|
font-size: 11px;
|
||||||
|
text-transform: uppercase;
|
||||||
|
margin-bottom: 5px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.config-value {
|
||||||
|
color: #4CAF50;
|
||||||
|
font-size: 18px;
|
||||||
|
font-weight: bold;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Instance Grid */
|
||||||
|
.instance-grid {
|
||||||
|
display: grid;
|
||||||
|
grid-template-columns: repeat(auto-fit, minmax(450px, 1fr));
|
||||||
|
gap: 15px;
|
||||||
|
margin-top: 10px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.instance-box {
|
||||||
|
background: #333;
|
||||||
|
border: 2px solid #555;
|
||||||
|
border-radius: 5px;
|
||||||
|
padding: 0;
|
||||||
|
display: flex;
|
||||||
|
gap: 0;
|
||||||
|
overflow: hidden;
|
||||||
|
transition: border-color 0.3s;
|
||||||
|
}
|
||||||
|
|
||||||
|
.instance-box.status-idle {
|
||||||
|
border-color: #666;
|
||||||
|
}
|
||||||
|
|
||||||
|
.instance-box.status-active {
|
||||||
|
border-color: #4CAF50;
|
||||||
|
box-shadow: 0 0 10px rgba(76, 175, 80, 0.3);
|
||||||
|
}
|
||||||
|
|
||||||
|
.instance-box.status-renewing {
|
||||||
|
border-color: #FF9800;
|
||||||
|
box-shadow: 0 0 10px rgba(255, 152, 0, 0.3);
|
||||||
|
}
|
||||||
|
|
||||||
|
.instance-box.status-error {
|
||||||
|
border-color: #f44336;
|
||||||
|
box-shadow: 0 0 10px rgba(244, 67, 54, 0.3);
|
||||||
|
}
|
||||||
|
|
||||||
|
.instance-side,
|
||||||
|
.proxy-side {
|
||||||
|
flex: 1;
|
||||||
|
padding: 12px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.instance-side {
|
||||||
|
background: #3a3a3a;
|
||||||
|
border-right: 1px solid #555;
|
||||||
|
}
|
||||||
|
|
||||||
|
.proxy-side {
|
||||||
|
background: #2a3a4a;
|
||||||
|
}
|
||||||
|
|
||||||
|
.side-header {
|
||||||
|
font-weight: bold;
|
||||||
|
font-size: 14px;
|
||||||
|
margin-bottom: 10px;
|
||||||
|
padding-bottom: 5px;
|
||||||
|
border-bottom: 1px solid #555;
|
||||||
|
display: flex;
|
||||||
|
align-items: center;
|
||||||
|
gap: 5px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.status-badge {
|
||||||
|
display: inline-block;
|
||||||
|
padding: 2px 8px;
|
||||||
|
border-radius: 3px;
|
||||||
|
font-size: 11px;
|
||||||
|
font-weight: bold;
|
||||||
|
text-transform: uppercase;
|
||||||
|
}
|
||||||
|
|
||||||
|
.status-badge.idle {
|
||||||
|
background: #666;
|
||||||
|
color: #fff;
|
||||||
|
}
|
||||||
|
|
||||||
|
.status-badge.active {
|
||||||
|
background: #4CAF50;
|
||||||
|
color: #fff;
|
||||||
|
}
|
||||||
|
|
||||||
|
.status-badge.renewing {
|
||||||
|
background: #FF9800;
|
||||||
|
color: #fff;
|
||||||
|
}
|
||||||
|
|
||||||
|
.status-badge.error {
|
||||||
|
background: #f44336;
|
||||||
|
color: #fff;
|
||||||
|
}
|
||||||
|
|
||||||
|
.metric-row {
|
||||||
|
display: flex;
|
||||||
|
justify-content: space-between;
|
||||||
|
padding: 4px 0;
|
||||||
|
font-size: 12px;
|
||||||
|
border-bottom: 1px solid #444;
|
||||||
|
}
|
||||||
|
|
||||||
|
.metric-row:last-child {
|
||||||
|
border-bottom: none;
|
||||||
|
}
|
||||||
|
|
||||||
|
.metric-label {
|
||||||
|
color: #888;
|
||||||
|
}
|
||||||
|
|
||||||
|
.metric-value {
|
||||||
|
color: #4CAF50;
|
||||||
|
font-weight: bold;
|
||||||
|
}
|
||||||
|
|
||||||
|
.metric-value.warning {
|
||||||
|
color: #FF9800;
|
||||||
|
}
|
||||||
|
|
||||||
|
.metric-value.danger {
|
||||||
|
color: #f44336;
|
||||||
|
}
|
||||||
|
|
||||||
|
.current-url {
|
||||||
|
margin-top: 8px;
|
||||||
|
padding-top: 8px;
|
||||||
|
border-top: 1px solid #555;
|
||||||
|
font-size: 11px;
|
||||||
|
color: #aaa;
|
||||||
|
word-wrap: break-word;
|
||||||
|
}
|
||||||
|
|
||||||
|
.no-proxy {
|
||||||
|
text-align: center;
|
||||||
|
color: #666;
|
||||||
|
padding: 30px 10px;
|
||||||
|
font-style: italic;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Global Stats */
|
||||||
|
.stats-grid {
|
||||||
|
display: grid;
|
||||||
|
grid-template-columns: repeat(auto-fit, minmax(150px, 1fr));
|
||||||
|
gap: 12px;
|
||||||
|
margin-top: 10px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.stat-box {
|
||||||
|
background: #333;
|
||||||
|
padding: 15px;
|
||||||
|
border-radius: 5px;
|
||||||
|
text-align: center;
|
||||||
|
border-left: 4px solid #667eea;
|
||||||
|
}
|
||||||
|
|
||||||
|
.stat-value {
|
||||||
|
font-size: 28px;
|
||||||
|
font-weight: bold;
|
||||||
|
color: #4CAF50;
|
||||||
|
margin-bottom: 5px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.stat-label {
|
||||||
|
font-size: 11px;
|
||||||
|
color: #888;
|
||||||
|
text-transform: uppercase;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Logs */
|
||||||
|
.log-container {
|
||||||
|
max-height: 300px;
|
||||||
|
overflow-y: auto;
|
||||||
|
background: #1a1a1a;
|
||||||
|
padding: 10px;
|
||||||
|
border-radius: 4px;
|
||||||
|
font-size: 12px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.log-container::-webkit-scrollbar {
|
||||||
|
width: 8px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.log-container::-webkit-scrollbar-track {
|
||||||
|
background: #2a2a2a;
|
||||||
|
}
|
||||||
|
|
||||||
|
.log-container::-webkit-scrollbar-thumb {
|
||||||
|
background: #667eea;
|
||||||
|
border-radius: 4px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.log-entry {
|
||||||
|
padding: 4px 0;
|
||||||
|
border-bottom: 1px solid #333;
|
||||||
|
display: flex;
|
||||||
|
gap: 10px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.log-entry:last-child {
|
||||||
|
border-bottom: none;
|
||||||
|
}
|
||||||
|
|
||||||
|
.log-time {
|
||||||
|
color: #666;
|
||||||
|
font-weight: bold;
|
||||||
|
min-width: 70px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.log-message {
|
||||||
|
flex: 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
.log-message.info {
|
||||||
|
color: #4CAF50;
|
||||||
|
}
|
||||||
|
|
||||||
|
.log-message.warn {
|
||||||
|
color: #FF9800;
|
||||||
|
}
|
||||||
|
|
||||||
|
.log-message.error {
|
||||||
|
color: #f44336;
|
||||||
|
}
|
||||||
|
|
||||||
|
.connection-status {
|
||||||
|
position: fixed;
|
||||||
|
top: 20px;
|
||||||
|
right: 20px;
|
||||||
|
padding: 8px 15px;
|
||||||
|
border-radius: 20px;
|
||||||
|
font-size: 12px;
|
||||||
|
font-weight: bold;
|
||||||
|
z-index: 1000;
|
||||||
|
}
|
||||||
|
|
||||||
|
.connection-status.connected {
|
||||||
|
background: #4CAF50;
|
||||||
|
color: white;
|
||||||
|
}
|
||||||
|
|
||||||
|
.connection-status.disconnected {
|
||||||
|
background: #f44336;
|
||||||
|
color: white;
|
||||||
|
}
|
||||||
|
|
||||||
|
@keyframes pulse {
|
||||||
|
0%, 100% {
|
||||||
|
opacity: 1;
|
||||||
|
}
|
||||||
|
50% {
|
||||||
|
opacity: 0.5;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
.pulse {
|
||||||
|
animation: pulse 2s infinite;
|
||||||
|
}
|
||||||
|
</style>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<div class="connection-status" id="connection-status">
|
||||||
|
Connecting...
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="header">
|
||||||
|
<h1>🚀 Scraper Monitoring Dashboard</h1>
|
||||||
|
<div class="uptime" id="uptime">Uptime: Loading...</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Config Section -->
|
||||||
|
<div class="section">
|
||||||
|
<div class="section-title">⚙️ CONFIGURATION</div>
|
||||||
|
<div class="config-grid" id="config"></div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Pool Status Section -->
|
||||||
|
<div class="section">
|
||||||
|
<div class="section-title">🔧 POOL STATUS</div>
|
||||||
|
<div class="instance-grid" id="instances"></div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Global Metrics Section -->
|
||||||
|
<div class="section">
|
||||||
|
<div class="section-title">📊 GLOBAL METRICS</div>
|
||||||
|
<div class="stats-grid" id="global-stats"></div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Logs Section -->
|
||||||
|
<div class="section">
|
||||||
|
<div class="section-title">📝 RECENT LOGS</div>
|
||||||
|
<div class="log-container" id="logs"></div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<script>
|
||||||
|
let ws = null;
|
||||||
|
let reconnectInterval = null;
|
||||||
|
|
||||||
|
// Open the dashboard WebSocket and wire up its lifecycle handlers.
// After a drop, retries every 3 seconds until a connection succeeds;
// a successful (re)connect cancels the retry loop.
function connect() {
    ws = new WebSocket('ws://' + window.location.host + '/ws');

    ws.onopen = () => {
        console.log('WebSocket connected');
        updateConnectionStatus(true);
        // Cancel any pending retry loop once connected.
        if (reconnectInterval) {
            clearInterval(reconnectInterval);
            reconnectInterval = null;
        }
    };

    ws.onmessage = (event) => {
        try {
            // The server pushes the full dashboard state as one JSON document.
            const state = JSON.parse(event.data);
            updateDashboard(state);
        } catch (error) {
            console.error('Failed to parse message:', error);
        }
    };

    ws.onclose = () => {
        console.log('WebSocket disconnected');
        updateConnectionStatus(false);
        // Attempt to reconnect every 3 seconds
        if (!reconnectInterval) {
            reconnectInterval = setInterval(() => {
                console.log('Attempting to reconnect...');
                connect();
            }, 3000);
        }
    };

    ws.onerror = (error) => {
        console.error('WebSocket error:', error);
    };
}
|
||||||
|
|
||||||
|
// Reflect the WebSocket state in the fixed badge at the top-right corner.
// Disconnected state also gets the pulsing animation class.
function updateConnectionStatus(connected) {
    const badge = document.getElementById('connection-status');
    badge.textContent = connected ? '● Connected' : '● Disconnected';
    badge.className = connected
        ? 'connection-status connected'
        : 'connection-status disconnected pulse';
}
|
||||||
|
|
||||||
|
// Fan one full dashboard-state document out to the four page panels.
function updateDashboard(state) {
    updateConfig(state.config);
    updateInstances(state.instances);
    updateGlobalStats(state.global);
    updateLogs(state.logs);
}
|
||||||
|
|
||||||
|
// Render the CONFIGURATION panel. Values come straight from the server's
// ConfigSnapshot; none are user-controlled, so no HTML escaping is needed.
function updateConfig(config) {
    const container = document.getElementById('config');
    container.innerHTML = `
        <div class="config-item">
            <div class="config-label">Parallel Instances</div>
            <div class="config-value">${config.max_parallel_instances}</div>
        </div>
        <div class="config-item">
            <div class="config-label">Tasks per Instance</div>
            <div class="config-value">${config.max_tasks_per_instance || 'Unlimited'}</div>
        </div>
        <div class="config-item">
            <div class="config-label">VPN Rotation</div>
            <div class="config-value">${config.enable_vpn_rotation ? '✓ Enabled' : '✗ Disabled'}</div>
        </div>
        <div class="config-item">
            <div class="config-label">Requests per Session</div>
            <div class="config-value">${config.max_requests_per_session}</div>
        </div>
        <div class="config-item">
            <div class="config-label">Min Request Interval</div>
            <div class="config-value">${config.min_request_interval_ms}ms</div>
        </div>
        <div class="config-item">
            <div class="config-label">Max Retry Attempts</div>
            <div class="config-value">${config.max_retry_attempts}</div>
        </div>
    `;
}
|
||||||
|
|
||||||
|
// Render the POOL STATUS panel: one two-sided card per ChromeDriver instance,
// left = instance metrics, right = its proxy (or a "Direct Connection" note).
function updateInstances(instances) {
    const container = document.getElementById('instances');
    if (!instances || instances.length === 0) {
        container.innerHTML = '<div style="text-align: center; padding: 40px; color: #666;">No instances available</div>';
        return;
    }

    container.innerHTML = instances.map(inst => {
        // The status string drives the card's border colour via .status-* CSS.
        const statusClass = `status-${inst.status}`;
        const proxy = inst.connected_proxy;

        // NOTE(review): toFixed() yields a string; the `successRate < 50`
        // comparisons below rely on JS string→number coercion. Works, but
        // fragile — confirm intent.
        const successRate = inst.total_requests > 0
            ? ((inst.success_count / inst.total_requests) * 100).toFixed(1)
            : '0.0';

        return `
            <div class="instance-box ${statusClass}">
                <div class="instance-side">
                    <div class="side-header">
                        🖥️ Instance #${inst.id}
                        <span class="status-badge ${inst.status}">${inst.status}</span>
                    </div>
                    <div class="metric-row">
                        <span class="metric-label">Current Tasks</span>
                        <span class="metric-value ${inst.tasks_current_session >= inst.tasks_max ? 'warning' : ''}">
                            ${inst.tasks_current_session}/${inst.tasks_max}
                        </span>
                    </div>
                    <div class="metric-row">
                        <span class="metric-label">Session Requests</span>
                        <span class="metric-value">${inst.session_requests}</span>
                    </div>
                    <div class="metric-row">
                        <span class="metric-label">Total Requests</span>
                        <span class="metric-value">${inst.total_requests}</span>
                    </div>
                    <div class="metric-row">
                        <span class="metric-label">Success / Fail</span>
                        <span class="metric-value">${inst.success_count} / ${inst.failure_count}</span>
                    </div>
                    <div class="metric-row">
                        <span class="metric-label">Success Rate</span>
                        <span class="metric-value ${successRate < 50 ? 'danger' : successRate < 80 ? 'warning' : ''}">
                            ${successRate}%
                        </span>
                    </div>
                    <div class="metric-row">
                        <span class="metric-label">Last Activity</span>
                        <span class="metric-value">${inst.last_activity}</span>
                    </div>
                    ${inst.current_task ? `
                    <div class="current-url">
                        <strong>Current URL:</strong><br>
                        ${escapeHtml(inst.current_task)}
                    </div>
                    ` : ''}
                </div>

                ${proxy ? `
                <div class="proxy-side">
                    <div class="side-header">
                        📡 ${proxy.container_name}
                        <span class="status-badge ${proxy.status}">${proxy.status}</span>
                    </div>
                    <div class="metric-row">
                        <span class="metric-label">IP Address</span>
                        <span class="metric-value">${proxy.ip_address}</span>
                    </div>
                    <div class="metric-row">
                        <span class="metric-label">Port</span>
                        <span class="metric-value">${proxy.port}</span>
                    </div>
                    <div class="metric-row">
                        <span class="metric-label">Status</span>
                        <span class="metric-value">${proxy.status}</span>
                    </div>
                </div>
                ` : `
                <div class="proxy-side">
                    <div class="no-proxy">
                        🌐<br>
                        Direct Connection<br>
                        (No Proxy)
                    </div>
                </div>
                `}
            </div>
        `;
    }).join('');
}
|
||||||
|
|
||||||
|
// Render the GLOBAL METRICS panel and refresh the header's uptime line.
// success_rate arrives precomputed as a percentage from the server.
function updateGlobalStats(global) {
    const container = document.getElementById('global-stats');

    const uptime = document.getElementById('uptime');
    uptime.textContent = `Uptime: ${formatUptime(global.uptime_seconds)}`;

    container.innerHTML = `
        <div class="stat-box">
            <div class="stat-value">${global.total_requests}</div>
            <div class="stat-label">Total Requests</div>
        </div>
        <div class="stat-box">
            <div class="stat-value">${global.success_rate.toFixed(1)}%</div>
            <div class="stat-label">Success Rate</div>
        </div>
        <div class="stat-box">
            <div class="stat-value">${global.successful_requests}</div>
            <div class="stat-label">Successful</div>
        </div>
        <div class="stat-box">
            <div class="stat-value">${global.failed_requests}</div>
            <div class="stat-label">Failed</div>
        </div>
        <div class="stat-box">
            <div class="stat-value">${global.session_renewals}</div>
            <div class="stat-label">Session Renewals</div>
        </div>
        <div class="stat-box">
            <div class="stat-value">${global.rotation_events}</div>
            <div class="stat-label">Rotation Events</div>
        </div>
        <div class="stat-box">
            <div class="stat-value">${global.navigation_timeouts}</div>
            <div class="stat-label">Timeouts</div>
        </div>
        <div class="stat-box">
            <div class="stat-value">${global.bot_detection_hits}</div>
            <div class="stat-label">Bot Detection</div>
        </div>
        <div class="stat-box">
            <div class="stat-value">${global.proxy_failures}</div>
            <div class="stat-label">Proxy Failures</div>
        </div>
    `;
}
|
||||||
|
|
||||||
|
// Rebuild the RECENT LOGS list. Auto-scrolls to the newest entry only when
// the user was already pinned to the bottom (so manual scroll-back sticks).
function updateLogs(logs) {
    const box = document.getElementById('logs');
    const pinnedToBottom = box.scrollHeight - box.scrollTop === box.clientHeight;

    const rows = logs.map(log => `
        <div class="log-entry">
            <span class="log-time">${log.timestamp}</span>
            <span class="log-message ${log.level}">${escapeHtml(log.message)}</span>
        </div>
    `);
    box.innerHTML = rows.join('');

    if (pinnedToBottom) {
        box.scrollTop = box.scrollHeight;
    }
}
|
||||||
|
|
||||||
|
// Render whole seconds as "Hh Mm Ss" (hours are not capped at 24).
function formatUptime(seconds) {
    const secs = seconds % 60;
    const minutes = Math.floor(seconds / 60) % 60;
    const hours = Math.floor(seconds / 3600);
    return `${hours}h ${minutes}m ${secs}s`;
}
|
||||||
|
|
||||||
|
// Escape HTML metacharacters so attacker-influenced text (scraped URLs, log
// messages) cannot inject markup into the dashboard DOM.
// Fix: the entity map had been de-entitized (every character mapped to
// itself), which made this function a no-op and left innerHTML injectable.
function escapeHtml(text) {
    const map = {
        '&': '&amp;',
        '<': '&lt;',
        '>': '&gt;',
        '"': '&quot;',
        "'": '&#039;'
    };
    return text.replace(/[&<>"']/g, m => map[m]);
}
|
||||||
|
|
||||||
|
// Initialize connection
|
||||||
|
connect();
|
||||||
|
</script>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
129
src/monitoring/events.rs
Normal file
129
src/monitoring/events.rs
Normal file
@@ -0,0 +1,129 @@
|
|||||||
|
// src/monitoring/events.rs
|
||||||
|
use super::metrics::ProxyInfo;
|
||||||
|
|
||||||
|
/// Events emitted by the scraper system
///
/// Emitted via the monitoring handle (`monitoring_handle.emit(...)`) by the
/// ChromeDriver pool / proxy layer and folded into the monitoring state that
/// backs the web dashboard.
#[derive(Debug, Clone)]
pub enum MonitoringEvent {
    // Pool initialization
    /// The ChromeDriver pool finished construction.
    PoolInitialized {
        pool_size: usize,
        with_proxy: bool,
        with_rotation: bool,
    },

    // Instance lifecycle
    /// A new ChromeDriver instance was spawned.
    InstanceCreated {
        instance_id: usize,
        /// Task budget before the instance is recycled.
        max_tasks: usize,
        /// `None` when the instance uses a direct connection.
        proxy: Option<ProxyInfo>,
    },

    InstanceStatusChanged {
        instance_id: usize,
        status: InstanceStatusChange,
    },

    // Task execution
    TaskStarted {
        instance_id: usize,
        url: String,
    },

    TaskCompleted {
        instance_id: usize,
        success: bool,
        duration_ms: u64,
        /// Populated only when `success == false`.
        error: Option<String>,
    },

    NavigationTimeout {
        instance_id: usize,
        url: String,
    },

    BotDetectionTriggered {
        instance_id: usize,
        url: String,
    },

    // Session management
    SessionStarted {
        instance_id: usize,
        proxy: Option<ProxyInfo>,
    },

    /// A browser session was torn down and re-created.
    SessionRenewed {
        instance_id: usize,
        /// Request count of the session that was discarded.
        old_request_count: usize,
        reason: RenewalReason,
        new_proxy: Option<ProxyInfo>,
    },

    SessionRequestIncremented {
        instance_id: usize,
        new_count: usize,
    },

    // Proxy events
    ProxyConnected {
        container_name: String,
        ip_address: String,
        port: u16,
    },

    ProxyFailed {
        container_name: String,
        error: String,
    },

    ProxyRotated {
        instance_id: usize,
        old_proxy: Option<String>,
        new_proxy: String,
    },

    // Pool rotation events
    RotationTriggered {
        /// Free-form description of what triggered the rotation.
        reason: String,
    },

    // Logging
    /// A log line destined for the dashboard's RECENT LOGS panel.
    LogMessage {
        level: LogLevel,
        message: String,
    },
}
|
||||||
|
|
||||||
|
/// Status transition payload for `MonitoringEvent::InstanceStatusChanged`.
#[derive(Debug, Clone)]
pub enum InstanceStatusChange {
    Idle,
    Active,
    Renewing,
    /// Carries a human-readable description of the failure.
    Error(String),
}
|
||||||
|
|
||||||
|
/// Why a browser session was torn down and re-created.
///
/// The `Display` impl renders the snake_case token used in logs/JSONL output.
#[derive(Debug, Clone)]
pub enum RenewalReason {
    /// Per-instance task budget exhausted.
    TaskLimit,
    /// Per-session request budget exhausted.
    RequestLimit,
    /// Renewed in response to a failure.
    Error,
    /// Explicitly requested renewal.
    Manual,
}
|
||||||
|
|
||||||
|
/// Severity for `MonitoringEvent::LogMessage`.
///
/// NOTE(review): a second, serde-serializable `LogLevel` exists in
/// `metrics.rs` with the same variants — consider unifying the two types.
#[derive(Debug, Clone)]
pub enum LogLevel {
    Info,
    Warn,
    Error,
}
|
||||||
|
|
||||||
|
impl std::fmt::Display for RenewalReason {
|
||||||
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||||
|
match self {
|
||||||
|
RenewalReason::TaskLimit => write!(f, "task_limit"),
|
||||||
|
RenewalReason::RequestLimit => write!(f, "request_limit"),
|
||||||
|
RenewalReason::Error => write!(f, "error"),
|
||||||
|
RenewalReason::Manual => write!(f, "manual"),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
103
src/monitoring/logger.rs
Normal file
103
src/monitoring/logger.rs
Normal file
@@ -0,0 +1,103 @@
|
|||||||
|
// src/monitoring/logger.rs
|
||||||
|
use super::metrics::SessionSummary;
|
||||||
|
use chrono::Local;
|
||||||
|
use std::path::PathBuf;
|
||||||
|
use tokio::fs::OpenOptions;
|
||||||
|
use tokio::io::AsyncWriteExt;
|
||||||
|
use tokio::sync::Mutex;
|
||||||
|
|
||||||
|
/// Logs session summaries to JSONL files
///
/// Writes one JSON document per line, in append mode, to a file whose name
/// carries the date of the first write (`sessions_YYYYMMDD.jsonl`).
pub struct SessionLogger {
    // Directory the JSONL files are created in (created on demand).
    log_dir: PathBuf,
    // Lazily-opened append handle; the async Mutex serializes concurrent
    // writers. Opened on the first write and then kept for the logger's
    // whole lifetime (never reopened).
    file: Mutex<Option<tokio::fs::File>>,
}
|
||||||
|
|
||||||
|
impl SessionLogger {
    /// Create a logger rooted at `log_dir`. No file is opened until the
    /// first `log_session` call.
    pub fn new(log_dir: PathBuf) -> Self {
        Self {
            log_dir,
            file: Mutex::new(None),
        }
    }

    /// Log a completed session summary
    ///
    /// Write errors are reported on stderr and otherwise swallowed so a
    /// logging failure never aborts the scraping pipeline.
    pub async fn log_session(&self, summary: &SessionSummary) {
        if let Err(e) = self.write_session(summary).await {
            eprintln!("Failed to log session: {}", e);
        }
    }

    /// Append `summary` as a single JSON line to `sessions_YYYYMMDD.jsonl`.
    async fn write_session(&self, summary: &SessionSummary) -> anyhow::Result<()> {
        // The Mutex both serializes writers and guards the lazily-opened handle.
        let mut file_guard = self.file.lock().await;

        // Open file if not already open
        // NOTE(review): the filename embeds the date at first-open time and the
        // handle is never reopened, so entries written after midnight keep
        // landing in the previous day's file. Confirm whether daily rotation
        // is required here (MetricsLogger reopens on every write and so does
        // rotate).
        if file_guard.is_none() {
            let filename = format!(
                "sessions_{}.jsonl",
                Local::now().format("%Y%m%d")
            );
            let filepath = self.log_dir.join(filename);

            // Ensure the target directory exists before the first open.
            tokio::fs::create_dir_all(&self.log_dir).await?;

            let file = OpenOptions::new()
                .create(true)
                .append(true)
                .open(&filepath)
                .await?;

            *file_guard = Some(file);
        }

        if let Some(file) = file_guard.as_mut() {
            // One JSON document per line (JSONL); flush immediately so a
            // crash loses at most the in-flight entry.
            let json_line = serde_json::to_string(summary)?;
            file.write_all(json_line.as_bytes()).await?;
            file.write_all(b"\n").await?;
            file.flush().await?;
        }

        Ok(())
    }
}
|
||||||
|
|
||||||
|
/// Logs metrics snapshots periodically
///
/// Stateless apart from the target directory: every snapshot reopens today's
/// `metrics_YYYYMMDD.jsonl` in append mode, so the output rotates daily.
pub struct MetricsLogger {
    // Directory the JSONL files are created in (created on demand).
    log_dir: PathBuf,
}
|
||||||
|
|
||||||
|
impl MetricsLogger {
|
||||||
|
pub fn new(log_dir: PathBuf) -> Self {
|
||||||
|
Self { log_dir }
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Log a metrics snapshot
|
||||||
|
pub async fn log_metrics(&self, state: &super::metrics::DashboardState) -> anyhow::Result<()> {
|
||||||
|
let filename = format!(
|
||||||
|
"metrics_{}.jsonl",
|
||||||
|
Local::now().format("%Y%m%d")
|
||||||
|
);
|
||||||
|
let filepath = self.log_dir.join(filename);
|
||||||
|
|
||||||
|
tokio::fs::create_dir_all(&self.log_dir).await?;
|
||||||
|
|
||||||
|
let mut file = OpenOptions::new()
|
||||||
|
.create(true)
|
||||||
|
.append(true)
|
||||||
|
.open(&filepath)
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
let snapshot = serde_json::json!({
|
||||||
|
"timestamp": Local::now().format("%Y-%m-%d %H:%M:%S").to_string(),
|
||||||
|
"global": state.global,
|
||||||
|
"instance_count": state.instances.len(),
|
||||||
|
"proxy_count": state.proxies.len(),
|
||||||
|
});
|
||||||
|
|
||||||
|
let json_line = serde_json::to_string(&snapshot)?;
|
||||||
|
file.write_all(json_line.as_bytes()).await?;
|
||||||
|
file.write_all(b"\n").await?;
|
||||||
|
file.flush().await?;
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
||||||
252
src/monitoring/metrics.rs
Normal file
252
src/monitoring/metrics.rs
Normal file
@@ -0,0 +1,252 @@
|
|||||||
|
// src/monitoring/metrics.rs
|
||||||
|
use serde::{Deserialize, Serialize};
|
||||||
|
use std::collections::HashMap;
|
||||||
|
use std::time::Instant;
|
||||||
|
|
||||||
|
/// Complete dashboard state sent to web clients
///
/// Serialized as one JSON document and pushed over the dashboard WebSocket.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DashboardState {
    // Effective configuration (CONFIGURATION panel).
    pub config: ConfigSnapshot,
    // Per-ChromeDriver-instance metrics (POOL STATUS panel).
    pub instances: Vec<InstanceMetrics>,
    // Per-proxy metrics; consumed by MetricsLogger (the dashboard page itself
    // only renders the proxy attached to each instance).
    pub proxies: Vec<ProxyMetrics>,
    // Aggregated counters (GLOBAL METRICS panel).
    pub global: GlobalMetrics,
    // Recent log lines (RECENT LOGS panel).
    pub logs: Vec<LogEntry>,
}
|
||||||
|
|
||||||
|
/// Snapshot of configuration settings
///
/// Mirrors the six values shown in the dashboard's CONFIGURATION panel.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ConfigSnapshot {
    pub max_parallel_instances: usize,
    // 0 is rendered as "Unlimited" by the dashboard.
    pub max_tasks_per_instance: usize,
    pub enable_vpn_rotation: bool,
    pub max_requests_per_session: usize,
    pub min_request_interval_ms: u64,
    pub max_retry_attempts: u32,
}
|
||||||
|
|
||||||
|
/// Metrics for a single ChromeDriver instance
///
/// Serializable projection of `InstanceState`, produced by
/// `MonitoringState::to_dashboard_state`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct InstanceMetrics {
    pub id: usize,
    pub status: InstanceStatus,
    // URL currently being scraped, if any.
    pub current_task: Option<String>,
    // Tasks executed in the current session vs. the recycle budget.
    pub tasks_current_session: usize,
    pub tasks_max: usize,
    pub session_requests: usize,
    pub total_requests: usize,
    pub success_count: usize,
    pub failure_count: usize,
    // None = direct connection (no proxy).
    pub connected_proxy: Option<ProxyInfo>,
    pub last_activity: String, // Timestamp
}
|
||||||
|
|
||||||
|
/// Lifecycle state of a ChromeDriver instance.
///
/// Serialized lowercase so the dashboard can splice it directly into
/// `status-*` CSS class names and badge labels.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
#[serde(rename_all = "lowercase")]
pub enum InstanceStatus {
    Idle,
    Active,
    Renewing,
    Error,
}
|
||||||
|
|
||||||
|
/// Information about a proxy connection
///
/// Identifies one Docker proxy container and its reachable endpoint.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ProxyInfo {
    // Name of the Docker container providing the tunnel.
    pub container_name: String,
    pub ip_address: String,
    pub port: u16,
    pub status: ProxyStatus,
}
|
||||||
|
|
||||||
|
/// Connection state of a proxy; serialized lowercase so the dashboard can use
/// it directly in status badges.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
#[serde(rename_all = "lowercase")]
pub enum ProxyStatus {
    Connected,
    Disconnected,
}
|
||||||
|
|
||||||
|
/// Metrics for a proxy
///
/// Serializable projection of `ProxyState`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ProxyMetrics {
    pub container_name: String,
    pub ip_address: String,
    pub port: u16,
    pub status: ProxyStatus,
    // Ids of the ChromeDriver instances currently routed through this proxy.
    pub instances_using: Vec<usize>,
}
|
||||||
|
|
||||||
|
/// Global pool metrics
///
/// Aggregated counters rendered in the dashboard's GLOBAL METRICS panel.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GlobalMetrics {
    pub total_requests: usize,
    pub successful_requests: usize,
    pub failed_requests: usize,
    // Percentage in [0, 100], precomputed at snapshot time so the dashboard
    // can render it directly with `toFixed(1)`.
    pub success_rate: f64,
    pub session_renewals: usize,
    pub rotation_events: usize,
    pub navigation_timeouts: usize,
    pub bot_detection_hits: usize,
    pub proxy_failures: usize,
    // Presumably derived from `MonitoringState::start_time` — confirm in
    // `to_dashboard_state`.
    pub uptime_seconds: u64,
}
|
||||||
|
|
||||||
|
/// Log entry for display in dashboard
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LogEntry {
    // Preformatted display timestamp string.
    pub timestamp: String,
    pub level: LogLevel,
    pub message: String,
}
|
||||||
|
|
||||||
|
/// Severity of a dashboard log entry.
///
/// Lowercase serialization doubles as the CSS class (`.log-message.info` /
/// `.warn` / `.error`) on the rendered message.
/// NOTE(review): duplicates `events::LogLevel` — consider unifying.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum LogLevel {
    Info,
    Warn,
    Error,
}
|
||||||
|
|
||||||
|
/// Internal state tracked by monitoring service
///
/// Mutable accumulator; converted to the serializable `DashboardState`
/// via `to_dashboard_state`. Presumably updated from `MonitoringEvent`s —
/// confirm in the event handler.
#[derive(Debug, Clone)]
pub struct MonitoringState {
    // Keyed by instance id.
    pub instances: HashMap<usize, InstanceState>,
    // Keyed by Docker container name.
    pub proxies: HashMap<String, ProxyState>,
    pub global: GlobalState,
    // Basis for the dashboard's uptime display.
    pub start_time: Instant,
}
|
||||||
|
|
||||||
|
/// Per-instance mutable state (internal counterpart of `InstanceMetrics`).
#[derive(Debug, Clone)]
pub struct InstanceState {
    pub id: usize,
    pub status: InstanceStatus,
    // URL currently being scraped, if any.
    pub current_task: Option<String>,
    pub tasks_current_session: usize,
    pub tasks_max: usize,
    pub session_requests: usize,
    pub total_requests: usize,
    pub success_count: usize,
    pub failure_count: usize,
    pub connected_proxy: Option<ProxyInfo>,
    // Kept as an Instant internally; formatted to a string when converted
    // to `InstanceMetrics` (see `format_timestamp` in `to_dashboard_state`).
    pub last_activity: Instant,
}
|
||||||
|
|
||||||
|
/// Per-proxy mutable state (internal counterpart of `ProxyMetrics`).
#[derive(Debug, Clone)]
pub struct ProxyState {
    pub container_name: String,
    pub ip_address: String,
    pub port: u16,
    pub status: ProxyStatus,
    // Ids of the instances currently routed through this proxy.
    pub instances_using: Vec<usize>,
}
|
||||||
|
|
||||||
|
/// Internal counterpart of `GlobalMetrics`: raw counters only. Derived
/// values (success rate, uptime) are computed at snapshot time in
/// `to_dashboard_state`.
#[derive(Debug, Clone)]
pub struct GlobalState {
    pub total_requests: usize,
    pub successful_requests: usize,
    pub failed_requests: usize,
    pub session_renewals: usize,
    pub rotation_events: usize,
    pub navigation_timeouts: usize,
    pub bot_detection_hits: usize,
    pub proxy_failures: usize,
}
|
||||||
|
|
||||||
|
impl MonitoringState {
|
||||||
|
pub fn new() -> Self {
|
||||||
|
Self {
|
||||||
|
instances: HashMap::new(),
|
||||||
|
proxies: HashMap::new(),
|
||||||
|
global: GlobalState {
|
||||||
|
total_requests: 0,
|
||||||
|
successful_requests: 0,
|
||||||
|
failed_requests: 0,
|
||||||
|
session_renewals: 0,
|
||||||
|
rotation_events: 0,
|
||||||
|
navigation_timeouts: 0,
|
||||||
|
bot_detection_hits: 0,
|
||||||
|
proxy_failures: 0,
|
||||||
|
},
|
||||||
|
start_time: Instant::now(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Convert internal state to dashboard state for web clients
|
||||||
|
pub fn to_dashboard_state(&self, config: ConfigSnapshot, logs: Vec<LogEntry>) -> DashboardState {
|
||||||
|
let instances: Vec<InstanceMetrics> = self
|
||||||
|
.instances
|
||||||
|
.values()
|
||||||
|
.map(|inst| InstanceMetrics {
|
||||||
|
id: inst.id,
|
||||||
|
status: inst.status.clone(),
|
||||||
|
current_task: inst.current_task.clone(),
|
||||||
|
tasks_current_session: inst.tasks_current_session,
|
||||||
|
tasks_max: inst.tasks_max,
|
||||||
|
session_requests: inst.session_requests,
|
||||||
|
total_requests: inst.total_requests,
|
||||||
|
success_count: inst.success_count,
|
||||||
|
failure_count: inst.failure_count,
|
||||||
|
connected_proxy: inst.connected_proxy.clone(),
|
||||||
|
last_activity: format_timestamp(inst.last_activity),
|
||||||
|
})
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
let proxies: Vec<ProxyMetrics> = self
|
||||||
|
.proxies
|
||||||
|
.values()
|
||||||
|
.map(|proxy| ProxyMetrics {
|
||||||
|
container_name: proxy.container_name.clone(),
|
||||||
|
ip_address: proxy.ip_address.clone(),
|
||||||
|
port: proxy.port,
|
||||||
|
status: proxy.status.clone(),
|
||||||
|
instances_using: proxy.instances_using.clone(),
|
||||||
|
})
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
let success_rate = if self.global.total_requests > 0 {
|
||||||
|
(self.global.successful_requests as f64 / self.global.total_requests as f64) * 100.0
|
||||||
|
} else {
|
||||||
|
0.0
|
||||||
|
};
|
||||||
|
|
||||||
|
let global = GlobalMetrics {
|
||||||
|
total_requests: self.global.total_requests,
|
||||||
|
successful_requests: self.global.successful_requests,
|
||||||
|
failed_requests: self.global.failed_requests,
|
||||||
|
success_rate,
|
||||||
|
session_renewals: self.global.session_renewals,
|
||||||
|
rotation_events: self.global.rotation_events,
|
||||||
|
navigation_timeouts: self.global.navigation_timeouts,
|
||||||
|
bot_detection_hits: self.global.bot_detection_hits,
|
||||||
|
proxy_failures: self.global.proxy_failures,
|
||||||
|
uptime_seconds: self.start_time.elapsed().as_secs(),
|
||||||
|
};
|
||||||
|
|
||||||
|
DashboardState {
|
||||||
|
config,
|
||||||
|
instances,
|
||||||
|
proxies,
|
||||||
|
global,
|
||||||
|
logs,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn format_timestamp(instant: Instant) -> String {
|
||||||
|
use chrono::Local;
|
||||||
|
// This is a placeholder - in real impl we'd track actual wall-clock time
|
||||||
|
Local::now().format("%H:%M:%S").to_string()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Session completion summary for logging.
///
/// Built on `SessionRenewed` events and written to disk by the
/// `SessionLogger`; serialize-only (never read back by this crate).
#[derive(Debug, Clone, Serialize)]
pub struct SessionSummary {
    /// Pool index of the instance whose session ended.
    pub instance_id: usize,
    /// Session start time — currently always "N/A"; start times are not tracked yet.
    pub session_start: String,
    /// Session end wall-clock time, "%Y-%m-%d %H:%M:%S".
    pub session_end: String,
    /// Session duration — currently always 0 (start time not tracked).
    pub duration_seconds: u64,
    /// Requests made during the ended session.
    pub total_requests: usize,
    /// NOTE(review): populated from the instance's *lifetime* success count, not per-session — confirm intent.
    pub successful_requests: usize,
    /// NOTE(review): populated from the instance's *lifetime* failure count, not per-session — confirm intent.
    pub failed_requests: usize,
    /// Proxy the session was routed through, if any.
    pub proxy_info: Option<ProxyInfo>,
    pub renewal_reason: String, // "task_limit", "request_limit", "error"
}
|
||||||
78
src/monitoring/mod.rs
Normal file
78
src/monitoring/mod.rs
Normal file
@@ -0,0 +1,78 @@
|
|||||||
|
// src/monitoring/mod.rs
|
||||||
|
//! Monitoring system for tracking scraper performance and health
|
||||||
|
//!
|
||||||
|
//! This module provides:
|
||||||
|
//! - Real-time metrics collection
|
||||||
|
//! - Web-based dashboard
|
||||||
|
//! - Session logging
|
||||||
|
//! - Minimal performance overhead
|
||||||
|
|
||||||
|
pub mod metrics;
|
||||||
|
pub mod events;
|
||||||
|
pub mod service;
|
||||||
|
pub mod webserver;
|
||||||
|
pub mod logger;
|
||||||
|
|
||||||
|
pub use events::{MonitoringEvent,RenewalReason, InstanceStatusChange};
|
||||||
|
pub use metrics::{ConfigSnapshot, ProxyInfo, ProxyStatus};
|
||||||
|
pub use service::{MonitoringService, MonitoringHandle};
|
||||||
|
pub use webserver::WebServer;
|
||||||
|
|
||||||
|
use std::path::PathBuf;
|
||||||
|
use std::sync::Arc;
|
||||||
|
use tokio::sync::{mpsc, RwLock};
|
||||||
|
|
||||||
|
/// Initialize the complete monitoring system
|
||||||
|
pub async fn init_monitoring(
|
||||||
|
config_snapshot: ConfigSnapshot,
|
||||||
|
log_dir: PathBuf,
|
||||||
|
dashboard_port: u16,
|
||||||
|
) -> anyhow::Result<(MonitoringHandle, tokio::task::JoinHandle<()>)> {
|
||||||
|
// Create channel for events
|
||||||
|
let (tx, rx) = mpsc::unbounded_channel();
|
||||||
|
|
||||||
|
// Create monitoring service
|
||||||
|
let service = MonitoringService::new(config_snapshot, rx, log_dir);
|
||||||
|
let service_arc = Arc::new(RwLock::new(service));
|
||||||
|
|
||||||
|
// Start monitoring service task
|
||||||
|
let service_clone = Arc::clone(&service_arc);
|
||||||
|
let monitoring_task = tokio::spawn(async move {
|
||||||
|
println!("🚀 MONITORING TASK STARTED!");
|
||||||
|
// Take ownership of the service
|
||||||
|
let mut service = {
|
||||||
|
let mut guard = service_clone.write().await;
|
||||||
|
std::mem::replace(
|
||||||
|
&mut *guard,
|
||||||
|
MonitoringService::new(
|
||||||
|
ConfigSnapshot {
|
||||||
|
max_parallel_instances: 0,
|
||||||
|
max_tasks_per_instance: 0,
|
||||||
|
enable_vpn_rotation: false,
|
||||||
|
max_requests_per_session: 0,
|
||||||
|
min_request_interval_ms: 0,
|
||||||
|
max_retry_attempts: 0,
|
||||||
|
},
|
||||||
|
mpsc::unbounded_channel().1,
|
||||||
|
PathBuf::new(),
|
||||||
|
),
|
||||||
|
)
|
||||||
|
};
|
||||||
|
|
||||||
|
println!("✅ ABOUT TO RUN SERVICE!");
|
||||||
|
service.run().await;
|
||||||
|
});
|
||||||
|
|
||||||
|
// Start web server
|
||||||
|
let webserver = WebServer::new(Arc::clone(&service_arc), dashboard_port);
|
||||||
|
tokio::spawn(async move {
|
||||||
|
if let Err(e) = webserver.run().await {
|
||||||
|
eprintln!("Web server error: {}", e);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
// Create handle for emitting events
|
||||||
|
let handle = MonitoringHandle::new(tx);
|
||||||
|
|
||||||
|
Ok((handle, monitoring_task))
|
||||||
|
}
|
||||||
341
src/monitoring/service.rs
Normal file
341
src/monitoring/service.rs
Normal file
@@ -0,0 +1,341 @@
|
|||||||
|
// src/monitoring/service.rs
|
||||||
|
use super::events::*;
|
||||||
|
use super::metrics::*;
|
||||||
|
use super::logger::SessionLogger;
|
||||||
|
use std::collections::VecDeque;
|
||||||
|
use std::sync::Arc;
|
||||||
|
use std::time::Instant;
|
||||||
|
use tokio::sync::{mpsc, RwLock};
|
||||||
|
use chrono::Local;
|
||||||
|
|
||||||
|
const MAX_LOGS: usize = 100;
|
||||||
|
|
||||||
|
/// Monitoring service that collects events and maintains state.
///
/// Receives `MonitoringEvent`s on a channel, folds them into
/// `MonitoringState`, keeps a bounded in-memory log ring, and writes
/// session summaries via the `SessionLogger`.
pub struct MonitoringService {
    // Shared so the web server can snapshot it while the event loop mutates it.
    state: Arc<RwLock<MonitoringState>>,
    // Static configuration snapshot echoed to dashboard clients.
    config: ConfigSnapshot,
    // Ring buffer of the most recent MAX_LOGS entries.
    logs: Arc<RwLock<VecDeque<LogEntry>>>,
    // Writes per-session summaries to disk.
    session_logger: Arc<SessionLogger>,
    // Event source; drained by `run()`.
    event_rx: mpsc::UnboundedReceiver<MonitoringEvent>,
}
|
||||||
|
|
||||||
|
impl MonitoringService {
|
||||||
|
pub fn new(
|
||||||
|
config: ConfigSnapshot,
|
||||||
|
event_rx: mpsc::UnboundedReceiver<MonitoringEvent>,
|
||||||
|
log_dir: std::path::PathBuf,
|
||||||
|
) -> Self {
|
||||||
|
Self {
|
||||||
|
state: Arc::new(RwLock::new(MonitoringState::new())),
|
||||||
|
config,
|
||||||
|
logs: Arc::new(RwLock::new(VecDeque::with_capacity(MAX_LOGS))),
|
||||||
|
session_logger: Arc::new(SessionLogger::new(log_dir)),
|
||||||
|
event_rx,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get current dashboard state for web clients
|
||||||
|
pub async fn get_dashboard_state(&self) -> DashboardState {
|
||||||
|
let state = self.state.read().await;
|
||||||
|
let logs = self.logs.read().await;
|
||||||
|
state.to_dashboard_state(
|
||||||
|
self.config.clone(),
|
||||||
|
logs.iter().cloned().collect(),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Main event processing loop
|
||||||
|
pub async fn run(mut self) {
|
||||||
|
while let Some(event) = self.event_rx.recv().await {
|
||||||
|
self.process_event(event).await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn process_event(&self, event: MonitoringEvent) {
|
||||||
|
match event {
|
||||||
|
MonitoringEvent::PoolInitialized { pool_size, with_proxy, with_rotation } => {
|
||||||
|
self.log_info(format!(
|
||||||
|
"Pool initialized: {} instances, proxy={}, rotation={}",
|
||||||
|
pool_size, with_proxy, with_rotation
|
||||||
|
)).await;
|
||||||
|
}
|
||||||
|
|
||||||
|
MonitoringEvent::InstanceCreated { instance_id, max_tasks, proxy } => {
|
||||||
|
let mut state = self.state.write().await;
|
||||||
|
state.instances.insert(
|
||||||
|
instance_id,
|
||||||
|
InstanceState {
|
||||||
|
id: instance_id,
|
||||||
|
status: InstanceStatus::Idle,
|
||||||
|
current_task: None,
|
||||||
|
tasks_current_session: 0,
|
||||||
|
tasks_max: max_tasks,
|
||||||
|
session_requests: 0,
|
||||||
|
total_requests: 0,
|
||||||
|
success_count: 0,
|
||||||
|
failure_count: 0,
|
||||||
|
connected_proxy: proxy.clone(),
|
||||||
|
last_activity: Instant::now(),
|
||||||
|
},
|
||||||
|
);
|
||||||
|
|
||||||
|
if let Some(proxy_info) = proxy {
|
||||||
|
state.proxies.entry(proxy_info.container_name.clone()).or_insert_with(|| {
|
||||||
|
ProxyState {
|
||||||
|
container_name: proxy_info.container_name.clone(),
|
||||||
|
ip_address: proxy_info.ip_address.clone(),
|
||||||
|
port: proxy_info.port,
|
||||||
|
status: ProxyStatus::Connected,
|
||||||
|
instances_using: vec![instance_id],
|
||||||
|
}
|
||||||
|
}).instances_using.push(instance_id);
|
||||||
|
}
|
||||||
|
|
||||||
|
self.log_info(format!("Instance #{} created", instance_id)).await;
|
||||||
|
}
|
||||||
|
|
||||||
|
MonitoringEvent::InstanceStatusChanged { instance_id, status } => {
|
||||||
|
let mut state = self.state.write().await;
|
||||||
|
if let Some(inst) = state.instances.get_mut(&instance_id) {
|
||||||
|
inst.status = match status {
|
||||||
|
InstanceStatusChange::Idle => InstanceStatus::Idle,
|
||||||
|
InstanceStatusChange::Active => InstanceStatus::Active,
|
||||||
|
InstanceStatusChange::Renewing => InstanceStatus::Renewing,
|
||||||
|
InstanceStatusChange::Error(_) => InstanceStatus::Error,
|
||||||
|
};
|
||||||
|
inst.last_activity = Instant::now();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
MonitoringEvent::TaskStarted { instance_id, url } => {
|
||||||
|
let mut state = self.state.write().await;
|
||||||
|
if let Some(inst) = state.instances.get_mut(&instance_id) {
|
||||||
|
inst.status = InstanceStatus::Active;
|
||||||
|
inst.current_task = Some(url.clone());
|
||||||
|
inst.last_activity = Instant::now();
|
||||||
|
}
|
||||||
|
state.global.total_requests += 1;
|
||||||
|
|
||||||
|
self.log_info(format!("Instance #{} started task: {}", instance_id, url)).await;
|
||||||
|
}
|
||||||
|
|
||||||
|
MonitoringEvent::TaskCompleted { instance_id, success, duration_ms, error } => {
|
||||||
|
let mut state = self.state.write().await;
|
||||||
|
if let Some(inst) = state.instances.get_mut(&instance_id) {
|
||||||
|
inst.current_task = None;
|
||||||
|
inst.status = InstanceStatus::Idle;
|
||||||
|
inst.total_requests += 1;
|
||||||
|
inst.last_activity = Instant::now();
|
||||||
|
|
||||||
|
if success {
|
||||||
|
inst.success_count += 1;
|
||||||
|
state.global.successful_requests += 1;
|
||||||
|
} else {
|
||||||
|
inst.failure_count += 1;
|
||||||
|
state.global.failed_requests += 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if success {
|
||||||
|
self.log_info(format!(
|
||||||
|
"Instance #{} completed task in {}ms",
|
||||||
|
instance_id, duration_ms
|
||||||
|
)).await;
|
||||||
|
} else {
|
||||||
|
self.log_error(format!(
|
||||||
|
"Instance #{} failed task: {}",
|
||||||
|
instance_id,
|
||||||
|
error.unwrap_or_else(|| "unknown error".to_string())
|
||||||
|
)).await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
MonitoringEvent::NavigationTimeout { instance_id, url } => {
|
||||||
|
let mut state = self.state.write().await;
|
||||||
|
state.global.navigation_timeouts += 1;
|
||||||
|
|
||||||
|
self.log_warn(format!(
|
||||||
|
"Instance #{} navigation timeout: {}",
|
||||||
|
instance_id, url
|
||||||
|
)).await;
|
||||||
|
}
|
||||||
|
|
||||||
|
MonitoringEvent::BotDetectionTriggered { instance_id, url } => {
|
||||||
|
let mut state = self.state.write().await;
|
||||||
|
state.global.bot_detection_hits += 1;
|
||||||
|
|
||||||
|
self.log_warn(format!(
|
||||||
|
"Instance #{} bot detection triggered: {}",
|
||||||
|
instance_id, url
|
||||||
|
)).await;
|
||||||
|
}
|
||||||
|
|
||||||
|
MonitoringEvent::SessionStarted { instance_id, proxy } => {
|
||||||
|
let mut state = self.state.write().await;
|
||||||
|
if let Some(inst) = state.instances.get_mut(&instance_id) {
|
||||||
|
inst.session_requests = 0;
|
||||||
|
inst.tasks_current_session = 0;
|
||||||
|
inst.connected_proxy = proxy;
|
||||||
|
inst.last_activity = Instant::now();
|
||||||
|
}
|
||||||
|
|
||||||
|
self.log_info(format!("Instance #{} started new session", instance_id)).await;
|
||||||
|
}
|
||||||
|
|
||||||
|
MonitoringEvent::SessionRenewed { instance_id, old_request_count, reason, new_proxy } => {
|
||||||
|
// Log the completed session
|
||||||
|
let session_summary = {
|
||||||
|
let state = self.state.read().await;
|
||||||
|
if let Some(inst) = state.instances.get(&instance_id) {
|
||||||
|
Some(SessionSummary {
|
||||||
|
instance_id,
|
||||||
|
session_start: "N/A".to_string(), // We'd need to track this
|
||||||
|
session_end: Local::now().format("%Y-%m-%d %H:%M:%S").to_string(),
|
||||||
|
duration_seconds: 0, // We'd need to track session start time
|
||||||
|
total_requests: old_request_count,
|
||||||
|
successful_requests: inst.success_count,
|
||||||
|
failed_requests: inst.failure_count,
|
||||||
|
proxy_info: inst.connected_proxy.clone(),
|
||||||
|
renewal_reason: reason.to_string(),
|
||||||
|
})
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
if let Some(summary) = session_summary {
|
||||||
|
self.session_logger.log_session(&summary).await;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Update state for new session
|
||||||
|
let mut state = self.state.write().await;
|
||||||
|
if let Some(inst) = state.instances.get_mut(&instance_id) {
|
||||||
|
inst.session_requests = 0;
|
||||||
|
inst.tasks_current_session = 0;
|
||||||
|
inst.connected_proxy = new_proxy;
|
||||||
|
inst.last_activity = Instant::now();
|
||||||
|
}
|
||||||
|
state.global.session_renewals += 1;
|
||||||
|
|
||||||
|
self.log_info(format!(
|
||||||
|
"Instance #{} renewed session (reason: {}, {} requests)",
|
||||||
|
instance_id, reason, old_request_count
|
||||||
|
)).await;
|
||||||
|
}
|
||||||
|
|
||||||
|
MonitoringEvent::SessionRequestIncremented { instance_id, new_count } => {
|
||||||
|
let mut state = self.state.write().await;
|
||||||
|
if let Some(inst) = state.instances.get_mut(&instance_id) {
|
||||||
|
inst.session_requests = new_count;
|
||||||
|
inst.last_activity = Instant::now();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
MonitoringEvent::ProxyConnected { container_name, ip_address, port } => {
|
||||||
|
let mut state = self.state.write().await;
|
||||||
|
state.proxies.insert(
|
||||||
|
container_name.clone(),
|
||||||
|
ProxyState {
|
||||||
|
container_name: container_name.clone(),
|
||||||
|
ip_address: ip_address.clone(),
|
||||||
|
port,
|
||||||
|
status: ProxyStatus::Connected,
|
||||||
|
instances_using: vec![],
|
||||||
|
},
|
||||||
|
);
|
||||||
|
|
||||||
|
self.log_info(format!(
|
||||||
|
"Proxy {} connected: {}:{}",
|
||||||
|
container_name, ip_address, port
|
||||||
|
)).await;
|
||||||
|
}
|
||||||
|
|
||||||
|
MonitoringEvent::ProxyFailed { container_name, error } => {
|
||||||
|
let mut state = self.state.write().await;
|
||||||
|
if let Some(proxy) = state.proxies.get_mut(&container_name) {
|
||||||
|
proxy.status = ProxyStatus::Disconnected;
|
||||||
|
}
|
||||||
|
state.global.proxy_failures += 1;
|
||||||
|
|
||||||
|
self.log_error(format!(
|
||||||
|
"Proxy {} failed: {}",
|
||||||
|
container_name, error
|
||||||
|
)).await;
|
||||||
|
}
|
||||||
|
|
||||||
|
MonitoringEvent::ProxyRotated { instance_id, old_proxy, new_proxy } => {
|
||||||
|
self.log_info(format!(
|
||||||
|
"Instance #{} rotated proxy: {} -> {}",
|
||||||
|
instance_id,
|
||||||
|
old_proxy.unwrap_or_else(|| "none".to_string()),
|
||||||
|
new_proxy
|
||||||
|
)).await;
|
||||||
|
}
|
||||||
|
|
||||||
|
MonitoringEvent::RotationTriggered { reason } => {
|
||||||
|
let mut state = self.state.write().await;
|
||||||
|
state.global.rotation_events += 1;
|
||||||
|
|
||||||
|
self.log_info(format!("Pool rotation triggered: {}", reason)).await;
|
||||||
|
}
|
||||||
|
|
||||||
|
MonitoringEvent::LogMessage { level, message } => {
|
||||||
|
match level {
|
||||||
|
crate::monitoring::events::LogLevel::Info => self.log_info(message).await,
|
||||||
|
crate::monitoring::events::LogLevel::Warn => self.log_warn(message).await,
|
||||||
|
crate::monitoring::events::LogLevel::Error => self.log_error(message).await,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn log_info(&self, message: String) {
|
||||||
|
self.add_log(LogEntry {
|
||||||
|
timestamp: Local::now().format("%H:%M:%S").to_string(),
|
||||||
|
level: super::metrics::LogLevel::Info,
|
||||||
|
message,
|
||||||
|
}).await;
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn log_warn(&self, message: String) {
|
||||||
|
self.add_log(LogEntry {
|
||||||
|
timestamp: Local::now().format("%H:%M:%S").to_string(),
|
||||||
|
level: super::metrics::LogLevel::Warn,
|
||||||
|
message,
|
||||||
|
}).await;
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn log_error(&self, message: String) {
|
||||||
|
self.add_log(LogEntry {
|
||||||
|
timestamp: Local::now().format("%H:%M:%S").to_string(),
|
||||||
|
level: super::metrics::LogLevel::Error,
|
||||||
|
message,
|
||||||
|
}).await;
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn add_log(&self, entry: LogEntry) {
|
||||||
|
let mut logs = self.logs.write().await;
|
||||||
|
if logs.len() >= MAX_LOGS {
|
||||||
|
logs.pop_front();
|
||||||
|
}
|
||||||
|
logs.push_back(entry);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Handle for emitting monitoring events.
///
/// Cheap to clone (wraps an unbounded sender); can be handed to every
/// component that needs to report events.
#[derive(Clone)]
pub struct MonitoringHandle {
    // Sender side of the service's event channel.
    tx: mpsc::UnboundedSender<MonitoringEvent>,
}
|
||||||
|
|
||||||
|
impl MonitoringHandle {
|
||||||
|
pub fn new(tx: mpsc::UnboundedSender<MonitoringEvent>) -> Self {
|
||||||
|
Self { tx }
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Emit a monitoring event (non-blocking)
|
||||||
|
pub fn emit(&self, event: MonitoringEvent) {
|
||||||
|
// Ignore send errors (monitoring should never block application)
|
||||||
|
let _ = self.tx.send(event);
|
||||||
|
}
|
||||||
|
}
|
||||||
77
src/monitoring/webserver.rs
Normal file
77
src/monitoring/webserver.rs
Normal file
@@ -0,0 +1,77 @@
|
|||||||
|
// src/monitoring/webserver.rs
|
||||||
|
use super::service::MonitoringService;
|
||||||
|
use axum::{
|
||||||
|
extract::{
|
||||||
|
ws::{Message, WebSocket, WebSocketUpgrade},
|
||||||
|
State,
|
||||||
|
},
|
||||||
|
response::{Html, IntoResponse, Response},
|
||||||
|
routing::get,
|
||||||
|
Router,
|
||||||
|
};
|
||||||
|
use std::sync::Arc;
|
||||||
|
use tokio::sync::RwLock;
|
||||||
|
use tokio::time::{interval, Duration};
|
||||||
|
|
||||||
|
const UPDATE_INTERVAL_MS: u64 = 1000; // 1 second updates
|
||||||
|
|
||||||
|
/// Axum-based web server exposing the monitoring dashboard.
///
/// Serves the HTML dashboard at `/` and a state-stream WebSocket at `/ws`.
pub struct WebServer {
    // Shared monitoring service snapshotted for each client update.
    service: Arc<RwLock<MonitoringService>>,
    // TCP port to bind on (all interfaces).
    port: u16,
}
|
||||||
|
|
||||||
|
impl WebServer {
|
||||||
|
pub fn new(service: Arc<RwLock<MonitoringService>>, port: u16) -> Self {
|
||||||
|
Self { service, port }
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn run(self) -> anyhow::Result<()> {
|
||||||
|
let app = Router::new()
|
||||||
|
.route("/", get(dashboard_handler))
|
||||||
|
.route("/ws", get(websocket_handler))
|
||||||
|
.with_state(self.service);
|
||||||
|
|
||||||
|
let addr = format!("0.0.0.0:{}", self.port);
|
||||||
|
println!("📊 Dashboard available at: http://localhost:{}", self.port);
|
||||||
|
|
||||||
|
let listener = tokio::net::TcpListener::bind(&addr).await?;
|
||||||
|
axum::serve(listener, app).await?;
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Serve the dashboard single-page UI (HTML embedded at compile time
/// from `src/monitoring/dashboard.html`).
async fn dashboard_handler() -> impl IntoResponse {
    Html(include_str!("dashboard.html"))
}
|
||||||
|
|
||||||
|
/// Upgrade an HTTP request on `/ws` to a WebSocket and hand the socket
/// (plus the shared service) to `handle_socket` for streaming updates.
async fn websocket_handler(
    ws: WebSocketUpgrade,
    State(service): State<Arc<RwLock<MonitoringService>>>,
) -> Response {
    ws.on_upgrade(|socket| handle_socket(socket, service))
}
|
||||||
|
|
||||||
|
async fn handle_socket(mut socket: WebSocket, service: Arc<RwLock<MonitoringService>>) {
|
||||||
|
let mut ticker = interval(Duration::from_millis(UPDATE_INTERVAL_MS));
|
||||||
|
|
||||||
|
loop {
|
||||||
|
ticker.tick().await;
|
||||||
|
|
||||||
|
let service_guard = service.read().await;
|
||||||
|
let state = service_guard.get_dashboard_state().await;
|
||||||
|
drop(service_guard);
|
||||||
|
|
||||||
|
match serde_json::to_string(&state) {
|
||||||
|
Ok(json) => {
|
||||||
|
if socket.send(Message::Text(json)).await.is_err() {
|
||||||
|
break; // Client disconnected
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
eprintln!("Failed to serialize dashboard state: {}", e);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -342,6 +342,25 @@ impl DockerVpnProxyPool {
|
|||||||
}
|
}
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Get ProxyInfo for monitoring dashboard
|
||||||
|
pub fn get_proxy_info(&self, index: usize) -> Option<crate::monitoring::ProxyInfo> {
|
||||||
|
if index >= self.container_names.len() {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
|
||||||
|
Some(crate::monitoring::ProxyInfo {
|
||||||
|
container_name: self.container_names[index].clone(),
|
||||||
|
ip_address: "127.0.0.1".to_string(), // SOCKS5 proxy on localhost
|
||||||
|
port: self.proxy_ports[index],
|
||||||
|
status: crate::monitoring::ProxyStatus::Connected,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
    /// Get container name by index.
    ///
    /// Returns `None` when `index` is out of range; the name is cloned
    /// so the caller receives an owned `String`.
    pub fn get_container_name(&self, index: usize) -> Option<String> {
        self.container_names.get(index).cloned()
    }
|
||||||
}
|
}
|
||||||
|
|
||||||
pub async fn cleanup_all_proxy_containers() -> Result<()> {
|
pub async fn cleanup_all_proxy_containers() -> Result<()> {
|
||||||
|
|||||||
14
src/scraper/helpers.rs
Normal file
14
src/scraper/helpers.rs
Normal file
@@ -0,0 +1,14 @@
|
|||||||
|
use rand::rngs::StdRng;
|
||||||
|
use rand::prelude::{Rng, SeedableRng, IndexedRandom};
|
||||||
|
|
||||||
|
/// Send-safe random range
|
||||||
|
pub fn random_range(min: u64, max: u64) -> u64 {
|
||||||
|
let mut rng = StdRng::from_rng(&mut rand::rng());
|
||||||
|
rng.random_range(min..max)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Send-safe random choice
|
||||||
|
pub fn choose_random<T: Clone>(items: &[T]) -> T {
|
||||||
|
let mut rng = StdRng::from_rng(&mut rand::rng());
|
||||||
|
items.choose(&mut rng).unwrap().clone()
|
||||||
|
}
|
||||||
@@ -1,2 +1,3 @@
|
|||||||
pub mod webdriver;
|
pub mod webdriver;
|
||||||
pub mod docker_vpn_proxy;
|
pub mod docker_vpn_proxy;
|
||||||
|
pub mod helpers;
|
||||||
@@ -1,4 +1,5 @@
|
|||||||
// src/scraper/webdriver.rs
|
// src/scraper/webdriver.rs
|
||||||
|
use super::helpers::*;
|
||||||
|
|
||||||
use anyhow::{anyhow, Context, Result};
|
use anyhow::{anyhow, Context, Result};
|
||||||
use fantoccini::{Client, ClientBuilder};
|
use fantoccini::{Client, ClientBuilder};
|
||||||
@@ -6,12 +7,14 @@ use serde_json::{Map, Value};
|
|||||||
use std::pin::Pin;
|
use std::pin::Pin;
|
||||||
use std::process::Stdio;
|
use std::process::Stdio;
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
|
use std::time::Instant;
|
||||||
use tokio::io::{AsyncBufReadExt, BufReader};
|
use tokio::io::{AsyncBufReadExt, BufReader};
|
||||||
use tokio::process::{Child, Command};
|
use tokio::process::{Child, Command};
|
||||||
use tokio::task::JoinHandle;
|
use tokio::task::JoinHandle;
|
||||||
use tokio::sync::{Mutex, Semaphore};
|
use tokio::sync::{Mutex, Semaphore};
|
||||||
use tokio::time::{sleep, timeout, Duration};
|
use tokio::time::{sleep, timeout, Duration};
|
||||||
use crate::scraper::docker_vpn_proxy::{DockerVpnProxyPool};
|
use crate::scraper::docker_vpn_proxy::{DockerVpnProxyPool};
|
||||||
|
use crate::Config;
|
||||||
|
|
||||||
/// Manages a pool of ChromeDriver instances for parallel scraping with optional VPN binding.
|
/// Manages a pool of ChromeDriver instances for parallel scraping with optional VPN binding.
|
||||||
pub struct ChromeDriverPool {
|
pub struct ChromeDriverPool {
|
||||||
@@ -23,25 +26,31 @@ pub struct ChromeDriverPool {
|
|||||||
rotation_enabled: bool,
|
rotation_enabled: bool,
|
||||||
/// Index for round-robin instance selection (when rotation is enabled)
|
/// Index for round-robin instance selection (when rotation is enabled)
|
||||||
next_instance: Arc<Mutex<usize>>,
|
next_instance: Arc<Mutex<usize>>,
|
||||||
|
|
||||||
|
last_request_time: Arc<Mutex<Instant>>,
|
||||||
|
min_request_interval_ms: u64,
|
||||||
|
|
||||||
|
monitoring: Option<crate::monitoring::MonitoringHandle>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl ChromeDriverPool {
|
impl ChromeDriverPool {
|
||||||
/// Creates a new pool without any proxy (direct connection).
|
/// Creates a new pool without any proxy (direct connection).
|
||||||
pub async fn _new(pool_size: usize) -> Result<Self> {
|
pub async fn _new(config: &Config, monitoring: Option<crate::monitoring::MonitoringHandle>,) -> Result<Self> {
|
||||||
Self::new_with_proxy_and_task_limit(pool_size, None, 0).await
|
Self::new_with_proxy_and_task_limit(None, config, monitoring).await
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Creates a new pool with task-per-instance limit but no proxy.
|
/// Creates a new pool with task-per-instance limit but no proxy.
|
||||||
pub async fn _new_with_task_limit(pool_size: usize, max_tasks_per_instance: usize) -> Result<Self> {
|
pub async fn _new_with_task_limit(config: &Config, monitoring: Option<crate::monitoring::MonitoringHandle>,) -> Result<Self> {
|
||||||
Self::new_with_proxy_and_task_limit(pool_size, None, max_tasks_per_instance).await
|
Self::new_with_proxy_and_task_limit(None, config, monitoring).await
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Creates a new pool where each Chrome instance uses a different SOCKS5 proxy from the Docker pool.
|
/// Creates a new pool where each Chrome instance uses a different SOCKS5 proxy from the Docker pool.
|
||||||
pub async fn new_with_proxy(
|
pub async fn new_with_proxy(
|
||||||
pool_size: usize,
|
|
||||||
proxy_pool: Option<Arc<DockerVpnProxyPool>>,
|
proxy_pool: Option<Arc<DockerVpnProxyPool>>,
|
||||||
|
config: &Config,
|
||||||
|
monitoring: Option<crate::monitoring::MonitoringHandle>,
|
||||||
) -> Result<Self> {
|
) -> Result<Self> {
|
||||||
Self::new_with_proxy_and_task_limit(pool_size, proxy_pool, 0).await
|
Self::new_with_proxy_and_task_limit(proxy_pool, config, monitoring).await
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Full constructor: supports proxy + task limiting + rotation.
|
/// Full constructor: supports proxy + task limiting + rotation.
|
||||||
@@ -55,10 +64,13 @@ impl ChromeDriverPool {
|
|||||||
///
|
///
|
||||||
/// Uses the minimum of these constraints to determine actual pool size.
|
/// Uses the minimum of these constraints to determine actual pool size.
|
||||||
pub async fn new_with_proxy_and_task_limit(
|
pub async fn new_with_proxy_and_task_limit(
|
||||||
pool_size_limit: usize,
|
|
||||||
proxy_pool: Option<Arc<DockerVpnProxyPool>>,
|
proxy_pool: Option<Arc<DockerVpnProxyPool>>,
|
||||||
max_tasks_per_instance: usize,
|
config: &Config,
|
||||||
|
monitoring: Option<crate::monitoring::MonitoringHandle>,
|
||||||
) -> Result<Self> {
|
) -> Result<Self> {
|
||||||
|
let pool_size_limit = config.max_parallel_instances;
|
||||||
|
let task_per_instance_limit = config.max_tasks_per_instance;
|
||||||
|
|
||||||
// Determine actual pool size based on available resources
|
// Determine actual pool size based on available resources
|
||||||
let actual_pool_size = if let Some(ref pp) = proxy_pool {
|
let actual_pool_size = if let Some(ref pp) = proxy_pool {
|
||||||
let available_proxies = pp.num_proxies();
|
let available_proxies = pp.num_proxies();
|
||||||
@@ -72,7 +84,7 @@ impl ChromeDriverPool {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Rotation is enabled when task limiting is active
|
// Rotation is enabled when task limiting is active
|
||||||
let rotation_enabled = max_tasks_per_instance > 0;
|
let rotation_enabled = task_per_instance_limit > 0;
|
||||||
|
|
||||||
let mut instances = Vec::with_capacity(actual_pool_size);
|
let mut instances = Vec::with_capacity(actual_pool_size);
|
||||||
|
|
||||||
@@ -91,34 +103,77 @@ impl ChromeDriverPool {
|
|||||||
}
|
}
|
||||||
|
|
||||||
for i in 0..actual_pool_size {
|
for i in 0..actual_pool_size {
|
||||||
let proxy_url = proxy_pool
|
// Pass the entire proxy_pool and the index
|
||||||
.as_ref()
|
let instance = ChromeInstance::new(
|
||||||
.map(|pp| pp.get_proxy_url(i));
|
proxy_pool.clone(), // Clone the Arc
|
||||||
|
i, // This instance's proxy index
|
||||||
let instance = ChromeInstance::new(proxy_url, max_tasks_per_instance).await?;
|
config,
|
||||||
|
monitoring.clone(),
|
||||||
|
).await?;
|
||||||
|
|
||||||
crate::util::logger::log_info(&format!(" Instance {} ready", i + 1)).await;
|
crate::util::logger::log_info(&format!(" Instance {} ready", i + 1)).await;
|
||||||
instances.push(Arc::new(Mutex::new(instance)));
|
instances.push(Arc::new(Mutex::new(instance)));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Emit instance created events
|
||||||
|
for (i, instance) in instances.iter().enumerate() {
|
||||||
|
if let Some(ref mon) = monitoring {
|
||||||
|
let guard = instance.lock().await;
|
||||||
|
|
||||||
|
// Extract proxy info if available
|
||||||
|
let proxy_info = if let Some(ref pp) = proxy_pool {
|
||||||
|
pp.get_proxy_info(i % pp.num_proxies())
|
||||||
|
} else {
|
||||||
|
guard.proxy_url.as_ref().and_then(|url| {
|
||||||
|
// Parse proxy URL manually if no pool
|
||||||
|
// Format: socks5://localhost:10801
|
||||||
|
if let Some(port_str) = url.split(':').last() {
|
||||||
|
if let Ok(port) = port_str.parse::<u16>() {
|
||||||
|
return Some(crate::monitoring::ProxyInfo {
|
||||||
|
container_name: format!("proxy-{}", i),
|
||||||
|
ip_address: "127.0.0.1".to_string(),
|
||||||
|
port,
|
||||||
|
status: crate::monitoring::ProxyStatus::Connected,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
None
|
||||||
|
})
|
||||||
|
};
|
||||||
|
|
||||||
|
mon.emit(crate::monitoring::MonitoringEvent::InstanceCreated {
|
||||||
|
instance_id: i,
|
||||||
|
max_tasks: guard.max_tasks_per_instance,
|
||||||
|
proxy: proxy_info.clone(), // ✅ Now includes actual proxy info
|
||||||
|
});
|
||||||
|
|
||||||
|
// Also emit ProxyConnected event if proxy exists
|
||||||
|
if let Some(ref proxy) = proxy_info {
|
||||||
|
mon.emit(crate::monitoring::MonitoringEvent::ProxyConnected {
|
||||||
|
container_name: proxy.container_name.clone(),
|
||||||
|
ip_address: proxy.ip_address.clone(),
|
||||||
|
port: proxy.port,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
drop(guard);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let min_request_interval_ms = config.min_request_interval_ms;
|
||||||
|
|
||||||
Ok(Self {
|
Ok(Self {
|
||||||
instances,
|
instances,
|
||||||
semaphore: Arc::new(Semaphore::new(actual_pool_size)),
|
semaphore: Arc::new(Semaphore::new(actual_pool_size)),
|
||||||
proxy_pool,
|
proxy_pool,
|
||||||
rotation_enabled,
|
rotation_enabled,
|
||||||
next_instance: Arc::new(Mutex::new(0)),
|
next_instance: Arc::new(Mutex::new(0)),
|
||||||
|
last_request_time: Arc::new(Mutex::new(Instant::now())),
|
||||||
|
min_request_interval_ms,
|
||||||
|
monitoring,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Execute a scraping task using an available instance from the pool.
|
|
||||||
///
|
|
||||||
/// When rotation is enabled:
|
|
||||||
/// - Uses only half of the instances at a time
|
|
||||||
/// - Rotates to the other half when an instance reaches its task limit
|
|
||||||
/// - Cycles through instances in round-robin fashion within the active half
|
|
||||||
///
|
|
||||||
/// When rotation is disabled:
|
|
||||||
/// - Uses all instances with random selection
|
|
||||||
pub async fn execute<T, F, Fut>(&self, url: String, parse: F) -> Result<T>
|
pub async fn execute<T, F, Fut>(&self, url: String, parse: F) -> Result<T>
|
||||||
where
|
where
|
||||||
T: Send + 'static,
|
T: Send + 'static,
|
||||||
@@ -127,108 +182,146 @@ impl ChromeDriverPool {
|
|||||||
{
|
{
|
||||||
let _permit = self.semaphore.acquire().await.map_err(|_| anyhow!("Pool closed"))?;
|
let _permit = self.semaphore.acquire().await.map_err(|_| anyhow!("Pool closed"))?;
|
||||||
|
|
||||||
|
{
|
||||||
|
let mut last_time = self.last_request_time.lock().await;
|
||||||
|
let elapsed = last_time.elapsed().as_millis() as u64;
|
||||||
|
|
||||||
|
if elapsed < self.min_request_interval_ms {
|
||||||
|
let wait_ms = self.min_request_interval_ms - elapsed;
|
||||||
|
drop(last_time); // Lock vor Sleep freigeben!
|
||||||
|
|
||||||
|
sleep(Duration::from_millis(wait_ms)).await;
|
||||||
|
|
||||||
|
let mut last_time = self.last_request_time.lock().await;
|
||||||
|
*last_time = Instant::now();
|
||||||
|
} else {
|
||||||
|
*last_time = Instant::now();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let random_index = random_range(0, self.instances.len() as u64) as usize;
|
||||||
|
// Index-Auswahl (vereinfacht, siehe unten für vollständige Rotation)
|
||||||
let index = if self.rotation_enabled {
|
let index = if self.rotation_enabled {
|
||||||
// Rotation mode: use only half of instances at a time
|
self.get_rotated_index().await?
|
||||||
let total_instances = self.instances.len();
|
|
||||||
let half_size = (total_instances + 1) / 2; // Round up for odd numbers
|
|
||||||
|
|
||||||
let mut next_idx = self.next_instance.lock().await;
|
|
||||||
let base_idx = *next_idx;
|
|
||||||
let mut selected_idx = base_idx;
|
|
||||||
let mut found_in_current_half = false;
|
|
||||||
|
|
||||||
// Try to find an available instance in the current half
|
|
||||||
for offset in 0..half_size {
|
|
||||||
let candidate_idx = (base_idx + offset) % half_size;
|
|
||||||
|
|
||||||
// Check if this instance has reached its task limit
|
|
||||||
let instance = &self.instances[candidate_idx];
|
|
||||||
let guard = instance.lock().await;
|
|
||||||
|
|
||||||
if guard.max_tasks_per_instance == 0 ||
|
|
||||||
guard.task_count < guard.max_tasks_per_instance {
|
|
||||||
// This instance is available
|
|
||||||
*next_idx = (candidate_idx + 1) % half_size;
|
|
||||||
selected_idx = candidate_idx;
|
|
||||||
found_in_current_half = true;
|
|
||||||
drop(guard);
|
|
||||||
break;
|
|
||||||
} else {
|
} else {
|
||||||
drop(guard);
|
random_index
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if !found_in_current_half {
|
|
||||||
// All instances in current half are at limit, switch to other half
|
|
||||||
crate::util::logger::log_info(
|
|
||||||
"Current half saturated, rotating to other half of instances"
|
|
||||||
).await;
|
|
||||||
|
|
||||||
let other_half_start = half_size;
|
|
||||||
let other_half_size = total_instances - half_size;
|
|
||||||
|
|
||||||
// Find available instance in other half
|
|
||||||
let mut found_in_other_half = false;
|
|
||||||
for offset in 0..other_half_size {
|
|
||||||
let candidate_idx = other_half_start + offset;
|
|
||||||
|
|
||||||
let instance = &self.instances[candidate_idx];
|
|
||||||
let guard = instance.lock().await;
|
|
||||||
|
|
||||||
if guard.max_tasks_per_instance == 0 ||
|
|
||||||
guard.task_count < guard.max_tasks_per_instance {
|
|
||||||
// Switch to this half for future requests
|
|
||||||
*next_idx = offset;
|
|
||||||
selected_idx = candidate_idx;
|
|
||||||
found_in_other_half = true;
|
|
||||||
drop(guard);
|
|
||||||
break;
|
|
||||||
} else {
|
|
||||||
drop(guard);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if !found_in_other_half {
|
|
||||||
// All instances saturated - use round-robin anyway
|
|
||||||
selected_idx = *next_idx % total_instances;
|
|
||||||
*next_idx = (*next_idx + 1) % total_instances;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
drop(next_idx);
|
|
||||||
selected_idx
|
|
||||||
} else {
|
|
||||||
// Non-rotation mode: random selection as before
|
|
||||||
rand::random_range(..self.instances.len())
|
|
||||||
};
|
};
|
||||||
|
|
||||||
let instance = self.instances[index].clone();
|
if let Some(ref mon) = self.monitoring {
|
||||||
let mut guard = instance.lock().await;
|
mon.emit(crate::monitoring::MonitoringEvent::TaskStarted {
|
||||||
|
instance_id: index,
|
||||||
guard.increment_task_count();
|
url: url.clone(),
|
||||||
|
});
|
||||||
if guard.max_tasks_per_instance > 0 {
|
mon.emit(crate::monitoring::MonitoringEvent::InstanceStatusChanged {
|
||||||
crate::util::logger::log_info(&format!(
|
instance_id: index,
|
||||||
"Instance {} task count: {}/{}",
|
status: crate::monitoring::InstanceStatusChange::Active,
|
||||||
index,
|
});
|
||||||
guard.get_task_count(),
|
|
||||||
guard.max_tasks_per_instance
|
|
||||||
))
|
|
||||||
.await;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
let client = guard.new_session().await?;
|
let instance = &self.instances[index];
|
||||||
|
let mut guard = instance.lock().await;
|
||||||
|
|
||||||
drop(guard); // release lock early
|
// NEU: Session mit automatischer Erneuerung holen!
|
||||||
|
let client = guard.get_or_renew_session().await?;
|
||||||
|
|
||||||
crate::util::logger::log_info(&format!("Scraping {} ...", url)).await;
|
guard.increment_task_count();
|
||||||
client.goto(&url).await.context("Navigation failed")?;
|
let (task_count, session_requests) = guard.get_session_stats().await;
|
||||||
|
|
||||||
let result = timeout(Duration::from_secs(90), parse(client))
|
crate::util::logger::log_info(&format!(
|
||||||
.await
|
"Instance {} executing task (tasks: {}/{}, session requests: {})",
|
||||||
.context("Parse timeout")??;
|
index, task_count, guard.max_tasks_per_instance, session_requests
|
||||||
|
)).await;
|
||||||
|
|
||||||
Ok(result)
|
drop(guard); // Lock freigeben vor Navigation
|
||||||
|
|
||||||
|
let start_time = Instant::now();
|
||||||
|
|
||||||
|
// Navigation mit Timeout
|
||||||
|
let navigation_result = timeout(
|
||||||
|
Duration::from_secs(60),
|
||||||
|
client.goto(&url)
|
||||||
|
).await;
|
||||||
|
|
||||||
|
match navigation_result {
|
||||||
|
Ok(Ok(_)) => {
|
||||||
|
if let Some(ref mon) = self.monitoring {
|
||||||
|
mon.emit(crate::monitoring::MonitoringEvent::TaskCompleted {
|
||||||
|
instance_id: index,
|
||||||
|
success: navigation_result.is_ok(),
|
||||||
|
duration_ms: start_time.elapsed().as_millis() as u64,
|
||||||
|
error: navigation_result.as_ref().err().map(|e| e.to_string()),
|
||||||
|
});
|
||||||
|
mon.emit(crate::monitoring::MonitoringEvent::InstanceStatusChanged {
|
||||||
|
instance_id: index,
|
||||||
|
status: crate::monitoring::InstanceStatusChange::Idle,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
crate::util::logger::log_info(&format!("✓ Navigated to {}", url)).await;
|
||||||
|
|
||||||
|
// Parse-Funktion ausführen
|
||||||
|
parse(client).await
|
||||||
|
}
|
||||||
|
Ok(Err(e)) => {
|
||||||
|
crate::util::logger::log_error(&format!("Navigation failed: {}", e)).await;
|
||||||
|
Err(anyhow!("Navigation failed: {}", e))
|
||||||
|
}
|
||||||
|
Err(_) => {
|
||||||
|
if let Some(ref mon) = self.monitoring {
|
||||||
|
mon.emit(crate::monitoring::MonitoringEvent::NavigationTimeout {
|
||||||
|
instance_id: index,
|
||||||
|
url: url.clone(),
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
crate::util::logger::log_error("Navigation timeout (60s)").await;
|
||||||
|
Err(anyhow!("Navigation timeout"))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn get_rotated_index(&self) -> Result<usize> {
|
||||||
|
let total = self.instances.len();
|
||||||
|
let half_size = total / 2;
|
||||||
|
|
||||||
|
if half_size == 0 {
|
||||||
|
return Ok(0); // Pool zu klein für Rotation
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut next_idx = self.next_instance.lock().await;
|
||||||
|
let current_half_start = if *next_idx < half_size { 0 } else { half_size };
|
||||||
|
let current_half_end = if *next_idx < half_size { half_size } else { total };
|
||||||
|
|
||||||
|
// Suche verfügbare Instanz in aktueller Hälfte
|
||||||
|
for offset in 0..(current_half_end - current_half_start) {
|
||||||
|
let candidate_idx = current_half_start + ((*next_idx + offset) % half_size);
|
||||||
|
|
||||||
|
let instance = &self.instances[candidate_idx];
|
||||||
|
let guard = instance.lock().await;
|
||||||
|
|
||||||
|
if guard.max_tasks_per_instance == 0 ||
|
||||||
|
guard.task_count < guard.max_tasks_per_instance {
|
||||||
|
*next_idx = (candidate_idx + 1) % total;
|
||||||
|
drop(guard);
|
||||||
|
return Ok(candidate_idx);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Aktuelle Hälfte voll → Zur anderen wechseln
|
||||||
|
crate::util::logger::log_info("Current half saturated, rotating to other half").await;
|
||||||
|
|
||||||
|
let new_half_start = if current_half_start == 0 { half_size } else { 0 };
|
||||||
|
let new_half_end = if current_half_start == 0 { total } else { half_size };
|
||||||
|
|
||||||
|
// Alte Hälfte zurücksetzen (für nächste Rotation)
|
||||||
|
for i in current_half_start..current_half_end {
|
||||||
|
let mut instance = self.instances[i].lock().await;
|
||||||
|
instance.reset_task_count();
|
||||||
|
}
|
||||||
|
|
||||||
|
*next_idx = new_half_start;
|
||||||
|
drop(next_idx);
|
||||||
|
|
||||||
|
Ok(new_half_start)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Gracefully shut down all ChromeDriver processes and Docker proxy containers.
|
/// Gracefully shut down all ChromeDriver processes and Docker proxy containers.
|
||||||
@@ -273,12 +366,28 @@ pub struct ChromeInstance {
|
|||||||
task_count: usize,
|
task_count: usize,
|
||||||
max_tasks_per_instance: usize,
|
max_tasks_per_instance: usize,
|
||||||
proxy_url: Option<String>,
|
proxy_url: Option<String>,
|
||||||
|
|
||||||
|
current_session: Arc<Mutex<Option<Client>>>, // Current active session
|
||||||
|
session_request_count: Arc<Mutex<usize>>,
|
||||||
|
max_requests_per_session: usize, // z.B. 25
|
||||||
|
|
||||||
|
proxy_pool: Option<Arc<DockerVpnProxyPool>>, // Referernce to the proxy pool
|
||||||
|
current_proxy_index: Arc<Mutex<usize>>, // Current proxy index in use
|
||||||
|
|
||||||
|
instance_id: usize,
|
||||||
|
monitoring: Option<crate::monitoring::MonitoringHandle>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl ChromeInstance {
|
impl ChromeInstance {
|
||||||
pub async fn new(proxy_url: Option<String>, max_tasks_per_instance: usize) -> Result<Self> {
|
pub async fn new(proxy_pool: Option<Arc<DockerVpnProxyPool>>, instance_id: usize, config: &Config, monitoring: Option<crate::monitoring::MonitoringHandle>) -> Result<Self> {
|
||||||
let (base_url, process, stderr_handle) = Self::spawn_chromedriver().await?;
|
let (base_url, process, stderr_handle) = Self::spawn_chromedriver().await?;
|
||||||
|
|
||||||
|
// Get proxy URL if proxy pool is provided
|
||||||
|
let proxy_url = proxy_pool.as_ref().map(|pp| pp.get_proxy_url(instance_id));
|
||||||
|
|
||||||
|
let max_tasks_per_instance = config.max_tasks_per_instance;
|
||||||
|
let max_requests_per_session = config.max_requests_per_session;
|
||||||
|
|
||||||
Ok(Self {
|
Ok(Self {
|
||||||
base_url,
|
base_url,
|
||||||
process,
|
process,
|
||||||
@@ -286,17 +395,170 @@ impl ChromeInstance {
|
|||||||
task_count: 0,
|
task_count: 0,
|
||||||
max_tasks_per_instance,
|
max_tasks_per_instance,
|
||||||
proxy_url,
|
proxy_url,
|
||||||
|
|
||||||
|
current_session: Arc::new(Mutex::new(None)),
|
||||||
|
session_request_count: Arc::new(Mutex::new(0)),
|
||||||
|
max_requests_per_session,
|
||||||
|
|
||||||
|
proxy_pool,
|
||||||
|
current_proxy_index: Arc::new(Mutex::new(instance_id)),
|
||||||
|
|
||||||
|
instance_id,
|
||||||
|
monitoring,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
pub async fn new_session(&self) -> Result<Client> {
|
pub async fn get_or_renew_session(&self) -> Result<Client> {
|
||||||
|
let mut session_opt = self.current_session.lock().await;
|
||||||
|
let mut request_count = self.session_request_count.lock().await;
|
||||||
|
|
||||||
|
let old_request_count = *request_count;
|
||||||
|
|
||||||
|
// Session erneuern wenn:
|
||||||
|
// 1. Keine Session vorhanden
|
||||||
|
// 2. Request-Limit erreicht
|
||||||
|
let needs_renewal = session_opt.is_none() || *request_count >= self.max_requests_per_session;
|
||||||
|
|
||||||
|
if needs_renewal {
|
||||||
|
if let Some(ref mon) = self.monitoring {
|
||||||
|
mon.emit(crate::monitoring::MonitoringEvent::InstanceStatusChanged {
|
||||||
|
instance_id: self.instance_id,
|
||||||
|
status: crate::monitoring::InstanceStatusChange::Renewing,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
// Alte Session schließen
|
||||||
|
if let Some(old_session) = session_opt.take() {
|
||||||
|
crate::util::logger::log_info("Closing old session").await;
|
||||||
|
let _ = old_session.close().await;
|
||||||
|
// Kurze Pause zwischen Sessions
|
||||||
|
let random_delay = random_range(500, 1000);
|
||||||
|
sleep(Duration::from_millis(random_delay)).await;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Neue Session mit frischem User-Agent erstellen
|
||||||
|
crate::util::logger::log_info(&format!(
|
||||||
|
"Creating new session (requests in last session: {})",
|
||||||
|
*request_count
|
||||||
|
)).await;
|
||||||
|
|
||||||
|
let new_session = self.create_fresh_session().await?;
|
||||||
|
*session_opt = Some(new_session.clone());
|
||||||
|
*request_count = 0;
|
||||||
|
|
||||||
|
if let Some(ref mon) = self.monitoring {
|
||||||
|
let reason = if *request_count >= self.max_requests_per_session {
|
||||||
|
crate::monitoring::RenewalReason::RequestLimit
|
||||||
|
} else {
|
||||||
|
crate::monitoring::RenewalReason::TaskLimit
|
||||||
|
};
|
||||||
|
|
||||||
|
// Get updated proxy info
|
||||||
|
let new_proxy_info = if let Some(ref pp) = self.proxy_pool {
|
||||||
|
let proxy_idx = *self.current_proxy_index.lock().await;
|
||||||
|
pp.get_proxy_info(proxy_idx)
|
||||||
|
} else {
|
||||||
|
self.proxy_url.as_ref().and_then(|url| {
|
||||||
|
if let Some(port_str) = url.split(':').last() {
|
||||||
|
if let Ok(port) = port_str.parse::<u16>() {
|
||||||
|
return Some(crate::monitoring::ProxyInfo {
|
||||||
|
container_name: format!("proxy-{}", self.instance_id),
|
||||||
|
ip_address: "127.0.0.1".to_string(),
|
||||||
|
port,
|
||||||
|
status: crate::monitoring::ProxyStatus::Connected,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
None
|
||||||
|
})
|
||||||
|
};
|
||||||
|
|
||||||
|
mon.emit(crate::monitoring::MonitoringEvent::SessionRenewed {
|
||||||
|
instance_id: self.instance_id,
|
||||||
|
old_request_count: *request_count,
|
||||||
|
reason: crate::monitoring::RenewalReason::RequestLimit,
|
||||||
|
new_proxy: new_proxy_info,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(new_session)
|
||||||
|
} else {
|
||||||
|
// Existierende Session verwenden
|
||||||
|
*request_count += 1;
|
||||||
|
Ok(session_opt.as_ref().unwrap().clone())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn create_fresh_session(&self) -> Result<Client> {
|
||||||
|
// Hole aktuellen Proxy-URL ohne self zu mutieren
|
||||||
|
let proxy_url = if let Some(ref pool) = self.proxy_pool {
|
||||||
|
let mut proxy_idx = self.current_proxy_index.lock().await;
|
||||||
|
*proxy_idx = (*proxy_idx + 1) % pool.num_proxies();
|
||||||
|
let url = pool.get_proxy_url(*proxy_idx);
|
||||||
|
|
||||||
|
crate::util::logger::log_info(&format!(
|
||||||
|
"Using proxy {} for new session",
|
||||||
|
*proxy_idx
|
||||||
|
)).await;
|
||||||
|
|
||||||
|
Some(url)
|
||||||
|
} else {
|
||||||
|
self.proxy_url.clone()
|
||||||
|
};
|
||||||
|
|
||||||
|
let user_agent = Self::chrome_user_agent();
|
||||||
|
let capabilities = self.chrome_args_with_ua(user_agent, &proxy_url);
|
||||||
|
|
||||||
ClientBuilder::native()
|
ClientBuilder::native()
|
||||||
.capabilities(self.chrome_args())
|
.capabilities(capabilities)
|
||||||
.connect(&self.base_url)
|
.connect(&self.base_url)
|
||||||
.await
|
.await
|
||||||
.context("Failed to connect to ChromeDriver")
|
.context("Failed to connect to ChromeDriver")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn chrome_args_with_ua(&self, user_agent: &str, proxy_url: &Option<String>) -> Map<String, Value> {
|
||||||
|
let mut args = vec![
|
||||||
|
"--headless=new".to_string(),
|
||||||
|
"--disable-gpu".to_string(),
|
||||||
|
"--no-sandbox".to_string(),
|
||||||
|
"--disable-dev-shm-usage".to_string(),
|
||||||
|
"--disable-infobars".to_string(),
|
||||||
|
"--disable-extensions".to_string(),
|
||||||
|
"--disable-popup-blocking".to_string(),
|
||||||
|
"--disable-notifications".to_string(),
|
||||||
|
"--disable-autofill".to_string(),
|
||||||
|
"--disable-sync".to_string(),
|
||||||
|
"--disable-default-apps".to_string(),
|
||||||
|
"--disable-translate".to_string(),
|
||||||
|
"--disable-blink-features=AutomationControlled".to_string(),
|
||||||
|
format!("--user-agent={}", user_agent),
|
||||||
|
];
|
||||||
|
|
||||||
|
if let Some(proxy) = proxy_url {
|
||||||
|
args.push(format!("--proxy-server={}", proxy));
|
||||||
|
}
|
||||||
|
|
||||||
|
let caps = serde_json::json!({
|
||||||
|
"goog:chromeOptions": {
|
||||||
|
"args": args,
|
||||||
|
"excludeSwitches": ["enable-logging", "enable-automation"],
|
||||||
|
"prefs": {
|
||||||
|
"profile.default_content_setting_values.notifications": 2
|
||||||
|
}
|
||||||
|
}
|
||||||
|
});
|
||||||
|
caps.as_object().cloned().unwrap()
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn reset_task_count(&mut self) {
|
||||||
|
self.task_count = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn get_session_stats(&self) -> (usize, usize) {
|
||||||
|
let request_count = *self.session_request_count.lock().await;
|
||||||
|
(self.task_count, request_count)
|
||||||
|
}
|
||||||
|
|
||||||
pub fn increment_task_count(&mut self) {
|
pub fn increment_task_count(&mut self) {
|
||||||
self.task_count += 1;
|
self.task_count += 1;
|
||||||
}
|
}
|
||||||
@@ -362,40 +624,15 @@ impl ChromeInstance {
|
|||||||
Err(anyhow!("ChromeDriver failed to start within 30s"))
|
Err(anyhow!("ChromeDriver failed to start within 30s"))
|
||||||
}
|
}
|
||||||
|
|
||||||
fn chrome_args(&self) -> Map<String, Value> {
|
pub fn chrome_user_agent() -> &'static str {
|
||||||
let mut args = vec![
|
static UAS: &[&str] = &[
|
||||||
"--headless=new".to_string(),
|
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.91 Safari/537.36",
|
||||||
"--disable-gpu".to_string(),
|
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.6312.122 Safari/537.36",
|
||||||
"--no-sandbox".to_string(),
|
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.6261.129 Safari/537.36",
|
||||||
"--disable-dev-shm-usage".to_string(),
|
"Mozilla/5.0 (Windows NT 11.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.91 Safari/537.36",
|
||||||
"--disable-infobars".to_string(),
|
|
||||||
"--disable-extensions".to_string(),
|
|
||||||
"--disable-popup-blocking".to_string(),
|
|
||||||
"--disable-notifications".to_string(),
|
|
||||||
"--disable-logging".to_string(),
|
|
||||||
"--disable-autofill".to_string(),
|
|
||||||
"--disable-sync".to_string(),
|
|
||||||
"--disable-default-apps".to_string(),
|
|
||||||
"--disable-translate".to_string(),
|
|
||||||
"--window-size=1920,1080".to_string(),
|
|
||||||
"--disable-blink-features=AutomationControlled".to_string(),
|
|
||||||
"--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36".to_string()
|
|
||||||
];
|
];
|
||||||
if let Some(ref proxy) = self.proxy_url {
|
let random_user_agent = choose_random(UAS);
|
||||||
let proxy = proxy.clone();
|
random_user_agent
|
||||||
let proxy_formatted = format!("--proxy-server={}", proxy);
|
|
||||||
args.push(proxy_formatted);
|
|
||||||
}
|
|
||||||
let caps = serde_json::json!({
|
|
||||||
"goog:chromeOptions": {
|
|
||||||
"args": args,
|
|
||||||
"excludeSwitches": ["enable-logging", "enable-automation"],
|
|
||||||
"prefs": {
|
|
||||||
"profile.default_content_setting_values.notifications": 2
|
|
||||||
}
|
|
||||||
}
|
|
||||||
});
|
|
||||||
caps.as_object().cloned().unwrap()
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -5,6 +5,8 @@ use tokio::sync::Mutex;
|
|||||||
use std::fs::{self, OpenOptions};
|
use std::fs::{self, OpenOptions};
|
||||||
use std::io::Write;
|
use std::io::Write;
|
||||||
use std::path::PathBuf;
|
use std::path::PathBuf;
|
||||||
|
use std::sync::Arc;
|
||||||
|
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||||
|
|
||||||
static LOGGER: Lazy<Mutex<Option<DebugLogger>>> = Lazy::new(|| Mutex::new(None));
|
static LOGGER: Lazy<Mutex<Option<DebugLogger>>> = Lazy::new(|| Mutex::new(None));
|
||||||
|
|
||||||
@@ -76,3 +78,83 @@ pub async fn log_warn(msg: &str) {
|
|||||||
pub async fn log_error(msg: &str) {
|
pub async fn log_error(msg: &str) {
|
||||||
log_detailed("ERROR", msg).await;
|
log_detailed("ERROR", msg).await;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
struct PoolLogger {
|
||||||
|
file: std::fs::File,
|
||||||
|
log_path: PathBuf,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl PoolLogger {
|
||||||
|
fn new(log_dir: &std::path::Path) -> std::io::Result<Self> {
|
||||||
|
fs::create_dir_all(log_dir)?;
|
||||||
|
let filename = format!("webdriver_{}.log", Local::now().format("%Y%m%d_%H%M%S"));
|
||||||
|
let log_path = log_dir.join(&filename);
|
||||||
|
let file = OpenOptions::new()
|
||||||
|
.create(true)
|
||||||
|
.append(true)
|
||||||
|
.open(&log_path)?;
|
||||||
|
Ok(Self { file, log_path })
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn log(&mut self, msg: &str) {
|
||||||
|
let line = format!("[{}] {}\n", Local::now().format("%H:%M:%S"), msg);
|
||||||
|
let _ = self.file.write_all(line.as_bytes());
|
||||||
|
let _ = self.file.flush();
|
||||||
|
println!("{}", line.trim_end());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct PoolMetrics {
|
||||||
|
pub total_requests: Arc<AtomicUsize>,
|
||||||
|
pub successful_requests: Arc<AtomicUsize>,
|
||||||
|
pub failed_requests: Arc<AtomicUsize>,
|
||||||
|
pub session_renewals: Arc<AtomicUsize>,
|
||||||
|
pub rotation_events: Arc<AtomicUsize>,
|
||||||
|
pub retries: Arc<AtomicUsize>,
|
||||||
|
|
||||||
|
// IMPROVEMENT: Neue Metriken für besseres Monitoring
|
||||||
|
pub navigation_timeouts: Arc<AtomicUsize>,
|
||||||
|
pub bot_detection_hits: Arc<AtomicUsize>,
|
||||||
|
pub proxy_failures: Arc<AtomicUsize>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl PoolMetrics {
|
||||||
|
pub fn new() -> Self {
|
||||||
|
Self {
|
||||||
|
total_requests: Arc::new(AtomicUsize::new(0)),
|
||||||
|
successful_requests: Arc::new(AtomicUsize::new(0)),
|
||||||
|
failed_requests: Arc::new(AtomicUsize::new(0)),
|
||||||
|
session_renewals: Arc::new(AtomicUsize::new(0)),
|
||||||
|
rotation_events: Arc::new(AtomicUsize::new(0)),
|
||||||
|
retries: Arc::new(AtomicUsize::new(0)),
|
||||||
|
navigation_timeouts: Arc::new(AtomicUsize::new(0)),
|
||||||
|
bot_detection_hits: Arc::new(AtomicUsize::new(0)),
|
||||||
|
proxy_failures: Arc::new(AtomicUsize::new(0)),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn log_stats(&self) {
|
||||||
|
let total = self.total_requests.load(Ordering::Relaxed);
|
||||||
|
let success = self.successful_requests.load(Ordering::Relaxed);
|
||||||
|
// FIX: Prefix unused variable with underscore
|
||||||
|
let _failed = self.failed_requests.load(Ordering::Relaxed);
|
||||||
|
let renewals = self.session_renewals.load(Ordering::Relaxed);
|
||||||
|
let rotations = self.rotation_events.load(Ordering::Relaxed);
|
||||||
|
let retries = self.retries.load(Ordering::Relaxed);
|
||||||
|
let timeouts = self.navigation_timeouts.load(Ordering::Relaxed);
|
||||||
|
let bot_hits = self.bot_detection_hits.load(Ordering::Relaxed);
|
||||||
|
let proxy_fails = self.proxy_failures.load(Ordering::Relaxed);
|
||||||
|
|
||||||
|
let success_rate = if total > 0 {
|
||||||
|
(success as f64 / total as f64) * 100.0
|
||||||
|
} else {
|
||||||
|
0.0
|
||||||
|
};
|
||||||
|
|
||||||
|
crate::util::logger::log_info(&format!(
|
||||||
|
"Pool Metrics: {} total requests, {:.1}% success rate, {} renewals, {} rotations, {} retries, {} timeouts, {} bot detections, {} proxy failures",
|
||||||
|
total, success_rate, renewals, rotations, retries, timeouts, bot_hits, proxy_fails
|
||||||
|
)).await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user