Compare commits
55 Commits
b0a471ea84...main
| SHA1 |
|---|
| eff1412c0f |
| 75ab1969c7 |
| f4b20f824d |
| 93fbefc9d4 |
| 4ea0c78d3d |
| 1d025a04ce |
| 98e1bca12f |
| 29d8f1d89e |
| c0c9bc0ed9 |
| 659757482d |
| bd74f36f4c |
| e6f8393660 |
| aff340ee2f |
| 0487c2ec49 |
| 04f4b0d0c4 |
| 6f05dc8c99 |
| ac1345798d |
| 766eb803f1 |
| 151c96e35f |
| ae1876b014 |
| c86d828940 |
| c6d301d434 |
| ba841248f0 |
| 8dd75f7bdf |
| ea128f6187 |
| 1720716144 |
| f9ce5bad99 |
| fc25f32cbc |
| 3d16475b79 |
| 86944a9c58 |
| f9f09d0291 |
| fb0876309f |
| c01b47000f |
| 5e81959322 |
| b366f366e6 |
| cd91de253b |
| c51b36c125 |
| 9c66f0d361 |
| d26e833d93 |
| d744769138 |
| 00c9d45642 |
| 1bda78897b |
| 470f0922ed |
| c9da56e8e9 |
| dde859b071 |
| 2416947e9d |
| 3ab5d0dcc3 |
| c2408d9a56 |
| f95e9e2427 |
| c00bfd8687 |
| 0f89c8c0ce |
| a6823dc938 |
| 58a498e694 |
| f7083bf9f0 |
| f05df0b5ee |
.env.example (new file, 42 lines)
@@ -0,0 +1,42 @@
+# WebScraper Configuration File (.env)
+# ====================================
+# This file configures the behavior of the WebScraper application
+# Copy to .env and adjust values as needed
+
+OPENFIGI_API_KEY=
+
+# Economic calendar start (usually the earliest available on finanzen.net)
+ECONOMIC_START_DATE=2007-02-13
+
+# Corporate earnings & price history start
+CORPORATE_START_DATE=2010-01-01
+
+# How far into the future we scrape economic events (in months)
+ECONOMIC_LOOKAHEAD_MONTHS=3
+
+# Maximum number of parallel scraping tasks (default: 4)
+MAX_PARALLEL_INSTANCES=10
+
+# ===== VPN ROTATION (ProtonVPN Integration) =====
+# Enable automatic VPN rotation between sessions?
+# If false, all traffic goes through system without VPN tunneling
+ENABLE_VPN_ROTATION=true
+
+# Number of tasks per VPN session before rotating to new server/IP
+# 0 = rotate between economic and corporate phases (one phase = one IP)
+# 5 = rotate every 5 tasks
+# NOTE: Must have ENABLE_VPN_ROTATION=true for this to take effect
+TASKS_PER_VPN_SESSION=50
+
+# ===== LOGGING =====
+# Set via RUST_LOG environment variable:
+# RUST_LOG=info cargo run
+# RUST_LOG=debug cargo run
+# Leave empty or unset for default logging level
+
+
+MAX_REQUESTS_PER_SESSION=25
+MIN_REQUEST_INTERVAL_MS=300
+MAX_RETRY_ATTEMPTS=3
+
+PROXY_INSTANCES_PER_CERTIFICATE=2
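For context, the values above are consumed through dotenvy in src/config.rs further down in this compare view. The sketch below only illustrates that read-with-fallback pattern; the function name is made up and not part of the repository.

```rust
use anyhow::{Context, Result};

// Illustrative only: mirrors the Config::load() pattern shown in src/config.rs below.
fn economic_lookahead_months() -> Result<u32> {
    let _ = dotenvy::dotenv(); // load .env if present, ignore if missing
    dotenvy::var("ECONOMIC_LOOKAHEAD_MONTHS")
        .unwrap_or_else(|_| "3".to_string()) // fall back to the documented default
        .parse()
        .context("Failed to parse ECONOMIC_LOOKAHEAD_MONTHS as u32")
}
```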
.gitignore (vendored, 23 lines changed)
@@ -27,10 +27,19 @@ target/
 
 # /chromedriver-win64/*
 
-# data folders
-/economic_events*
-/economic_event_changes*
-/corporate_events*
-/corporate_prices*
-/corporate_event_changes*
-/data*
+# data files
+**/*.json
+**/*.jsonl
+**/*.csv
+**/*.zip
+**/*.log
+**/*.ovpn
+**/*.tmp
+**/*.txt
+
+#/economic_events*
+#/economic_event_changes*
+#/corporate_events*
+#/corporate_prices*
+#/corporate_event_changes*
+#/data*
Cargo.lock (generated, 278 lines changed)
@@ -110,6 +110,17 @@ dependencies = [
  "tokio",
 ]
 
+[[package]]
+name = "async-trait"
+version = "0.1.89"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9035ad2d096bed7955a320ee7e2230574d28fd3c3a0f186cbea1ff3c7eed5dbb"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.110",
+]
+
 [[package]]
 name = "atomic-waker"
 version = "1.1.2"
@@ -122,6 +133,64 @@ version = "1.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8"
 
+[[package]]
+name = "axum"
+version = "0.7.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "edca88bc138befd0323b20752846e6587272d3b03b0343c8ea28a6f819e6e71f"
+dependencies = [
+ "async-trait",
+ "axum-core",
+ "base64 0.22.1",
+ "bytes",
+ "futures-util",
+ "http 1.3.1",
+ "http-body 1.0.1",
+ "http-body-util",
+ "hyper 1.8.1",
+ "hyper-util",
+ "itoa",
+ "matchit",
+ "memchr",
+ "mime",
+ "percent-encoding",
+ "pin-project-lite",
+ "rustversion",
+ "serde",
+ "serde_json",
+ "serde_path_to_error",
+ "serde_urlencoded",
+ "sha1",
+ "sync_wrapper",
+ "tokio",
+ "tokio-tungstenite 0.24.0",
+ "tower",
+ "tower-layer",
+ "tower-service",
+ "tracing",
+]
+
+[[package]]
+name = "axum-core"
+version = "0.4.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "09f2bd6146b97ae3359fa0cc6d6b376d9539582c7b4220f041a33ec24c226199"
+dependencies = [
+ "async-trait",
+ "bytes",
+ "futures-util",
+ "http 1.3.1",
+ "http-body 1.0.1",
+ "http-body-util",
+ "mime",
+ "pin-project-lite",
+ "rustversion",
+ "sync_wrapper",
+ "tower-layer",
+ "tower-service",
+ "tracing",
+]
+
 [[package]]
 name = "base64"
 version = "0.21.7"
@@ -660,31 +729,6 @@ dependencies = [
  "windows-sys 0.61.2",
 ]
 
-[[package]]
-name = "event_backtest_engine"
-version = "0.1.0"
-dependencies = [
- "anyhow",
- "chrono",
- "csv",
- "dotenvy",
- "fantoccini",
- "flate2",
- "futures",
- "rand 0.9.2",
- "rayon",
- "reqwest",
- "scraper",
- "serde",
- "serde_json",
- "tokio",
- "toml",
- "tracing",
- "tracing-subscriber",
- "yfinance-rs",
- "zip",
-]
-
 [[package]]
 name = "fantoccini"
 version = "0.20.0"
@@ -1096,6 +1140,7 @@ dependencies = [
  "http 1.3.1",
  "http-body 1.0.1",
  "httparse",
+ "httpdate",
  "itoa",
  "pin-project-lite",
  "pin-utils",
@@ -1519,6 +1564,12 @@ dependencies = [
  "regex-automata",
 ]
 
+[[package]]
+name = "matchit"
+version = "0.7.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0e7465ac9959cc2b1404e8e2367b43684a6d13790fe23056cc8c6c5a6b7bcb94"
+
 [[package]]
 name = "memchr"
 version = "2.7.6"
@@ -2414,9 +2465,9 @@ checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d"
 
 [[package]]
 name = "rustix"
-version = "1.1.2"
+version = "1.1.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cd15f8a2c5551a84d56efdc1cd049089e409ac19a3072d5037a17fd70719ff3e"
+checksum = "146c9e247ccc180c1f61615433868c99f3de3ae256a30a43b49f67c2d9171f34"
 dependencies = [
  "bitflags",
  "errno",
@@ -2527,6 +2578,15 @@ version = "1.0.20"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f"
 
+[[package]]
+name = "same-file"
+version = "1.0.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502"
+dependencies = [
+ "winapi-util",
+]
+
 [[package]]
 name = "schannel"
 version = "0.1.28"
@@ -2673,10 +2733,21 @@ dependencies = [
 ]
 
 [[package]]
-name = "serde_spanned"
-version = "1.0.3"
+name = "serde_path_to_error"
+version = "0.1.20"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e24345aa0fe688594e73770a5f6d1b216508b4f93484c0026d521acd30134392"
+checksum = "10a9ff822e371bb5403e391ecd83e182e0e77ba7f6fe0160b795797109d1b457"
+dependencies = [
+ "itoa",
+ "serde",
+ "serde_core",
+]
+
+[[package]]
+name = "serde_spanned"
+version = "1.0.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f8bbf91e5a4d6315eee45e704372590b30e260ee83af6639d64557f51b067776"
 dependencies = [
  "serde_core",
 ]
@@ -2912,9 +2983,9 @@ checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369"
 
 [[package]]
 name = "tempfile"
-version = "3.23.0"
+version = "3.24.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2d31c77bdf42a745371d260a26ca7163f1e0924b64afa0b688e61b5a9fa02f16"
+checksum = "655da9c7eb6305c55742045d5a8d2037996d61d8de95806335c7c86ce0f82e9c"
 dependencies = [
  "fastrand",
  "getrandom 0.3.4",
@@ -3097,6 +3168,30 @@ dependencies = [
  "tokio",
 ]
 
+[[package]]
+name = "tokio-tungstenite"
+version = "0.21.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c83b561d025642014097b66e6c1bb422783339e0909e4429cde4749d1990bc38"
+dependencies = [
+ "futures-util",
+ "log",
+ "tokio",
+ "tungstenite 0.21.0",
+]
+
+[[package]]
+name = "tokio-tungstenite"
+version = "0.24.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "edc5f74e248dc973e0dbb7b74c7e0d6fcc301c694ff50049504004ef4d0cdcd9"
+dependencies = [
+ "futures-util",
+ "log",
+ "tokio",
+ "tungstenite 0.24.0",
+]
+
 [[package]]
 name = "tokio-tungstenite"
 version = "0.28.0"
@@ -3110,7 +3205,7 @@ dependencies = [
  "rustls-pki-types",
  "tokio",
  "tokio-rustls 0.26.4",
- "tungstenite",
+ "tungstenite 0.28.0",
 ]
 
 [[package]]
@@ -3128,9 +3223,9 @@ dependencies = [
 
 [[package]]
 name = "toml"
-version = "0.9.8"
+version = "0.9.11+spec-1.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f0dc8b1fb61449e27716ec0e1bdf0f6b8f3e8f6b05391e8497b8b6d7804ea6d8"
+checksum = "f3afc9a848309fe1aaffaed6e1546a7a14de1f935dc9d89d32afd9a44bab7c46"
 dependencies = [
  "indexmap",
  "serde_core",
@@ -3143,9 +3238,9 @@ dependencies = [
 
 [[package]]
 name = "toml_datetime"
-version = "0.7.3"
+version = "0.7.5+spec-1.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f2cdb639ebbc97961c51720f858597f7f24c4fc295327923af55b74c3c724533"
+checksum = "92e1cfed4a3038bc5a127e35a2d360f145e1f4b971b551a2ba5fd7aedf7e1347"
 dependencies = [
  "serde_core",
 ]
@@ -3164,18 +3259,18 @@ dependencies = [
 
 [[package]]
 name = "toml_parser"
-version = "1.0.4"
+version = "1.0.6+spec-1.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c0cbe268d35bdb4bb5a56a2de88d0ad0eb70af5384a99d648cd4b3d04039800e"
+checksum = "a3198b4b0a8e11f09dd03e133c0280504d0801269e9afa46362ffde1cbeebf44"
 dependencies = [
  "winnow",
 ]
 
 [[package]]
 name = "toml_writer"
-version = "1.0.4"
+version = "1.0.6+spec-1.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "df8b2b54733674ad286d16267dcfc7a71ed5c776e4ac7aa3c3e2561f7c637bf2"
+checksum = "ab16f14aed21ee8bfd8ec22513f7287cd4a91aa92e44edfe2c17ddd004e92607"
 
 [[package]]
 name = "tower"
@@ -3190,6 +3285,7 @@ dependencies = [
  "tokio",
  "tower-layer",
  "tower-service",
+ "tracing",
 ]
 
 [[package]]
@@ -3228,6 +3324,7 @@ version = "0.1.41"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "784e0ac535deb450455cbfa28a6f0df145ea1bb7ae51b821cf5e7927fdcfbdd0"
 dependencies = [
+ "log",
  "pin-project-lite",
  "tracing-attributes",
  "tracing-core",
@@ -3289,6 +3386,43 @@ version = "0.2.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b"
 
+[[package]]
+name = "tungstenite"
+version = "0.21.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9ef1a641ea34f399a848dea702823bbecfb4c486f911735368f1f137cb8257e1"
+dependencies = [
+ "byteorder",
+ "bytes",
+ "data-encoding",
+ "http 1.3.1",
+ "httparse",
+ "log",
+ "rand 0.8.5",
+ "sha1",
+ "thiserror 1.0.69",
+ "url",
+ "utf-8",
+]
+
+[[package]]
+name = "tungstenite"
+version = "0.24.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "18e5b8366ee7a95b16d32197d0b2604b43a0be89dc5fac9f8e96ccafbaedda8a"
+dependencies = [
+ "byteorder",
+ "bytes",
+ "data-encoding",
+ "http 1.3.1",
+ "httparse",
+ "log",
+ "rand 0.8.5",
+ "sha1",
+ "thiserror 1.0.69",
+ "utf-8",
+]
+
 [[package]]
 name = "tungstenite"
 version = "0.28.0"
@@ -3350,6 +3484,12 @@ dependencies = [
  "serde",
 ]
 
+[[package]]
+name = "urlencoding"
+version = "2.1.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "daf8dba3b7eb870caf1ddeed7bc9d2a049f3cfdfae7cb521b087cc33ae4c49da"
+
 [[package]]
 name = "utf-8"
 version = "0.7.6"
@@ -3368,6 +3508,7 @@ version = "1.18.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2f87b8aa10b915a06587d0dec516c282ff295b475d94abf425d62b57710070a2"
 dependencies = [
+ "getrandom 0.3.4",
 "js-sys",
 "wasm-bindgen",
 ]
@@ -3390,6 +3531,16 @@ version = "0.9.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a"
 
+[[package]]
+name = "walkdir"
+version = "2.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b"
+dependencies = [
+ "same-file",
+ "winapi-util",
+]
+
 [[package]]
 name = "want"
 version = "0.3.1"
@@ -3492,6 +3643,40 @@ dependencies = [
  "wasm-bindgen",
 ]
 
+[[package]]
+name = "web_scraper"
+version = "0.1.0"
+dependencies = [
+ "anyhow",
+ "axum",
+ "chrono",
+ "csv",
+ "dotenvy",
+ "fantoccini",
+ "flate2",
+ "futures",
+ "once_cell",
+ "rand 0.9.2",
+ "rayon",
+ "regex",
+ "reqwest",
+ "scraper",
+ "serde",
+ "serde_json",
+ "sha2",
+ "tokio",
+ "tokio-tungstenite 0.21.0",
+ "toml",
+ "tracing",
+ "tracing-subscriber",
+ "url",
+ "urlencoding",
+ "uuid",
+ "walkdir",
+ "yfinance-rs",
+ "zip",
+]
+
 [[package]]
 name = "webdriver"
 version = "0.50.0"
@@ -3521,6 +3706,15 @@ dependencies = [
  "rustls-pki-types",
 ]
 
+[[package]]
+name = "winapi-util"
+version = "0.1.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22"
+dependencies = [
+ "windows-sys 0.61.2",
+]
+
 [[package]]
 name = "windows-core"
 version = "0.62.2"
@@ -3797,7 +3991,7 @@ dependencies = [
  "serde_json",
  "thiserror 2.0.17",
  "tokio",
- "tokio-tungstenite",
+ "tokio-tungstenite 0.28.0",
  "url",
 ]
Cargo.toml (24 lines changed)
@@ -1,7 +1,7 @@
 [package]
-name = "event_backtest_engine"
+name = "web_scraper"
 version = "0.1.0"
-edition = "2021"
+edition = "2024"
 authors = ["Your Name <you@example.com>"]
 description = "High-impact economic & corporate earnings data collector for short-event backtesting (overnight/weekend gaps)"
 license = "MIT OR Apache-2.0"
@@ -17,10 +17,12 @@ categories = ["finance", "data-structures", "asynchronous"]
 tokio = { version = "1.38", features = ["full"] }
 
 # Web scraping & HTTP
-reqwest = { version = "0.12", features = ["json", "gzip", "brotli", "deflate", "blocking"] }
+reqwest = { version = "0.12", features = ["json", "gzip", "brotli", "deflate", "blocking", "socks", "cookies"] }
 scraper = "0.19" # HTML parsing for Yahoo earnings pages
 fantoccini = { version = "0.20", features = ["rustls-tls"] } # Headless Chrome for finanzen.net
 yfinance-rs = "0.7.2"
+url = "2.5.7"
+urlencoding = "2.1"
 
 # Serialization
 serde = { version = "1.0", features = ["derive"] }
@@ -29,6 +31,10 @@ csv = "1.3"
 zip = "6.0.0"
 flate2 = "1.1.5"
 
+# Formatting
+regex = "1.12.2"
+walkdir = "2"
+
 # Generating
 rand = "0.9.2"
 
@@ -45,7 +51,19 @@ anyhow = "1.0"
 # Logging (optional but recommended)
 tracing = "0.1"
 tracing-subscriber = { version = "0.3", features = ["fmt", "env-filter"] }
+once_cell = "1.21.3"
 
 # Parallel processing (for batch tickers)
 futures = "0.3"
 rayon = "1.10" # optional: for parallel price downloads
+
+# Web server for dashboard
+axum = { version = "0.7", features = ["ws"] }
+tokio-tungstenite = "0.21" # For WebSocket support
+
+# tests
+#tempfile = "3.24.0"
+
+# data integrity
+sha2 = "0.10.9"
+uuid = { version = "1.0", features = ["v4", "v7"] }
@@ -249,3 +249,7 @@ Der Scraper unterstützt 52 Länder und Regionen (siehe `countries.json`), darun
 
 https://chromedriver.storage.googleapis.com/index.html
 https://googlechromelabs.github.io/chrome-for-testing/
+
+## Gaphviz.org Download
+
+https://graphviz.org/download/
cache/openfigi/INFO.md (new file, vendored, 15 lines)
@@ -0,0 +1,15 @@
+# Openfigi Data
+
+## Market Security Description
+| Code | Meaning |
+| ---------- | --------------------------------------------------------- |
+| **Comdty** | Commodity (e.g., oil, gold futures, physical commodities) |
+| **Corp** | Corporate bond / corporate debt security |
+| **Curncy** | Currency or FX pair (e.g., EURUSD) |
+| **Equity** | Stocks / shares |
+| **Govt** | Government bond (Treasuries, Bunds, Gilts, etc.) |
+| **Index** | Market indices (S&P 500, DAX, NYSE Composite…) |
+| **M-Mkt** | Money market instruments (commercial paper, CDs, T-bills) |
+| **Mtge** | Mortgage-backed securities (MBS) |
+| **Muni** | Municipal bonds (US state/local government debt) |
+| **Pfd** | Preferred shares |
data/INFO.md (new file, 15 lines)
@@ -0,0 +1,15 @@
+# Global Data Info
+
+## Exchanges
+
+Source: Wikipedia
+
+## Gleif
+
+Data Download [.zip] over Website
+
+## OpenFigi
+
+Data Scraping over open API
+
+Api Key: .env
data/economic/INFO.md (new file, 6 lines)
@@ -0,0 +1,6 @@
+# Economic Info
+
+## Sources
+
+* continents: finanzen.net
+* countries: finanzen.net
data_updating_rule.md (new file, 25 lines)
@@ -0,0 +1,25 @@
+# Abort-Safe Incremental JSONL Persistence Rule
+
+**Rule:** Persist state using an *append-only, fsync-backed JSONL log with atomic checkpoints*.
+
+**Requirements**
+- Write updates as **single-line JSON objects** (one logical mutation per line).
+- **Append only** (`O_APPEND`), never modify existing lines.
+- After each write batch, call **`fsync`** (or `File::sync_data`) before reporting success.
+- Treat a **line as committed only if it ends with `\n`**; ignore trailing partial lines on recovery.
+- Periodically create a **checkpoint**:
+  - Write full state to `state.tmp`
+  - `fsync`
+  - **Atomic rename** to `state.jsonl`
+- On startup:
+  - Load last checkpoint
+  - Replay log lines after it in order
+- On abort/panic/crash:
+  - No truncation
+  - Replay guarantees no data loss beyond last fsynced line
+
+**Outcome**
+- Crash/abort-safe
+- O(1) writes
+- Deterministic recovery
+- Minimal overhead
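A minimal sketch of the rule above, using only the standard library. The file names state.tmp and state.jsonl come from the rule text; the function names and directory layout are illustrative, not the project's actual implementation.

```rust
use std::fs::{self, File, OpenOptions};
use std::io::Write;

// Append one committed JSONL line: write, newline, then fsync before reporting success.
fn append_update(log_path: &str, json_line: &str) -> std::io::Result<()> {
    let mut log = OpenOptions::new().create(true).append(true).open(log_path)?;
    log.write_all(json_line.as_bytes())?;
    log.write_all(b"\n")?; // the trailing '\n' is what marks the line as committed
    log.sync_data() // durability barrier (fsync) before success is reported
}

// Checkpoint: write the full state to state.tmp, fsync, then atomically rename to state.jsonl.
fn write_checkpoint(dir: &str, full_state_json: &str) -> std::io::Result<()> {
    let tmp = format!("{dir}/state.tmp");
    let mut f = File::create(&tmp)?;
    f.write_all(full_state_json.as_bytes())?;
    f.sync_data()?;
    fs::rename(&tmp, format!("{dir}/state.jsonl")) // readers see old or new state, never half
}
```

On restart, recovery would load state.jsonl, then replay only log lines that end with a newline, which is exactly the commit criterion stated above.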
event_backtest_engine.exe (new binary file)
Binary file not shown.
integrity/checkpoint_dependencies.dot (new file, 28 lines)
@@ -0,0 +1,28 @@
+digraph Dependencies {
+    rankdir=LR;
+    node [shape=box];
+
+    "yahoo_options_enrichment_complete" [label="yahoo_options_enrichment_complete
+Options data enriched for all companies"];
+    "yahoo_events_enrichment_complete" [label="yahoo_events_enrichment_complete
+Corporate events enriched for all companies"];
+    "yahoo_companies_cleansed_no_data" [label="yahoo_companies_cleansed_no_data
+Companies cleansed of data with no Yahoo results"];
+    "yahoo_chart_enrichment_complete" [label="yahoo_chart_enrichment_complete
+Chart data enriched for all companies"];
+    "enrichment_group" [label="enrichment_group
+Yahoo exchanges collected and validated"];
+    "yahoo_companies_cleansed_low_profile" [label="yahoo_companies_cleansed_low_profile
+Companies cleansed of low profile (insufficient market cap/price data)"];
+    "lei_figi_mapping_complete" [label="lei_figi_mapping_complete
+LEI-to-FIGI mappings from OpenFIGI API"];
+    "securities_data_complete" [label="securities_data_complete
+Securities data built from FIGI mappings"];
+
+    "yahoo_options_enrichment_complete" -> "yahoo_companies_cleansed_low_profile" [label="via group enrichment_group"];
+    "yahoo_events_enrichment_complete" -> "yahoo_companies_cleansed_low_profile" [label="via group enrichment_group"];
+    "yahoo_companies_cleansed_no_data" -> "securities_data_complete";
+    "yahoo_chart_enrichment_complete" -> "yahoo_companies_cleansed_low_profile" [label="via group enrichment_group"];
+    "yahoo_companies_cleansed_low_profile" -> "yahoo_companies_cleansed_no_data";
+    "securities_data_complete" -> "lei_figi_mapping_complete";
+}
integrity/checkpoint_dependencies.toml (new file, 61 lines)
@@ -0,0 +1,61 @@
+# checkpoint_dependencies.toml - Complete configuration
+
+# ============================================================================
+# COLLECTION STAGE (No dependencies)
+# ============================================================================
+
+[checkpoints.lei_figi_mapping_complete]
+description = "LEI-to-FIGI mappings from OpenFIGI API"
+depends_on = []
+
+[checkpoints.securities_data_complete]
+description = "Securities data built from FIGI mappings"
+depends_on = ["lei_figi_mapping_complete"]
+
+# ============================================================================
+# CLEANSING STAGE (Depends on collection)
+# ============================================================================
+
+[checkpoints.yahoo_companies_cleansed_no_data]
+description = "Companies cleansed of data with no Yahoo results"
+depends_on = ["securities_data_complete"]
+
+[checkpoints.yahoo_companies_cleansed_low_profile]
+description = "Companies cleansed of low profile (insufficient market cap/price data)"
+depends_on = ["yahoo_companies_cleansed_no_data"]
+
+# ============================================================================
+# ENRICHMENT GROUP (All depend on cleansed companies)
+# ============================================================================
+
+[groups.enrichment_group]
+description = "Yahoo Finance enrichment functions"
+members = [
+    "yahoo_events_enrichment_complete",
+    "yahoo_options_enrichment_complete",
+    "yahoo_chart_enrichment_complete"
+]
+depends_on = ["yahoo_companies_cleansed_low_profile"]
+
+[checkpoints.yahoo_events_enrichment_complete]
+description = "Corporate events enriched for all companies"
+depends_on = []
+group = "enrichment_group"
+
+[checkpoints.yahoo_options_enrichment_complete]
+description = "Options data enriched for all companies"
+depends_on = []
+group = "enrichment_group"
+
+[checkpoints.yahoo_chart_enrichment_complete]
+description = "Chart data enriched for all companies"
+depends_on = []
+group = "enrichment_group"
+
+# ============================================================================
+# SECURITIES PROCESSING (Depends on LEI mapping)
+# ============================================================================
+
+[checkpoints.enrichment_group]
+description = "Yahoo exchanges collected and validated"
+depends_on = []
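A minimal sketch of how this TOML could be deserialized with the serde and toml crates already listed in Cargo.toml. The struct names are made up for illustration and are not taken from the project's actual loader, which this diff does not show.

```rust
use std::collections::HashMap;
use serde::Deserialize;

#[derive(Debug, Deserialize)]
struct CheckpointConfig {
    #[serde(default)]
    checkpoints: HashMap<String, Checkpoint>, // [checkpoints.*] tables
    #[serde(default)]
    groups: HashMap<String, Group>,           // [groups.*] tables
}

#[derive(Debug, Deserialize)]
struct Checkpoint {
    description: String,
    #[serde(default)]
    depends_on: Vec<String>,
    group: Option<String>, // e.g. "enrichment_group"
}

#[derive(Debug, Deserialize)]
struct Group {
    description: String,
    members: Vec<String>,
    #[serde(default)]
    depends_on: Vec<String>,
}

fn load_dependencies(path: &str) -> anyhow::Result<CheckpointConfig> {
    let text = std::fs::read_to_string(path)?;
    Ok(toml::from_str(&text)?)
}
```

A loader along these lines would also be the natural place to verify that every depends_on entry names an existing checkpoint or group before the graph is used.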
src/config.rs (127 lines changed)
@@ -1,23 +1,35 @@
+// src/config.rs - FIXED VERSION
+
 use anyhow::{Context, Result};
 use chrono::{self};
 use serde::{Deserialize, Serialize};
 
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct Config {
-    // Economic calendar start (usually the earliest available on finanzen.net)
-    pub economic_start_date: String, // e.g. "2007-02-13"
-    // Corporate earnings & price history start
-    pub corporate_start_date: String, // e.g. "2000-01-01" or "2010-01-01"
-    // How far into the future we scrape economic events
-    pub economic_lookahead_months: u32, // default: 3
-    /// Maximum number of parallel scraping tasks (default: 10).
-    /// This limits concurrency to protect system load and prevent website spamming.
-    #[serde(default = "default_max_parallel")]
-    pub max_parallel_tasks: usize,
-}
-
-fn default_max_parallel() -> usize {
-    10
+    pub economic_start_date: String,
+    pub corporate_start_date: String,
+    pub economic_lookahead_months: u32,
+
+    #[serde(default = "default_max_parallel_instances")]
+    pub max_parallel_instances: usize,
+
+    pub max_tasks_per_instance: usize,
+
+    #[serde(default = "default_enable_vpn_rotation")]
+    pub enable_vpn_rotation: bool,
+
+    // IMPROVEMENT: Reduzierte Defaults für weniger aggressive Scraping
+    #[serde(default = "default_max_requests_per_session")]
+    pub max_requests_per_session: usize,
+
+    #[serde(default = "default_min_request_interval_ms")]
+    pub min_request_interval_ms: u64,
+
+    #[serde(default = "default_max_retry_attempts")]
+    pub max_retry_attempts: u32,
+
+    #[serde(default = "default_proxy_instances_per_certificate")]
+    pub proxy_instances_per_certificate: Option<usize>,
 }
 
 impl Default for Config {
@@ -26,26 +38,42 @@ impl Default for Config {
             economic_start_date: "2007-02-13".to_string(),
             corporate_start_date: "2010-01-01".to_string(),
             economic_lookahead_months: 3,
-            max_parallel_tasks: default_max_parallel(),
+            max_parallel_instances: default_max_parallel_instances(),
+            max_tasks_per_instance: 0,
+            max_requests_per_session: default_max_requests_per_session(),
+            min_request_interval_ms: default_min_request_interval_ms(),
+            max_retry_attempts: default_max_retry_attempts(),
+            enable_vpn_rotation: false,
+            proxy_instances_per_certificate: default_proxy_instances_per_certificate(),
         }
     }
 }
 
+fn default_enable_vpn_rotation() -> bool {
+    false
+}
+
+fn default_max_parallel_instances() -> usize {
+    4
+}
+
+fn default_max_requests_per_session() -> usize {
+    10
+}
+
+fn default_min_request_interval_ms() -> u64 {
+    1200
+}
+
+fn default_max_retry_attempts() -> u32 { 3 }
+
+fn default_proxy_instances_per_certificate() -> Option<usize> {
+    Some(1)
+}
+
 impl Config {
-    /// Loads the configuration from environment variables using dotenvy.
-    ///
-    /// This function loads a `.env` file if present (via `dotenvy::dotenv()`),
-    /// then retrieves each configuration value from environment variables.
-    /// If a variable is missing, it falls back to the default value.
-    /// Variable names are uppercase with underscores (e.g., ECONOMIC_START_DATE).
-    ///
-    /// # Returns
-    /// The loaded Config on success.
-    ///
-    /// # Errors
-    /// Returns an error if parsing fails (e.g., invalid integer for lookahead months).
+    /// Loads configuration from environment variables using dotenvy.
     pub fn load() -> Result<Self> {
-        // Load .env file if it exists; ignore if not found (dotenvy::dotenv returns Ok if no file)
         let _ = dotenvy::dotenv().context("Failed to load .env file (optional)")?;
 
         let economic_start_date = dotenvy::var("ECONOMIC_START_DATE")
@@ -59,16 +87,53 @@ impl Config {
             .parse()
             .context("Failed to parse ECONOMIC_LOOKAHEAD_MONTHS as u32")?;
 
-        let max_parallel_tasks: usize = dotenvy::var("MAX_PARALLEL_TASKS")
-            .unwrap_or_else(|_| "10".to_string())
+        // IMPROVEMENT: Reduzierte Defaults
+        let max_parallel_instances: usize = dotenvy::var("MAX_PARALLEL_INSTANCES")
+            .unwrap_or_else(|_| "4".to_string()) // Geändert von 10
             .parse()
-            .context("Failed to parse MAX_PARALLEL_TASKS as usize")?;
+            .context("Failed to parse MAX_PARALLEL_INSTANCES as usize")?;
+
+        let max_tasks_per_instance: usize = dotenvy::var("MAX_TASKS_PER_INSTANCE")
+            .unwrap_or_else(|_| "5".to_string()) // Geändert von 0
+            .parse()
+            .context("Failed to parse MAX_TASKS_PER_INSTANCE as usize")?;
+
+        let enable_vpn_rotation = dotenvy::var("ENABLE_VPN_ROTATION")
+            .unwrap_or_else(|_| "false".to_string())
+            .parse::<bool>()
+            .context("Failed to parse ENABLE_VPN_ROTATION as bool")?;
+
+        let max_requests_per_session: usize = dotenvy::var("MAX_REQUESTS_PER_SESSION")
+            .unwrap_or_else(|_| "10".to_string()) // Geändert von 25
+            .parse()
+            .context("Failed to parse MAX_REQUESTS_PER_SESSION as usize")?;
+
+        let min_request_interval_ms: u64 = dotenvy::var("MIN_REQUEST_INTERVAL_MS")
+            .unwrap_or_else(|_| "1200".to_string()) // Geändert von 300
+            .parse()
+            .context("Failed to parse MIN_REQUEST_INTERVAL_MS as u64")?;
+
+        let max_retry_attempts: u32 = dotenvy::var("MAX_RETRY_ATTEMPTS")
+            .unwrap_or_else(|_| "3".to_string())
+            .parse()
+            .context("Failed to parse MAX_RETRY_ATTEMPTS as u32")?;
+
+        let proxy_instances_per_certificate: Option<usize> = match dotenvy::var("PROXY_INSTANCES_PER_CERTIFICATE") {
+            Ok(val) => Some(val.parse().context("Failed to parse PROXY_INSTANCES_PER_CERTIFICATE as usize")?),
+            Err(_) => Some(1),
+        };
 
         Ok(Self {
             economic_start_date,
             corporate_start_date,
             economic_lookahead_months,
-            max_parallel_tasks,
+            max_parallel_instances,
+            max_tasks_per_instance,
+            enable_vpn_rotation,
+            max_requests_per_session,
+            min_request_interval_ms,
+            max_retry_attempts,
+            proxy_instances_per_certificate,
        })
    }
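A hypothetical call site for the new Config::load(); the field names come from the struct above, while the surrounding main() is illustrative only and not part of this diff.

```rust
#[tokio::main]
async fn main() -> anyhow::Result<()> {
    let config = Config::load()?; // reads .env via dotenvy, falling back to the defaults above
    println!(
        "max_parallel_instances = {}, enable_vpn_rotation = {}",
        config.max_parallel_instances, config.enable_vpn_rotation
    );
    Ok(())
}
```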
src/corporate/aggregation.rs (deleted, 194 lines)
@@ -1,194 +0,0 @@
-// src/corporate/aggregation.rs
-use super::types::CompanyPrice;
-use super::storage::*;
-use tokio::fs;
-use std::collections::HashMap;
-
-#[derive(Debug)]
-struct DayData {
-    sources: Vec<(CompanyPrice, String)>, // (price, source_ticker)
-    total_volume: u64,
-    vwap: f64,
-    open: f64,
-    high: f64,
-    low: f64,
-    close: f64,
-}
-
-/// Aggregate price data from multiple exchanges, converting all to USD
-pub async fn aggregate_best_price_data(lei: &str) -> anyhow::Result<()> {
-    let company_dir = get_company_dir(lei);
-
-    for timeframe in ["daily", "5min"].iter() {
-        let source_dir = company_dir.join(timeframe);
-        if !source_dir.exists() {
-            continue;
-        }
-
-        let mut all_prices: Vec<(CompanyPrice, String)> = Vec::new();
-        let mut by_date_time: HashMap<String, DayData> = HashMap::new();
-
-        // Load all sources with their ticker names
-        let mut entries = tokio::fs::read_dir(&source_dir).await?;
-        let mut source_count = 0;
-        let mut sources_used = std::collections::HashSet::new();
-
-        while let Some(entry) = entries.next_entry().await? {
-            let source_dir_path = entry.path();
-            if !source_dir_path.is_dir() { continue; }
-
-            let source_ticker = source_dir_path
-                .file_name()
-                .and_then(|n| n.to_str())
-                .unwrap_or("unknown")
-                .to_string();
-
-            let prices_path = source_dir_path.join("prices.json");
-            if !prices_path.exists() { continue; }
-
-            let content = tokio::fs::read_to_string(&prices_path).await?;
-            let mut prices: Vec<CompanyPrice> = serde_json::from_str(&content)?;
-
-            if !prices.is_empty() {
-                sources_used.insert(source_ticker.clone());
-                source_count += 1;
-            }
-
-            for price in prices {
-                all_prices.push((price, source_ticker.clone()));
-            }
-        }
-
-        if all_prices.is_empty() {
-            continue;
-        }
-
-        println!("  Aggregating from {} exchanges: {}",
-            sources_used.len(),
-            sources_used.iter()
-                .map(|s| s.as_str())
-                .collect::<Vec<_>>()
-                .join(", ")
-        );
-
-        // Group by date + time (for 5min) or just date
-        for (p, source) in all_prices {
-            let key = if timeframe == &"5min" && !p.time.is_empty() {
-                format!("{}_{}", p.date, p.time)
-            } else {
-                p.date.clone()
-            };
-
-            // Convert to USD immediately
-            let usd_rate = super::fx::get_usd_rate(&p.currency).await.unwrap_or(1.0);
-
-            let mut p_usd = p.clone();
-            p_usd.open *= usd_rate;
-            p_usd.high *= usd_rate;
-            p_usd.low *= usd_rate;
-            p_usd.close *= usd_rate;
-            p_usd.adj_close *= usd_rate;
-            p_usd.currency = "USD".to_string();
-
-            let entry = by_date_time.entry(key.clone()).or_insert(DayData {
-                sources: vec![],
-                total_volume: 0,
-                vwap: 0.0,
-                open: p_usd.open,
-                high: p_usd.high,
-                low: p_usd.low,
-                close: p_usd.close,
-            });
-
-            let volume = p.volume.max(1); // avoid div0
-            let vwap_contrib = p_usd.close * volume as f64;
-
-            entry.sources.push((p_usd.clone(), source));
-            entry.total_volume += volume;
-            entry.vwap += vwap_contrib;
-
-            // Use first open, last close, max high, min low
-            if entry.sources.len() == 1 {
-                entry.open = p_usd.open;
-            }
-            entry.close = p_usd.close;
-            entry.high = entry.high.max(p_usd.high);
-            entry.low = entry.low.min(p_usd.low);
-        }
-
-        // Finalize aggregated data
-        let mut aggregated: Vec<CompanyPrice> = Vec::new();
-
-        for (key, data) in by_date_time {
-            let vwap = data.vwap / data.total_volume as f64;
-
-            let (date, time) = if key.contains('_') {
-                let parts: Vec<&str> = key.split('_').collect();
-                (parts[0].to_string(), parts[1].to_string())
-            } else {
-                (key, "".to_string())
-            };
-
-            // Track which exchange contributed most volume
-            let best_source = data.sources.iter()
-                .max_by_key(|(p, _)| p.volume)
-                .map(|(_, src)| src.clone())
-                .unwrap_or_else(|| "unknown".to_string());
-
-            aggregated.push(CompanyPrice {
-                ticker: format!("{lei}@agg"), // Mark as aggregated
-                date,
-                time,
-                open: data.open,
-                high: data.high,
-                low: data.low,
-                close: data.close,
-                adj_close: vwap,
-                volume: data.total_volume,
-                currency: "USD".to_string(),
-            });
-        }
-
-        aggregated.sort_by_key(|p| (p.date.clone(), p.time.clone()));
-
-        // Save aggregated result
-        let agg_dir = company_dir.join("aggregated").join(timeframe);
-        fs::create_dir_all(&agg_dir).await?;
-        let path = agg_dir.join("prices.json");
-        fs::write(&path, serde_json::to_string_pretty(&aggregated)?).await?;
-
-        // Save aggregation metadata
-        let meta = AggregationMetadata {
-            lei: lei.to_string(), // ← CHANGE THIS
-            timeframe: timeframe.to_string(),
-            sources: sources_used.into_iter().collect(),
-            total_bars: aggregated.len(),
-            date_range: (
-                aggregated.first().map(|p| p.date.clone()).unwrap_or_default(),
-                aggregated.last().map(|p| p.date.clone()).unwrap_or_default(),
-            ),
-            aggregated_at: chrono::Local::now().format("%Y-%m-%d %H:%M:%S").to_string(),
-        };
-
-        let meta_path = agg_dir.join("metadata.json");
-        fs::write(&meta_path, serde_json::to_string_pretty(&meta)?).await?;
-
-        println!("  ✓ {} {} bars from {} sources (USD)",
-            aggregated.len(),
-            timeframe,
-            source_count
-        );
-    }
-
-    Ok(())
-}
-
-#[derive(Debug, serde::Serialize, serde::Deserialize)]
-struct AggregationMetadata {
-    lei: String,
-    timeframe: String,
-    sources: Vec<String>,
-    total_bars: usize,
-    date_range: (String, String),
-    aggregated_at: String,
-}
273
src/corporate/bond_processing.rs
Normal file
273
src/corporate/bond_processing.rs
Normal file
@@ -0,0 +1,273 @@
|
|||||||
|
// src/corporate/bond_processing.rs
|
||||||
|
// Bond-specific processing logic for corporate and government bonds
|
||||||
|
|
||||||
|
use super::types::*;
|
||||||
|
|
||||||
|
/// Parse bond details from ticker and security description
|
||||||
|
///
|
||||||
|
/// Examples:
|
||||||
|
/// - "WTFC 4.3 01/12/26 0003" -> coupon: 4.3, maturity: 2026-01-12
|
||||||
|
/// - "SLOVAK 1.5225 05/10/28 4Y" -> coupon: 1.5225, maturity: 2028-05-10
|
||||||
|
/// - "SEK Float 06/30/34" -> floating rate, maturity: 2034-06-30
|
||||||
|
/// - "GGB 0 10/15/42" -> zero coupon, maturity: 2042-10-15
|
||||||
|
pub fn parse_bond_details(ticker: &str, security_description: &str) -> BondDetails {
|
||||||
|
let mut details = BondDetails {
|
||||||
|
coupon_rate: None,
|
||||||
|
maturity_date: None,
|
||||||
|
is_floating: false,
|
||||||
|
is_zero_coupon: false,
|
||||||
|
tenor_years: None,
|
||||||
|
series_identifier: None,
|
||||||
|
};
|
||||||
|
|
||||||
|
// Check for floating rate - look for "Float", " F ", "V0" patterns
|
||||||
|
if ticker.contains("Float") || ticker.contains(" F ") || ticker.contains(" V0 ")
|
||||||
|
|| security_description.contains("Float") {
|
||||||
|
details.is_floating = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Parse coupon rate if not floating
|
||||||
|
if !details.is_floating {
|
||||||
|
if let Some(coupon) = extract_coupon_rate(ticker, security_description) {
|
||||||
|
details.coupon_rate = Some(coupon);
|
||||||
|
details.is_zero_coupon = coupon == 0.0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Parse maturity date
|
||||||
|
if let Some(maturity) = extract_maturity_date(ticker, security_description) {
|
||||||
|
details.maturity_date = Some(maturity.clone());
|
||||||
|
|
||||||
|
// Calculate tenor (simplified - just extract year)
|
||||||
|
if let Some(year_str) = maturity.split('-').next() {
|
||||||
|
if let Ok(mat_year) = year_str.parse::<i32>() {
|
||||||
|
let current_year = 2026; // From system prompt
|
||||||
|
let years_to_maturity = (mat_year - current_year).max(0) as u32;
|
||||||
|
details.tenor_years = Some(years_to_maturity);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Extract series identifier
|
||||||
|
details.series_identifier = extract_series_identifier(ticker);
|
||||||
|
|
||||||
|
details
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Extract coupon rate from ticker/description
|
||||||
|
/// Handles: "4.3", "1.5225", "12 1/2" (fractional), "0"
|
||||||
|
fn extract_coupon_rate(ticker: &str, description: &str) -> Option<f64> {
|
||||||
|
let text = format!("{} {}", ticker, description);
|
||||||
|
|
||||||
|
// Pattern 1: Fractional rates like "12 1/2" -> 12.5
|
||||||
|
if let Some(frac_result) = parse_fractional_coupon(&text) {
|
||||||
|
return Some(frac_result);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Pattern 2: Decimal rates like "4.3" or "1.5225"
|
||||||
|
// Look for number followed by space and date pattern
|
||||||
|
let parts: Vec<&str> = text.split_whitespace().collect();
|
||||||
|
for i in 0..parts.len() {
|
||||||
|
if let Ok(rate) = parts[i].parse::<f64>() {
|
||||||
|
// Sanity check: coupon rates are typically 0-20%
|
||||||
|
if rate >= 0.0 && rate <= 20.0 {
|
||||||
|
// Make sure it's before a date-like pattern
|
||||||
|
if i + 1 < parts.len() {
|
||||||
|
let next = parts[i + 1];
|
||||||
|
if next.contains('/') || next.len() >= 8 {
|
||||||
|
return Some(rate);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
None
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Parse fractional coupon like "12 1/2" -> 12.5
|
||||||
|
fn parse_fractional_coupon(text: &str) -> Option<f64> {
|
||||||
|
let parts: Vec<&str> = text.split_whitespace().collect();
|
||||||
|
|
||||||
|
for i in 0..parts.len().saturating_sub(1) {
|
||||||
|
// Check if current part is a number
|
||||||
|
if let Ok(whole) = parts[i].parse::<f64>() {
|
||||||
|
// Check if next part is a fraction like "1/2"
|
||||||
|
if let Some(slash_pos) = parts[i + 1].find('/') {
|
||||||
|
let frac_str = parts[i + 1];
|
||||||
|
let num_str = &frac_str[..slash_pos];
|
||||||
|
let den_str = &frac_str[slash_pos + 1..];
|
||||||
|
|
||||||
|
if let (Ok(num), Ok(den)) = (num_str.parse::<f64>(), den_str.parse::<f64>()) {
|
||||||
|
if den != 0.0 {
|
||||||
|
return Some(whole + num / den);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
None
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Extract maturity date from ticker/description
|
||||||
|
/// Handles: "01/12/26", "05/10/28", "06/30/2034"
|
||||||
|
fn extract_maturity_date(ticker: &str, description: &str) -> Option<String> {
|
||||||
|
let text = format!("{} {}", ticker, description);
|
||||||
|
|
||||||
|
// Look for MM/DD/YY or MM/DD/YYYY patterns
|
||||||
|
let parts: Vec<&str> = text.split_whitespace().collect();
|
||||||
|
|
||||||
|
for part in parts {
|
||||||
|
if let Some(date) = parse_date_pattern(part) {
|
||||||
|
return Some(date);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
None
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Parse various date formats to YYYY-MM-DD
|
||||||
|
fn parse_date_pattern(s: &str) -> Option<String> {
|
||||||
|
let slash_count = s.matches('/').count();
|
||||||
|
|
||||||
|
if slash_count != 2 {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
|
||||||
|
let parts: Vec<&str> = s.split('/').collect();
|
||||||
|
if parts.len() != 3 {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
|
||||||
|
let month = parts[0];
|
||||||
|
let day = parts[1];
|
||||||
|
let year_part = parts[2];
|
||||||
|
|
||||||
|
// Parse year - could be 2 or 4 digits
|
||||||
|
let year = if year_part.len() == 2 {
|
||||||
|
if let Ok(yy) = year_part.parse::<u32>() {
|
||||||
|
// Assume 20xx for values <= 50, 19xx for > 50
|
||||||
|
            if yy <= 50 {
                format!("{}", 2000 + yy)
            } else {
                format!("{}", 1900 + yy)
            }
        } else {
            return None;
        }
    } else if year_part.len() == 4 {
        year_part.to_string()
    } else {
        return None;
    };

    // Validate month and day
    if let (Ok(m), Ok(d)) = (month.parse::<u32>(), day.parse::<u32>()) {
        if m >= 1 && m <= 12 && d >= 1 && d <= 31 {
            return Some(format!("{}-{:02}-{:02}", year, m, d));
        }
    }

    None
}

/// Extract series identifier (tokens after the date)
/// Examples: "0003", "4Y", "144A", "REGS", "MTN", "PSI", "CD"
fn extract_series_identifier(ticker: &str) -> Option<String> {
    let parts: Vec<&str> = ticker.split_whitespace().collect();

    // Look for date pattern, then take what comes after
    for i in 0..parts.len() {
        if parts[i].contains('/') && parts[i].matches('/').count() == 2 {
            // Found date, check if there's something after
            if i + 1 < parts.len() {
                return Some(parts[i + 1].to_string());
            }
        }
    }

    None
}

/// Classify government issuer type
pub fn classify_government_issuer(name: &str) -> String {
    let name_lower = name.to_lowercase();

    // Sovereign nations
    if name_lower.contains("republic")
        || name_lower.contains("kingdom")
        || name_lower.contains("federal republic")
        || name_lower.ends_with(" govt")
        || name_lower.ends_with(" government")
        || name_lower.contains("hellenic") // Greece
        || name_lower.contains("slovak") {
        return "sovereign".to_string();
    }

    // Municipalities (Norwegian communes, cities, etc.)
    if name_lower.contains("kommune")
        || name_lower.contains("municipality")
        || name_lower.contains("city of")
        || name_lower.contains("town of")
        || name_lower.contains("county council") {
        return "municipal".to_string();
    }

    // States/Provinces/Regions
    if name_lower.contains("state of")
        || name_lower.contains("province")
        || name_lower.contains("region")
        || name_lower.contains("county") {
        return "state".to_string();
    }

    // Government agencies/entities
    if name_lower.contains("export credit")
        || name_lower.contains("development bank")
        || name_lower.contains("housing")
        || name_lower.contains("akademiska")
        || name_lower.contains("byggdastofnun") {
        return "agency".to_string();
    }

    "other".to_string()
}

/// Classify government bond type based on security_type
///
/// Maps OpenFIGI security types to simplified bond categories for government bonds
///
/// # Examples
/// - "DOMESTIC" -> "domestic"
/// - "GLOBAL" -> "global"
/// - "EURO NON-DOLLAR" -> "euro"
/// - "DOMESTIC MTN" -> "mtn"
pub fn classify_government_bond_type(security_type: &str) -> String {
    let security_type_upper = security_type.to_uppercase();

    if security_type_upper.contains("GLOBAL") {
        return "global".to_string();
    }

    if security_type_upper.contains("EURO") {
        if security_type_upper.contains("NON-DOLLAR") || !security_type_upper.contains("DOLLAR") {
            return "euro".to_string();
        }
        return "eurodollar".to_string();
    }

    if security_type_upper.contains("YANKEE") {
        return "yankee".to_string();
    }

    if security_type_upper.contains("MTN") {
        return "mtn".to_string();
    }

    if security_type_upper.contains("DOMESTIC") {
        return "domestic".to_string();
    }

    "other".to_string()
}
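To make the keyword rules above concrete, here is a minimal usage sketch; the issuer names and security types are invented for illustration and are not taken from the data or the commit.

// Hypothetical usage sketch for the two classifiers above (not part of the commit).
fn classifier_demo() {
    assert_eq!(classify_government_issuer("Kingdom of Belgium"), "sovereign");
    assert_eq!(classify_government_issuer("Oslo Kommune"), "municipal");
    assert_eq!(classify_government_bond_type("EURO NON-DOLLAR"), "euro");
    assert_eq!(classify_government_bond_type("DOMESTIC MTN"), "mtn");
}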
215 src/corporate/checkpoint_helpers.rs Normal file
@@ -0,0 +1,215 @@
// src/corporate/checkpoint_helpers.rs
//! Shared helpers for checkpoint-based recovery and logging
//!
//! This module extracts common patterns used across multiple update modules
//! to reduce code duplication and improve maintainability.

use super::types::CompanyData;
use crate::util::logger;
use std::collections::HashMap;
use std::path::{Path};
use tokio::fs::{File};
use tokio::io::{AsyncWriteExt};
use anyhow::Result;

/// Load companies from checkpoint and replay log for recovery
///
/// This function implements the checkpoint + write-ahead log pattern:
/// 1. Loads the main checkpoint file
/// 2. Replays any pending updates from the log file
/// 3. Returns the merged state
pub async fn load_checkpoint_with_log<P1, P2>(
    checkpoint_path: P1,
    log_path: P2,
    checkpoint_desc: &str,
) -> Result<HashMap<String, CompanyData>>
where
    P1: AsRef<Path>,
    P2: AsRef<Path>,
{
    let checkpoint_path = checkpoint_path.as_ref();
    let log_path = log_path.as_ref();

    let mut companies: HashMap<String, CompanyData> = HashMap::new();

    // Load checkpoint if it exists
    if checkpoint_path.exists() {
        logger::log_info(&format!("Loading checkpoint from {}...", checkpoint_desc)).await;
        let content = tokio::fs::read_to_string(checkpoint_path).await?;

        for line in content.lines() {
            if line.trim().is_empty() || !line.ends_with('}') {
                continue; // Skip incomplete lines
            }

            match serde_json::from_str::<CompanyData>(line) {
                Ok(company) => {
                    companies.insert(company.name.clone(), company);
                }
                Err(e) => {
                    logger::log_warn(&format!("Skipping invalid checkpoint line: {}", e)).await;
                }
            }
        }
        logger::log_info(&format!("Loaded checkpoint with {} companies", companies.len())).await;
    }

    // Replay log if it exists
    if log_path.exists() {
        logger::log_info("Replaying update log...").await;
        let log_content = tokio::fs::read_to_string(log_path).await?;
        let mut replayed = 0;

        for line in log_content.lines() {
            if line.trim().is_empty() || !line.ends_with('}') {
                continue; // Skip incomplete lines
            }

            match serde_json::from_str::<CompanyData>(line) {
                Ok(company) => {
                    companies.insert(company.name.clone(), company);
                    replayed += 1;
                }
                Err(e) => {
                    logger::log_warn(&format!("Skipping invalid log line: {}", e)).await;
                }
            }
        }

        if replayed > 0 {
            logger::log_info(&format!("Replayed {} updates from log", replayed)).await;
        }
    }

    Ok(companies)
}

/// Consolidate log into checkpoint and clear log
///
/// Atomically writes all companies to a new checkpoint file and removes the log.
/// Uses atomic rename to ensure crash safety.
pub async fn consolidate_checkpoint<P1, P2>(
    checkpoint_path: P1,
    log_path: P2,
    companies: &HashMap<String, CompanyData>,
) -> Result<()>
where
    P1: AsRef<Path>,
    P2: AsRef<Path>,
{
    let checkpoint_path = checkpoint_path.as_ref();
    let log_path = log_path.as_ref();

    logger::log_info("Consolidating update log into checkpoint...").await;

    let temp_checkpoint = checkpoint_path.with_extension("tmp");
    let mut temp_file = File::create(&temp_checkpoint).await?;

    for company in companies.values() {
        let json_line = serde_json::to_string(company)?;
        temp_file.write_all(json_line.as_bytes()).await?;
        temp_file.write_all(b"\n").await?;
    }

    temp_file.flush().await?;
    temp_file.sync_data().await?;
    drop(temp_file);

    tokio::fs::rename(&temp_checkpoint, checkpoint_path).await?;

    // Remove log after successful consolidation
    if log_path.exists() {
        tokio::fs::remove_file(log_path).await.ok();
    }

    logger::log_info(&format!("✓ Consolidated {} companies", companies.len())).await;

    Ok(())
}

/// Check if log file has content
pub async fn log_has_content<P: AsRef<Path>>(log_path: P) -> bool {
    if let Ok(metadata) = tokio::fs::metadata(log_path.as_ref()).await {
        metadata.len() > 0
    } else {
        false
    }
}

/// Load enrichment progress from log file
///
/// Used by enrichment functions to track which companies have already been processed.
/// Parses log entries with format: {"company_name": "...", "status": "enriched", ...}
pub async fn load_enrichment_progress<P>(
    log_path: P,
) -> Result<std::collections::HashSet<String>>
where
    P: AsRef<Path>,
{
    let mut enriched_companies = std::collections::HashSet::new();

    if !log_path.as_ref().exists() {
        return Ok(enriched_companies);
    }

    logger::log_info("Loading enrichment progress from log...").await;
    let log_content = tokio::fs::read_to_string(log_path.as_ref()).await?;

    for line in log_content.lines() {
        if line.trim().is_empty() || !line.ends_with('}') {
            continue; // Skip incomplete lines
        }

        match serde_json::from_str::<serde_json::Value>(line) {
            Ok(entry) => {
                if let Some(name) = entry.get("company_name").and_then(|v| v.as_str()) {
                    if entry.get("status").and_then(|v| v.as_str()) == Some("enriched") {
                        enriched_companies.insert(name.to_string());
                    }
                }
            }
            Err(e) => {
                logger::log_warn(&format!("Skipping invalid log line: {}", e)).await;
            }
        }
    }

    logger::log_info(&format!(
        "Loaded {} enriched companies from log",
        enriched_companies.len()
    )).await;

    Ok(enriched_companies)
}

/// Count enriched companies by checking for data files
///
/// Walks through the corporate directory and counts companies that have
/// a data file in the specified subdirectory (e.g., "events", "options", "chart").
pub async fn count_enriched_companies(
    paths: &crate::util::directories::DataPaths,
    data_type: &str,
) -> Result<usize> {
    let corporate_dir = paths.corporate_dir();

    if !corporate_dir.exists() {
        return Ok(0);
    }

    let mut count = 0;
    let mut entries = tokio::fs::read_dir(&corporate_dir).await?;

    while let Some(entry) = entries.next_entry().await? {
        let path = entry.path();
        if path.is_dir() {
            let data_dir = path.join(data_type);
            let data_file = data_dir.join("data.jsonl");

            if data_file.exists() {
                count += 1;
            }
        }
    }

    Ok(count)
}
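A minimal sketch of how the checkpoint and write-ahead-log helpers above are meant to be driven together; the two file names are hypothetical placeholders, not paths used by the project.

// Hypothetical driver for the checkpoint/WAL helpers above (paths are placeholders).
async fn recover_and_compact() -> anyhow::Result<()> {
    let checkpoint = "data/companies.checkpoint.jsonl";
    let wal = "data/companies.log.jsonl";

    // Rebuild in-memory state: checkpoint first, then replay any pending log entries.
    let companies = load_checkpoint_with_log(checkpoint, wal, "companies").await?;

    // Fold the log into a fresh checkpoint (temp file + atomic rename) and drop the log.
    if log_has_content(wal).await {
        consolidate_checkpoint(checkpoint, wal, &companies).await?;
    }
    Ok(())
}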
720 src/corporate/collect_exchanges.rs Normal file
@@ -0,0 +1,720 @@
// src/corporate/collect_exchanges.rs
use crate::util::directories::DataPaths;
use crate::util::integrity::{DataStage, StateEntry, StateManager, file_reference};
use crate::util::logger;
use crate::corporate::types::*;

use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use tokio::fs;
use tokio::io::AsyncWriteExt;

/// Exchange information collected from company data
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ExchangeInfo {
    #[serde(rename = "exchangeName")]
    pub exchange_name: String,
    pub currency: String,
    #[serde(rename = "currencySymbol")]
    pub currency_symbol: String,
    #[serde(rename = "exchangeDataDelayedBy")]
    pub exchange_data_delayed_by: i64,
    #[serde(rename = "totalMarketCap")]
    pub total_market_cap: u64,
    #[serde(rename = "totalMarketCapUSD")]
    pub total_market_cap_usd: f64, // NEW: Market cap converted to USD
    pub companies: Vec<String>,
}

/// Extract exchange data from company core data
#[derive(Debug, Deserialize)]
struct CompanyCoreData {
    modules: Option<CoreModules>,
}

#[derive(Debug, Deserialize)]
struct CoreModules {
    price: Option<PriceModule>,
}

#[derive(Debug, Deserialize)]
struct PriceModule {
    #[serde(rename = "exchangeName")]
    exchange_name: Option<String>,
    currency: Option<String>,
    #[serde(rename = "currencySymbol")]
    currency_symbol: Option<String>,
    exchange: Option<String>,
    #[serde(rename = "exchangeDataDelayedBy")]
    exchange_data_delayed_by: Option<i64>,
    #[serde(rename = "marketCap")]
    market_cap: Option<MarketCapData>,
}

#[derive(Debug, Deserialize)]
struct MarketCapData {
    raw: Option<u64>,
}

/// Normalize currency code and get conversion factor
/// Handles special cases like GBp (pence) and ZAc (cents)
fn normalize_currency(currency: &str) -> (&str, f64) {
    match currency {
        "GBp" => ("GBP", 100.0), // British Pence -> Pounds (divide by 100)
        "ZAc" => ("ZAR", 100.0), // South African Cents -> Rand (divide by 100)
        _ => (currency, 1.0),    // No conversion needed
    }
}

/// FX rate cache for currency conversion
struct FxRateCache {
    rates: HashMap<String, f64>,
}

impl FxRateCache {
    /// Create new FX rate cache by loading all currency charts
    async fn new(paths: &DataPaths) -> anyhow::Result<Self> {
        let mut rates = HashMap::new();

        // USD to USD is always 1.0
        rates.insert("USD".to_string(), 1.0);

        let currency_dir = paths.data_dir().join("economic").join("currency");

        if !currency_dir.exists() {
            logger::log_warn(" FX rates directory not found - will use default rates").await;
            return Ok(Self { rates });
        }

        let mut entries = fs::read_dir(&currency_dir).await?;
        let mut loaded_count = 0;

        while let Some(entry) = entries.next_entry().await? {
            let path = entry.path();
            if !path.is_dir() {
                continue;
            }

            let currency_code = match path.file_name().and_then(|n| n.to_str()) {
                Some(code) => code.to_string(),
                None => continue,
            };

            let chart_path = path.join("chart").join("data.jsonl");

            if !chart_path.exists() {
                continue;
            }

            // Load chart and get latest rate
            match load_latest_fx_rate(&chart_path).await {
                Ok(rate) => {
                    rates.insert(currency_code.clone(), rate);
                    loaded_count += 1;
                }
                Err(e) => {
                    logger::log_warn(&format!(
                        " Failed to load FX rate for {}: {}",
                        currency_code, e
                    )).await;
                }
            }
        }

        logger::log_info(&format!(" ✓ Loaded {} FX rates", loaded_count)).await;

        Ok(Self { rates })
    }

    /// Convert amount from given currency to USD
    fn to_usd(&self, amount: u64, currency: &str) -> f64 {
        // Normalize currency and get conversion factor
        // e.g., GBp -> (GBP, 100.0), ZAc -> (ZAR, 100.0)
        let (normalized_currency, factor) = normalize_currency(currency);

        // First convert to base currency unit (e.g., pence to pounds)
        let amount_in_base = amount as f64 / factor;

        if normalized_currency == "USD" {
            return amount_in_base;
        }

        // Get rate (currency units per USD)
        // For USD/EUR = 0.92, this means 1 USD = 0.92 EUR
        // To convert EUR to USD: EUR_amount / 0.92
        match self.rates.get(normalized_currency) {
            Some(&rate) if rate > 0.0 => {
                amount_in_base / rate
            }
            _ => {
                // Fallback: use approximate rates for common currencies
                let fallback_rate = get_fallback_rate(normalized_currency);
                amount_in_base / fallback_rate
            }
        }
    }

    /// Get rate for a currency (currency units per USD)
    fn get_rate(&self, currency: &str) -> Option<f64> {
        let (normalized_currency, _) = normalize_currency(currency);
        self.rates.get(normalized_currency).copied()
    }
}
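The pence/cents handling above is easiest to see with numbers. In the sketch below, the 0.79 GBP-per-USD rate is an assumed value chosen for illustration, not one read from the chart data.

// Illustrative arithmetic only; 0.79 GBP per USD is an assumed rate.
// A market cap reported as 1,000,000 GBp (pence):
//   normalize_currency("GBp") -> ("GBP", 100.0)
//   1_000_000 / 100.0 = 10_000 GBP
//   10_000 / 0.79     ≈ 12_658.23 USD
fn pence_to_usd_demo() {
    let (ccy, factor) = normalize_currency("GBp");
    assert_eq!(ccy, "GBP");
    let gbp = 1_000_000u64 as f64 / factor;
    let usd = gbp / 0.79;
    assert!((usd - 12_658.23).abs() < 0.01);
}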
/// Load latest FX rate from chart data
async fn load_latest_fx_rate(chart_path: &std::path::Path) -> anyhow::Result<f64> {
    let content = fs::read_to_string(chart_path).await?;

    for line in content.lines() {
        if line.trim().is_empty() {
            continue;
        }

        let chart: ChartData = serde_json::from_str(line)?;

        if chart.quotes.is_empty() {
            return Err(anyhow::anyhow!("No quotes in chart data"));
        }

        // Get most recent quote with a close price
        let latest_rate = chart.quotes
            .iter()
            .rev()
            .find_map(|q| q.close)
            .ok_or_else(|| anyhow::anyhow!("No valid close prices"))?;

        return Ok(latest_rate);
    }

    Err(anyhow::anyhow!("No data in chart file"))
}

/// Fallback rates for common currencies (approximate, as of 2024)
/// These are currency units per USD (same format as our FX data)
fn get_fallback_rate(currency: &str) -> f64 {
    match currency {
        "USD" => 1.0,
        "EUR" => 0.92,  // 1 USD = 0.92 EUR
        "GBP" => 0.79,  // 1 USD = 0.79 GBP
        "JPY" => 150.0, // 1 USD = 150 JPY
        "CNY" | "RMB" => 7.2,
        "CHF" => 0.88,
        "AUD" => 1.52,
        "CAD" => 1.36,
        "HKD" => 7.8,
        "SGD" => 1.34,
        "SEK" => 10.5,
        "NOK" => 10.8,
        "DKK" => 6.9,
        "PLN" => 4.0,
        "CZK" => 23.0,
        "TRY" => 32.0,
        "ZAR" => 18.5,
        "ILS" => 3.7,
        "RON" => 4.6,
        "KWD" => 0.31,
        "TWD" => 31.5,
        "ISK" => 138.0,
        "NZD" => 1.65,
        "MXN" => 17.0,
        "BRL" => 5.0,
        "INR" => 83.0,
        "KRW" => 1320.0,
        "THB" => 35.0,
        "MYR" => 4.6,
        "IDR" => 15700.0,
        "PHP" => 56.0,
        "VND" => 24500.0,
        _ => {
            // Default: assume similar to USD
            1.0
        }
    }
}
/// Collect all exchanges from company directories and create yahoo_exchanges.json
///
/// # Features
/// - Iterates through all company directories
/// - Extracts exchange data from core/data.jsonl
/// - Groups companies by exchange
/// - Sums up market caps for each exchange
/// - Converts all market caps to USD using FX rates
/// - Saves consolidated mapping to data/yahoo_exchanges.json
/// - Handles missing or invalid data gracefully
/// - Integrity tracking with content hash validation
pub async fn collect_and_save_exchanges(paths: &DataPaths) -> anyhow::Result<usize> {
    let output_path = paths.data_dir().join("yahoo_exchanges.json");

    let manager = StateManager::new(paths.integrity_dir()).await?;
    let step_name = "exchange_collection_complete";

    if manager.is_step_valid(step_name).await? {
        logger::log_info(" Exchange collection already completed and valid").await;

        // Load and count exchanges
        if output_path.exists() {
            let content = fs::read_to_string(&output_path).await?;
            let exchanges: HashMap<String, ExchangeInfo> = serde_json::from_str(&content)?;
            logger::log_info(&format!(" ✓ Found {} valid exchanges", exchanges.len())).await;
            return Ok(exchanges.len());
        }
    }
    let entry = create_exchange_collection_state_entry(&manager, &output_path, step_name).await?;
    logger::log_info("Collecting exchange information from company directories...").await;

    let corporate_dir = paths.corporate_dir();

    if !corporate_dir.exists() {
        logger::log_warn(" Corporate directory does not exist").await;
        return Ok(0);
    }

    // Load FX rates for currency conversion
    logger::log_info("Loading FX rates for currency conversion...").await;
    let fx_cache = FxRateCache::new(paths).await?;

    // Map of exchange code -> ExchangeInfo
    let mut exchanges: HashMap<String, ExchangeInfo> = HashMap::new();

    let mut entries = fs::read_dir(&corporate_dir).await?;
    let mut processed_count = 0;
    let mut skipped_count = 0;

    while let Some(entry) = entries.next_entry().await? {
        let company_path = entry.path();

        if !company_path.is_dir() {
            continue;
        }

        let company_name = match company_path.file_name().and_then(|n| n.to_str()) {
            Some(name) => name.to_string(),
            None => {
                skipped_count += 1;
                continue;
            }
        };

        // Read core/data.jsonl
        let core_data_path = company_path.join("core").join("data.jsonl");

        if !core_data_path.exists() {
            skipped_count += 1;
            continue;
        }

        // Parse core data
        match extract_exchange_info(&core_data_path, &company_name).await {
            Ok(Some((exchange_code, exchange_name, currency, currency_symbol, delay, market_cap))) => {
                // Convert market cap to USD
                let market_cap_usd = fx_cache.to_usd(market_cap, &currency);

                // Add or update exchange entry
                exchanges
                    .entry(exchange_code.clone())
                    .and_modify(|info| {
                        // Add company to existing exchange and sum market caps
                        info.companies.push(company_name.clone());
                        info.total_market_cap = info.total_market_cap.saturating_add(market_cap);
                        info.total_market_cap_usd += market_cap_usd;
                    })
                    .or_insert_with(|| {
                        // Create new exchange entry
                        ExchangeInfo {
                            exchange_name,
                            currency,
                            currency_symbol,
                            exchange_data_delayed_by: delay,
                            total_market_cap: market_cap,
                            total_market_cap_usd: market_cap_usd,
                            companies: vec![company_name.clone()],
                        }
                    });

                processed_count += 1;
            }
            Ok(None) => {
                // No exchange data found
                skipped_count += 1;
            }
            Err(e) => {
                logger::log_warn(&format!(
                    " Failed to parse exchange data for {}: {}",
                    company_name, e
                )).await;
                skipped_count += 1;
            }
        }

        // Progress logging every 100 companies
        if (processed_count + skipped_count) % 100 == 0 {
            logger::log_info(&format!(
                " Progress: {} companies processed, {} skipped",
                processed_count, skipped_count
            )).await;
        }
    }

    logger::log_info(&format!(
        " ✓ Collected data from {} companies ({} skipped)",
        processed_count, skipped_count
    )).await;

    logger::log_info(&format!(
        " ✓ Found {} unique exchanges",
        exchanges.len()
    )).await;

    // Sort companies within each exchange for consistency
    for exchange_info in exchanges.values_mut() {
        exchange_info.companies.sort();
    }

    // Save to yahoo_exchanges.json
    save_exchanges_json(&output_path, &exchanges).await?;

    logger::log_info(&format!(
        " ✓ Saved exchange mapping to {}",
        output_path.display()
    )).await;

    manager.mark_valid(entry).await?;
    logger::log_info(" ✓ Exchange collection marked as complete with integrity tracking").await;

    // Print summary statistics
    print_exchange_statistics(&exchanges, &fx_cache).await;

    Ok(exchanges.len())
}

/// Track exchange collection completion with content hash verification
async fn create_exchange_collection_state_entry(
    manager: &StateManager,
    output_path: &std::path::Path,
    step_name: &str,
) -> anyhow::Result<StateEntry> {
    // Create content reference for the output file
    let content_reference = file_reference(output_path);

    // Track completion with:
    // - Content reference: The yahoo_exchanges.json file
    // - Data stage: Data (7-day TTL by default)
    // - Dependencies: None (this is a collection step, not dependent on other tracked steps)
    // Note: In practice, it depends on core data, but we track the output file
    // which will change if core data changes, so explicit dependency not needed
    Ok(manager.create_entry(
        step_name.to_string(),
        content_reference,
        DataStage::Data,
    ).await?)
}

/// Extract exchange information from a company's core data file
async fn extract_exchange_info(
    core_data_path: &std::path::Path,
    company_name: &str,
) -> anyhow::Result<Option<(String, String, String, String, i64, u64)>> {
    let content = fs::read_to_string(core_data_path).await?;

    // Parse JSONL - should be single line
    for line in content.lines() {
        if line.trim().is_empty() {
            continue;
        }

        match serde_json::from_str::<CompanyCoreData>(line) {
            Ok(data) => {
                // Extract from modules.price
                let price_module = match data.modules.and_then(|m| m.price) {
                    Some(p) => p,
                    None => return Ok(None),
                };

                // Extract required fields
                let exchange = match price_module.exchange {
                    Some(e) if !e.is_empty() => e,
                    _ => return Ok(None),
                };

                // Filter out invalid placeholder exchange codes
                if exchange == "CCC" {
                    return Ok(None);
                }

                let exchange_name = price_module.exchange_name.unwrap_or_else(|| exchange.clone());
                let currency = price_module.currency.unwrap_or_else(|| "USD".to_string());
                let currency_symbol = price_module.currency_symbol.unwrap_or_else(|| "$".to_string());
                let delay = price_module.exchange_data_delayed_by.unwrap_or(0);
                let market_cap = price_module
                    .market_cap
                    .and_then(|mc| mc.raw)
                    .unwrap_or(0);

                return Ok(Some((
                    exchange,
                    exchange_name,
                    currency,
                    currency_symbol,
                    delay,
                    market_cap,
                )));
            }
            Err(e) => {
                // Try to parse as generic JSON to check if exchange field exists in modules.price
                if let Ok(json) = serde_json::from_str::<serde_json::Value>(line) {
                    // Try to access modules.price.exchange
                    if let Some(price) = json.get("modules").and_then(|m| m.get("price")) {
                        if let Some(exchange) = price.get("exchange").and_then(|v| v.as_str()) {
                            if !exchange.is_empty() && exchange != "CCC" {
                                let exchange_name = price
                                    .get("exchangeName")
                                    .and_then(|v| v.as_str())
                                    .unwrap_or(exchange)
                                    .to_string();

                                let currency = price
                                    .get("currency")
                                    .and_then(|v| v.as_str())
                                    .unwrap_or("USD")
                                    .to_string();

                                let currency_symbol = price
                                    .get("currencySymbol")
                                    .and_then(|v| v.as_str())
                                    .unwrap_or("$")
                                    .to_string();

                                let delay = price
                                    .get("exchangeDataDelayedBy")
                                    .and_then(|v| v.as_i64())
                                    .unwrap_or(0);

                                let market_cap = price
                                    .get("marketCap")
                                    .and_then(|mc| mc.get("raw"))
                                    .and_then(|v| v.as_u64())
                                    .unwrap_or(0);

                                return Ok(Some((
                                    exchange.to_string(),
                                    exchange_name,
                                    currency,
                                    currency_symbol,
                                    delay,
                                    market_cap,
                                )));
                            }
                        }
                    }
                }

                return Err(anyhow::anyhow!(
                    "Failed to parse core data for {}: {}",
                    company_name,
                    e
                ));
            }
        }
    }

    Ok(None)
}

/// Save exchanges map to JSON file with fsync
async fn save_exchanges_json(
    path: &std::path::Path,
    exchanges: &HashMap<String, ExchangeInfo>,
) -> anyhow::Result<()> {
    // Create sorted output for consistency
    let mut sorted_exchanges: Vec<_> = exchanges.iter().collect();
    sorted_exchanges.sort_by_key(|(code, _)| code.as_str());

    let exchanges_map: HashMap<String, ExchangeInfo> = sorted_exchanges
        .into_iter()
        .map(|(k, v)| (k.clone(), v.clone()))
        .collect();

    // Serialize with pretty printing
    let json_content = serde_json::to_string_pretty(&exchanges_map)?;

    // Write to temporary file first (atomic write pattern)
    let tmp_path = path.with_extension("json.tmp");
    let mut file = fs::File::create(&tmp_path).await?;
    file.write_all(json_content.as_bytes()).await?;
    file.write_all(b"\n").await?;
    file.flush().await?;
    file.sync_all().await?;

    // Atomic rename
    fs::rename(&tmp_path, path).await?;

    Ok(())
}

/// Format market cap as a human-readable string
fn format_market_cap(market_cap: f64) -> String {
    if market_cap >= 1_000_000_000_000.0 {
        format!("{:.2}T", market_cap / 1_000_000_000_000.0)
    } else if market_cap >= 1_000_000_000.0 {
        format!("{:.2}B", market_cap / 1_000_000_000.0)
    } else if market_cap >= 1_000_000.0 {
        format!("{:.2}M", market_cap / 1_000_000.0)
    } else if market_cap >= 1_000.0 {
        format!("{:.2}K", market_cap / 1_000.0)
    } else {
        format!("{:.2}", market_cap)
    }
}
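For reference, a few sample outputs of the formatter above; the input values are arbitrary and only illustrate the thresholds.

// Example outputs of format_market_cap (arbitrary sample values).
fn format_market_cap_demo() {
    assert_eq!(format_market_cap(2_500_000_000_000.0), "2.50T");
    assert_eq!(format_market_cap(75_300_000_000.0), "75.30B");
    assert_eq!(format_market_cap(910_000.0), "910.00K");
}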
/// Print statistics about collected exchanges
async fn print_exchange_statistics(exchanges: &HashMap<String, ExchangeInfo>, fx_cache: &FxRateCache) {
    logger::log_info("Exchange Statistics (sorted by USD market cap):").await;

    // Sort by total market cap in USD (descending)
    let mut exchange_list: Vec<_> = exchanges.iter().collect();
    exchange_list.sort_by(|a, b| {
        b.1.total_market_cap_usd
            .partial_cmp(&a.1.total_market_cap_usd)
            .unwrap_or(std::cmp::Ordering::Equal)
    });

    // Print top 20 exchanges by total market cap (USD)
    logger::log_info(" Top 20 exchanges by total market cap (USD):").await;
    for (i, (code, info)) in exchange_list.iter().take(20).enumerate() {
        let (normalized_currency, factor) = normalize_currency(&info.currency);
        let fx_rate = fx_cache.get_rate(&info.currency);

        let fx_info = match fx_rate {
            Some(rate) => {
                if factor > 1.0 {
                    // Show conversion for pence/cents
                    format!(" (1 {} = {} {}, {} {} = 1 {})",
                        normalized_currency,
                        format!("{:.4}", rate),
                        "USD",
                        factor as i32,
                        info.currency,
                        normalized_currency)
                } else {
                    format!(" (1 USD = {:.4} {})", rate, info.currency)
                }
            }
            None => format!(" (using fallback rate for {})", info.currency),
        };

        logger::log_info(&format!(
            " {}. {} ({}) - ${} USD ({}{} {}) - {} companies{}",
            i + 1,
            info.exchange_name,
            code,
            format_market_cap(info.total_market_cap_usd),
            info.currency_symbol,
            format_market_cap(info.total_market_cap as f64),
            info.currency,
            info.companies.len(),
            if info.currency != "USD" { &fx_info } else { "" }
        )).await;
    }

    // Count by currency
    let mut currency_counts: HashMap<String, usize> = HashMap::new();
    let mut currency_market_caps: HashMap<String, f64> = HashMap::new();
    for info in exchanges.values() {
        *currency_counts.entry(info.currency.clone()).or_insert(0) += info.companies.len();
        *currency_market_caps.entry(info.currency.clone()).or_insert(0.0) += info.total_market_cap_usd;
    }

    let mut currencies: Vec<_> = currency_counts.iter().collect();
    currencies.sort_by(|a, b| {
        currency_market_caps.get(b.0)
            .unwrap_or(&0.0)
            .partial_cmp(currency_market_caps.get(a.0).unwrap_or(&0.0))
            .unwrap_or(std::cmp::Ordering::Equal)
    });

    logger::log_info(" Market cap by currency (USD equivalent):").await;
    for (currency, count) in currencies.iter().take(10) {
        let market_cap_usd = currency_market_caps.get(*currency).unwrap_or(&0.0);
        let (normalized_currency, factor) = normalize_currency(currency);
        let fx_rate = fx_cache.get_rate(currency);

        let fx_info = match fx_rate {
            Some(rate) => {
                if factor > 1.0 {
                    format!(" (1 {} = {:.4} USD, {} {} = 1 {})",
                        normalized_currency, rate, factor as i32, currency, normalized_currency)
                } else {
                    format!(" (1 USD = {:.4} {})", rate, currency)
                }
            }
            None => format!(" (fallback)"),
        };

        logger::log_info(&format!(
            " {}: {} companies, ${} USD{}",
            currency,
            count,
            format_market_cap(*market_cap_usd),
            if *currency != "USD" { &fx_info } else { "" }
        )).await;
    }

    // Delay statistics
    let delayed_exchanges: Vec<_> = exchanges
        .iter()
        .filter(|(_, info)| info.exchange_data_delayed_by > 0)
        .collect();

    if !delayed_exchanges.is_empty() {
        logger::log_info(&format!(
            " Exchanges with data delay: {} (out of {})",
            delayed_exchanges.len(),
            exchanges.len()
        )).await;
    }

    // Total market cap across all exchanges (in USD)
    let total_market_cap_usd: f64 = exchanges.values()
        .map(|info| info.total_market_cap_usd)
        .sum();

    logger::log_info(&format!(
        " Total market cap across all exchanges: ${} USD",
        format_market_cap(total_market_cap_usd)
    )).await;
}

/// Get exchange information for a specific exchange code
pub async fn get_exchange_info(
    paths: &DataPaths,
    exchange_code: &str,
) -> anyhow::Result<Option<ExchangeInfo>> {
    let exchanges_path = paths.data_dir().join("yahoo_exchanges.json");

    if !exchanges_path.exists() {
        return Ok(None);
    }

    let content = fs::read_to_string(&exchanges_path).await?;
    let exchanges: HashMap<String, ExchangeInfo> = serde_json::from_str(&content)?;

    Ok(exchanges.get(exchange_code).cloned())
}

/// List all available exchanges
pub async fn list_all_exchanges(paths: &DataPaths) -> anyhow::Result<Vec<(String, ExchangeInfo)>> {
    let exchanges_path = paths.data_dir().join("yahoo_exchanges.json");

    if !exchanges_path.exists() {
        return Ok(Vec::new());
    }

    let content = fs::read_to_string(&exchanges_path).await?;
    let exchanges: HashMap<String, ExchangeInfo> = serde_json::from_str(&content)?;

    let mut exchange_list: Vec<_> = exchanges.into_iter().collect();
    exchange_list.sort_by(|a, b| a.0.cmp(&b.0));

    Ok(exchange_list)
}
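A short read-side sketch of how the saved mapping might be consumed. The DataPaths value is assumed to come from the caller, and "NMS" is only an example exchange code, not a guaranteed key.

// Hypothetical consumer of yahoo_exchanges.json via the accessors above.
async fn print_exchange_overview(paths: &DataPaths) -> anyhow::Result<()> {
    for (code, info) in list_all_exchanges(paths).await? {
        println!("{}: {} companies ({})", code, info.companies.len(), info.exchange_name);
    }
    // "NMS" is an illustrative exchange code.
    if let Some(info) = get_exchange_info(paths, "NMS").await? {
        println!("NMS market cap (USD): {:.0}", info.total_market_cap_usd);
    }
    Ok(())
}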
@@ -1,51 +0,0 @@
// src/corporate/fx.rs
use std::collections::HashMap;
use reqwest;
use serde_json::Value;
use tokio::fs;
use std::path::Path;

static FX_CACHE_PATH: &str = "fx_rates.json";

pub async fn get_usd_rate(currency: &str) -> anyhow::Result<f64> {
    if currency == "USD" {
        return Ok(1.0);
    }

    let mut cache: HashMap<String, (f64, String)> = if Path::new(FX_CACHE_PATH).exists() {
        let content = fs::read_to_string(FX_CACHE_PATH).await?;
        serde_json::from_str(&content).unwrap_or_default()
    } else {
        HashMap::new()
    };

    let today = chrono::Local::now().format("%Y-%m-%d").to_string();
    if let Some((rate, date)) = cache.get(currency) {
        if date == &today {
            return Ok(*rate);
        }
    }

    let symbol = format!("{}USD=X", currency);
    let url = format!("https://query1.finance.yahoo.com/v8/finance/chart/{}?range=1d&interval=1d", symbol);

    let json: Value = reqwest::Client::new()
        .get(&url)
        .header("User-Agent", "Mozilla/5.0")
        .send()
        .await?
        .json()
        .await?;

    let close = json["chart"]["result"][0]["meta"]["regularMarketPrice"]
        .as_f64()
        .or_else(|| json["chart"]["result"][0]["indicators"]["quote"][0]["close"][0].as_f64())
        .unwrap_or(1.0);

    let rate = if currency == "JPY" || currency == "KRW" { close } else { 1.0 / close }; // inverse pairs

    cache.insert(currency.to_string(), (rate, today.clone()));
    let _ = fs::write(FX_CACHE_PATH, serde_json::to_string_pretty(&cache)?).await;

    Ok(rate)
}
@@ -1,20 +1,25 @@
 // src/corporate/helpers.rs
 use super::types::*;
-use chrono::{Local, NaiveDate};
-use std::collections::{HashMap, HashSet};
+use crate::util::directories::DataPaths;
+use chrono::{Local, NaiveDate};
+use rand::rngs::StdRng;
+use rand::prelude::{Rng, SeedableRng, IndexedRandom};
+use tokio::fs;
+use anyhow::{anyhow};
 
-pub fn event_key(e: &CompanyEvent) -> String {
+pub fn event_key(e: &CompanyEventData) -> String {
     format!("{}|{}|{}", e.ticker, e.date, e.time)
 }
 
-pub fn detect_changes(old: &CompanyEvent, new: &CompanyEvent, today: &str) -> Vec<CompanyEventChange> {
+pub fn detect_changes(old: &CompanyEventData, new: &CompanyEventData, today: &str) -> Vec<CompanyEventChangeData> {
     let mut changes = Vec::new();
     let ts = Local::now().format("%Y-%m-%d %H:%M:%S").to_string();
 
     if new.date.as_str() <= today { return changes; }
 
     if old.time != new.time {
-        changes.push(CompanyEventChange {
+        changes.push(CompanyEventChangeData {
             ticker: new.ticker.clone(),
             date: new.date.clone(),
             field_changed: "time".to_string(),
@@ -25,7 +30,7 @@ pub fn detect_changes(old: &CompanyEvent, new: &CompanyEvent, today: &str) -> Ve
     }
 
     if old.eps_forecast != new.eps_forecast {
-        changes.push(CompanyEventChange {
+        changes.push(CompanyEventChangeData {
             ticker: new.ticker.clone(),
             date: new.date.clone(),
             field_changed: "eps_forecast".to_string(),
@@ -36,7 +41,7 @@ pub fn detect_changes(old: &CompanyEvent, new: &CompanyEvent, today: &str) -> Ve
     }
 
     if old.eps_actual != new.eps_actual {
-        changes.push(CompanyEventChange {
+        changes.push(CompanyEventChangeData {
             ticker: new.ticker.clone(),
             date: new.date.clone(),
             field_changed: "eps_actual".to_string(),
@@ -51,14 +56,6 @@
     changes
 }
 
-pub fn price_key(p: &CompanyPrice) -> String {
-    if p.time.is_empty() {
-        format!("{}|{}", p.ticker, p.date)
-    } else {
-        format!("{}|{}|{}", p.ticker, p.date, p.time)
-    }
-}
-
 pub fn parse_float(s: &str) -> Option<f64> {
     s.replace("--", "").replace(",", "").parse::<f64>().ok()
 }
@@ -68,3 +65,120 @@ pub fn parse_yahoo_date(s: &str) -> anyhow::Result<NaiveDate> {
         .or_else(|_| NaiveDate::parse_from_str(s, "%b %d, %Y"))
         .map_err(|_| anyhow::anyhow!("Bad date: {s}"))
 }
+
+/// Send-safe random range
+pub fn random_range(min: u64, max: u64) -> u64 {
+    let mut rng = StdRng::from_rng(&mut rand::rng());
+    rng.random_range(min..max)
+}
+
+/// Send-safe random choice
+pub fn choose_random<T: Clone>(items: &[T]) -> T {
+    let mut rng = StdRng::from_rng(&mut rand::rng());
+    items.choose(&mut rng).unwrap().clone()
+}
+
+/// Extract first valid Yahoo ticker from company
+pub fn extract_first_yahoo_ticker(company: &CompanyData) -> Option<String> {
+    if let Some(isin_tickers_map) = &company.isin_tickers_map {
+        for tickers in isin_tickers_map.values() {
+            for ticker in tickers {
+                if ticker.starts_with("YAHOO:")
+                    && ticker != "YAHOO:NO_RESULTS"
+                    && ticker != "YAHOO:ERROR"
+                {
+                    return Some(ticker.trim_start_matches("YAHOO:").to_string());
+                }
+            }
+        }
+    }
+    None
+}
+
+/// Sanitize company name for file system use
+pub fn sanitize_company_name(name: &str) -> String {
+    name.replace("/", "_")
+        .replace("\\", "_")
+        .replace(":", "_")
+        .replace("*", "_")
+        .replace("?", "_")
+        .replace("\"", "_")
+        .replace("<", "_")
+        .replace(">", "_")
+        .replace("|", "_")
+}
+
+/// Load companies from JSONL file
+pub async fn load_companies_from_jsonl(
+    path: &std::path::Path
+) -> anyhow::Result<Vec<CompanyData>> {
+    let content = tokio::fs::read_to_string(path).await?;
+    let mut companies = Vec::new();
+
+    for line in content.lines() {
+        if line.trim().is_empty() {
+            continue;
+        }
+        if let Ok(company) = serde_json::from_str::<CompanyData>(line) {
+            companies.push(company);
+        }
+    }
+
+    Ok(companies)
+}
+
+pub async fn find_most_recent_figi_date_dir(paths: &DataPaths) -> anyhow::Result<Option<std::path::PathBuf>> {
+    let map_cache_dir = paths.cache_gleif_openfigi_map_dir();
+
+    if !map_cache_dir.exists() {
+        return Ok(None);
+    }
+
+    let mut entries = tokio::fs::read_dir(&map_cache_dir).await?;
+    let mut dates = Vec::new();
+
+    while let Some(entry) = entries.next_entry().await? {
+        let path = entry.path();
+        if path.is_dir() {
+            if let Some(name) = path.file_name().and_then(|n| n.to_str()) {
+                if name.len() == 8 && name.chars().all(|c| c.is_numeric()) {
+                    dates.push((name.to_string(), path));
+                }
+            }
+        }
+    }
+
+    if dates.is_empty() {
+        return Ok(None);
+    }
+
+    dates.sort_by(|a, b| b.0.cmp(&a.0));
+    Ok(Some(dates[0].1.clone()))
+}
+
+pub async fn determine_gleif_date(
+    gleif_date: Option<&str>,
+    paths: &DataPaths,
+) -> anyhow::Result<String> {
+    if let Some(d) = gleif_date {
+        return Ok(d.to_string());
+    }
+
+    let gleif_dir = paths.cache_gleif_dir();
+    let mut entries = fs::read_dir(gleif_dir).await?;
+    let mut dates = Vec::new();
+
+    while let Some(entry) = entries.next_entry().await? {
+        let path = entry.path();
+        if path.is_dir() {
+            if let Some(name) = path.file_name().and_then(|n| n.to_str()) {
+                if name.len() == 8 && name.chars().all(|c| c.is_numeric()) {
+                    dates.push(name.to_string());
+                }
+            }
+        }
+    }
+
+    dates.sort();
+    dates.last().cloned().ok_or_else(|| anyhow!("No GLEIF date found"))
+}
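Two of the small helpers added in this hunk, exercised with made-up inputs; this is an illustration only, not part of the commit.

// Illustrative calls to the helpers introduced above (inputs are invented).
fn helpers_demo() {
    assert_eq!(sanitize_company_name("ACME Holdings / Class A*"), "ACME Holdings _ Class A_");
    assert_eq!(parse_float("1,234.56"), Some(1234.56));
    assert_eq!(parse_float("--"), None);
}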
@@ -2,10 +2,20 @@
 pub mod types;
 pub mod scraper;
 pub mod storage;
-pub mod update;
 pub mod helpers;
-pub mod aggregation;
-pub mod fx;
-pub mod openfigi;
+pub mod update_openfigi;
+pub mod yahoo_company_extraction;
+pub mod page_validation;
+pub mod checkpoint_helpers;
+
+// Corporate update modules
+pub mod update;
+pub mod update_companies;
+pub mod update_companies_cleanse;
+pub mod update_companies_enrich;
+
+pub mod collect_exchanges;
+pub mod bond_processing;
+pub mod option_processing;
 
 pub use update::run_full_update;
File diff suppressed because it is too large
54 src/corporate/option_processing.rs Normal file
@@ -0,0 +1,54 @@
/// Parse strike price from option ticker (e.g., "AAPL 150 CALL" -> 150.0)
pub fn parse_strike_from_ticker(ticker: &str) -> Option<f64> {
    let parts: Vec<&str> = ticker.split_whitespace().collect();
    for (i, part) in parts.iter().enumerate() {
        if let Ok(strike) = part.parse::<f64>() {
            // Check if next word is CALL/PUT to confirm this is strike
            if i + 1 < parts.len() && (parts[i + 1].to_uppercase() == "CALL" || parts[i + 1].to_uppercase() == "PUT") {
                return Some(strike);
            }
        }
    }
    None
}

/// Parse expiration date from option ticker (e.g., "AAPL 150 CALL 01/17/25" -> timestamp)
pub fn parse_expiration_from_ticker(ticker: &str) -> Option<i64> {
    let parts: Vec<&str> = ticker.split_whitespace().collect();
    for part in parts {
        // Look for date pattern MM/DD/YY
        if part.contains('/') && part.len() >= 8 {
            if let Ok(date) = chrono::NaiveDate::parse_from_str(part, "%m/%d/%y") {
                return Some(date.and_hms_opt(16, 0, 0)?.and_utc().timestamp());
            }
        }
    }
    None
}

/// Parse option name to extract underlying company, issuer, and option type
///
/// Examples:
/// - "December 25 Calls on ALPHA GA" -> ("ALPHA GA", None, "call")
/// - "January 26 Puts on TESLA INC" -> ("TESLA INC", None, "put")
pub fn parse_option_name(name: &str) -> (String, Option<String>, String) {
    let name_upper = name.to_uppercase();

    // Detect option type
    let option_type = if name_upper.contains("CALL") {
        "call".to_string()
    } else if name_upper.contains("PUT") {
        "put".to_string()
    } else {
        "unknown".to_string()
    };

    // Try to extract underlying after "on"
    if let Some(pos) = name_upper.find(" ON ") {
        let underlying = name[pos + 4..].trim().to_string();
        return (underlying, None, option_type);
    }

    // Fallback: return entire name
    (name.to_string(), None, option_type)
}
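A quick sketch of the parsers above on a hypothetical ticker and option name; both input strings are invented for illustration.

// Hypothetical inputs run through the option parsers above.
fn option_parsing_demo() {
    assert_eq!(parse_strike_from_ticker("AAPL 150 CALL 01/17/25"), Some(150.0));
    let (underlying, issuer, opt_type) = parse_option_name("December 25 Calls on ALPHA GA");
    assert_eq!(underlying, "ALPHA GA");
    assert_eq!(issuer, None);
    assert_eq!(opt_type, "call");
}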
180 src/corporate/page_validation.rs Normal file
@@ -0,0 +1,180 @@
// src/corporate/page_validation.rs
//
// Utilities to ensure page state is correct before extraction

use anyhow::{anyhow, Result};
use fantoccini::Client;
use tokio::time::{sleep, Duration};

/// Validates that the browser navigated to the expected URL
///
/// This prevents extracting data from a stale page when navigation fails silently
pub async fn verify_navigation(
    client: &Client,
    expected_url_fragment: &str,
    max_attempts: u32,
) -> Result<()> {
    for attempt in 1..=max_attempts {
        let current_url = client.current_url().await?;
        let current = current_url.as_str();

        if current.contains(expected_url_fragment) {
            crate::util::logger::log_info(&format!(
                "✓ Navigation verified: {} (attempt {})",
                current, attempt
            )).await;
            return Ok(());
        }

        if attempt < max_attempts {
            crate::util::logger::log_warn(&format!(
                "Navigation mismatch (attempt {}): expected '{}', got '{}'. Retrying...",
                attempt, expected_url_fragment, current
            )).await;
            sleep(Duration::from_millis(500)).await;
        }
    }

    let current_url = client.current_url().await?;
    Err(anyhow!(
        "Navigation verification failed: expected URL containing '{}', but got '{}'",
        expected_url_fragment,
        current_url.as_str()
    ))
}

/// Clears browser state by navigating to a blank page
///
/// Use this when a navigation fails or times out to ensure clean slate
pub async fn clear_browser_state(client: &Client) -> Result<()> {
    crate::util::logger::log_info("Clearing browser state with about:blank").await;

    // Navigate to blank page to clear any stale content
    client.goto("about:blank").await?;

    // Brief wait to ensure page clears
    sleep(Duration::from_millis(200)).await;

    Ok(())
}

/// Validates that expected content exists on the page before extraction
///
/// This adds an extra safety check that the page actually loaded
pub async fn verify_page_content(
    client: &Client,
    content_checks: Vec<ContentCheck>,
) -> Result<()> {
    for check in content_checks {
        match check {
            ContentCheck::ElementExists(selector) => {
                let exists: bool = client
                    .execute(
                        &format!(
                            "return !!document.querySelector('{}');",
                            selector.replace("'", "\\'")
                        ),
                        vec![],
                    )
                    .await?
                    .as_bool()
                    .unwrap_or(false);

                if !exists {
                    return Err(anyhow!(
                        "Expected element '{}' not found on page",
                        selector
                    ));
                }
            }
            ContentCheck::TextContains(text) => {
                let page_text: String = client
                    .execute("return document.body.innerText;", vec![])
                    .await?
                    .as_str()
                    .unwrap_or("")
                    .to_string();

                if !page_text.contains(&text) {
                    return Err(anyhow!(
                        "Expected text '{}' not found on page",
                        text
                    ));
                }
            }
        }
    }

    Ok(())
}

#[derive(Debug, Clone)]
pub enum ContentCheck {
    /// Verify that a CSS selector exists
    ElementExists(String),
    /// Verify that page body contains text
    TextContains(String),
}

/// Safe navigation wrapper that validates and clears state on failure
pub async fn navigate_with_validation(
    client: &Client,
    url: &str,
    expected_url_fragment: &str,
    timeout_secs: u64,
) -> Result<()> {
    use tokio::time::timeout;

    // Attempt navigation with timeout
    let nav_result = timeout(
        Duration::from_secs(timeout_secs),
        client.goto(url)
    ).await;

    match nav_result {
        Ok(Ok(_)) => {
            // Navigation succeeded, verify we're on correct page
            verify_navigation(client, expected_url_fragment, 3).await?;
            Ok(())
        }
        Ok(Err(e)) => {
            // Navigation failed - clear state before returning error
            crate::util::logger::log_error(&format!(
                "Navigation failed: {}. Clearing browser state...",
                e
            )).await;
            clear_browser_state(client).await.ok(); // Best effort
            Err(anyhow!("Navigation failed: {}", e))
        }
        Err(_) => {
            // Navigation timed out - clear state before returning error
            crate::util::logger::log_error(&format!(
                "Navigation timeout after {}s. Clearing browser state...",
                timeout_secs
            )).await;
            clear_browser_state(client).await.ok(); // Best effort
            Err(anyhow!("Navigation timeout"))
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_content_check_variants() {
        let check1 = ContentCheck::ElementExists("table".to_string());
        let check2 = ContentCheck::TextContains("Yahoo Finance".to_string());

        match check1 {
            ContentCheck::ElementExists(sel) => assert_eq!(sel, "table"),
            _ => panic!("Wrong variant"),
        }

        match check2 {
            ContentCheck::TextContains(text) => assert_eq!(text, "Yahoo Finance"),
            _ => panic!("Wrong variant"),
        }
    }
}
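Putting the pieces together, a navigation flow might look like the sketch below; the URL, fragment, and checks are placeholders chosen for illustration, not values used elsewhere in the code.

// Hypothetical navigation flow built from the validation helpers above.
async fn open_quote_page(client: &fantoccini::Client) -> anyhow::Result<()> {
    // Navigate, confirm the URL actually changed, and clear state on failure.
    navigate_with_validation(client, "https://finance.yahoo.com/quote/ACME", "/quote/ACME", 30).await?;

    // Double-check the page really rendered the content we need before extraction.
    verify_page_content(client, vec![
        ContentCheck::ElementExists("table".to_string()),
        ContentCheck::TextContains("Market Cap".to_string()),
    ]).await?;

    Ok(())
}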
@@ -1,659 +1,13 @@
|
|||||||
// src/corporate/scraper.rs
|
// src/corporate/scraper.rs
|
||||||
use super::{types::*, helpers::*, openfigi::*};
|
use crate::{util::directories::DataPaths, util::logger};
|
||||||
//use crate::corporate::openfigi::OpenFigiClient;
|
use fantoccini::{Client};
|
||||||
use crate::{scraper::webdriver::*};
|
|
||||||
use fantoccini::{Client, Locator};
|
|
||||||
use scraper::{Html, Selector};
|
use scraper::{Html, Selector};
|
||||||
use chrono::{DateTime, Duration, NaiveDate, Utc};
|
|
||||||
use tokio::{time::{Duration as TokioDuration, sleep}};
|
|
||||||
use reqwest::Client as HttpClient;
|
|
||||||
use serde_json::{json, Value};
|
|
||||||
use zip::ZipArchive;
|
use zip::ZipArchive;
|
||||||
use std::{collections::HashMap, sync::Arc};
|
use std::{collections::HashMap};
|
||||||
use std::io::{Read};
|
use std::io::{Read};
|
||||||
use anyhow::{anyhow, Result};
|
|
||||||
|
|
||||||
const USER_AGENT: &str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36";
|
const USER_AGENT: &str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36";
|
||||||
|
|
||||||
/// Discover all exchanges where this ISIN trades by querying Yahoo Finance and enriching with OpenFIGI API calls.
///
/// # Arguments
/// * `isin` - The ISIN to search for.
/// * `known_ticker` - A known ticker symbol for fallback or initial check.
///
/// # Returns
/// A vector of FigiInfo structs containing enriched data from API calls.
///
/// # Errors
/// Returns an error if HTTP requests fail, JSON parsing fails, or OpenFIGI API responds with an error.
pub async fn discover_available_exchanges(isin: &str, known_ticker: &str) -> anyhow::Result<Vec<FigiInfo>> {
    println!(" Discovering exchanges for ISIN {}", isin);

    let mut potential: Vec<(String, PrimaryInfo)> = Vec::new();

    // Try the primary ticker first
    if let Ok(info) = check_ticker_exists(known_ticker).await {
        potential.push((known_ticker.to_string(), info));
    }

    // Search for ISIN directly on Yahoo to find other listings
    let search_url = format!(
        "https://query2.finance.yahoo.com/v1/finance/search?q={}&quotesCount=20&newsCount=0",
        isin
    );

    let resp = HttpClient::new()
        .get(&search_url)
        .header("User-Agent", USER_AGENT)
        .send()
        .await?;

    let json = resp.json::<Value>().await?;

    if let Some(quotes) = json["quotes"].as_array() {
        for quote in quotes {
            // First: filter by quoteType directly from search results (faster rejection)
            let quote_type = quote["quoteType"].as_str().unwrap_or("");
            if quote_type.to_uppercase() != "EQUITY" {
                continue; // Skip bonds, ETFs, mutual funds, options, etc.
            }

            if let Some(symbol) = quote["symbol"].as_str() {
                // Avoid duplicates
                if potential.iter().any(|(s, _)| s == symbol) {
                    continue;
                }

                // Double-check with full quote data (some search results are misleading)
                if let Ok(info) = check_ticker_exists(symbol).await {
                    potential.push((symbol.to_string(), info));
                }
            }
        }
    }

    if potential.is_empty() {
        return Ok(vec![]);
    }

    // Enrich with OpenFIGI API
    let client = OpenFigiClient::new()?;

    let mut discovered_figis = Vec::new();

    if !client.has_key() {
        // Fallback without API key - create FigiInfo with default/empty fields
        for (symbol, info) in potential {
            println!(" Found equity listing: {} on {} ({}) - no FIGI (fallback mode)", symbol, info.exchange_mic, info.currency);
            let figi_info = FigiInfo {
                isin: info.isin,
                figi: String::new(),
                name: info.name,
                ticker: symbol,
                mic_code: info.exchange_mic,
                currency: info.currency,
                compositeFIGI: String::new(),
                securityType: String::new(),
                marketSector: String::new(),
                shareClassFIGI: String::new(),
                securityType2: String::new(),
                securityDescription: String::new(),
            };
            discovered_figis.push(figi_info);
        }
        return Ok(discovered_figis);
    }

    // With API key, batch the mapping requests
    let chunk_size = 100;
    for chunk in potential.chunks(chunk_size) {
        let mut jobs = vec![];
        for (symbol, info) in chunk {
            jobs.push(json!({
                "idType": "TICKER",
                "idValue": symbol,
                "micCode": info.exchange_mic,
                "marketSecDes": "Equity",
            }));
        }

        let resp = client.get_figi_client()
            .post("https://api.openfigi.com/v3/mapping")
            .header("Content-Type", "application/json")
            .json(&jobs)
            .send()
            .await?;

        if !resp.status().is_success() {
            return Err(anyhow::anyhow!("OpenFIGI mapping failed with status: {}", resp.status()));
        }

        let parsed: Vec<Value> = resp.json().await?;

        for (i, item) in parsed.iter().enumerate() {
            let (symbol, info) = &chunk[i];
            if let Some(data) = item["data"].as_array() {
                if let Some(entry) = data.first() {
                    let market_sec = entry["marketSector"].as_str().unwrap_or("");
                    if market_sec != "Equity" {
                        continue;
                    }
                    println!(" Found equity listing: {} on {} ({}) - FIGI: {}", symbol, info.exchange_mic, info.currency, entry["figi"]);
                    let figi_info = FigiInfo {
                        isin: info.isin.clone(),
                        figi: entry["figi"].as_str().unwrap_or("").to_string(),
                        name: entry["name"].as_str().unwrap_or(&info.name).to_string(),
                        ticker: symbol.clone(),
                        mic_code: info.exchange_mic.clone(),
                        currency: info.currency.clone(),
                        compositeFIGI: entry["compositeFIGI"].as_str().unwrap_or("").to_string(),
                        securityType: entry["securityType"].as_str().unwrap_or("").to_string(),
                        marketSector: market_sec.to_string(),
                        shareClassFIGI: entry["shareClassFIGI"].as_str().unwrap_or("").to_string(),
                        securityType2: entry["securityType2"].as_str().unwrap_or("").to_string(),
                        securityDescription: entry["securityDescription"].as_str().unwrap_or("").to_string(),
                    };
                    discovered_figis.push(figi_info);
                } else {
                    println!(" No data returned for ticker {} on MIC {}", symbol, info.exchange_mic);
                }
            } else if let Some(error) = item["error"].as_str() {
                println!(" OpenFIGI error for ticker {}: {}", symbol, error);
            }
        }

        // Respect rate limit (6 seconds between requests with key)
        sleep(TokioDuration::from_secs(6)).await;
    }

    Ok(discovered_figis)
}
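// A minimal calling sketch for the discovery step above; the ISIN/ticker pair is a
// placeholder (Apple), and the surrounding async runtime is assumed rather than shown.
async fn print_listings() -> anyhow::Result<()> {
    let figis = discover_available_exchanges("US0378331005", "AAPL").await?;
    for f in &figis {
        println!("{} trades as {} on {} in {}", f.isin, f.ticker, f.mic_code, f.currency);
    }
    Ok(())
}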
/// Check if a ticker exists on Yahoo Finance and return core metadata.
///
/// This function calls the public Yahoo Finance quoteSummary endpoint and extracts:
/// - ISIN (when available)
/// - Company name
/// - Exchange MIC code
/// - Trading currency
///
/// It strictly filters to only accept **equity** securities.
///
/// # Arguments
/// * `ticker` - The ticker symbol to validate (e.g., "AAPL", "7203.T", "BMW.DE")
///
/// # Returns
/// `Ok(PrimaryInfo)` on success, `Err` if ticker doesn't exist, is not equity, or data is malformed.
///
/// # Errors
/// - Ticker not found
/// - Not an equity (ETF, bond, etc.)
/// - Missing critical fields
/// - Network or JSON parsing errors
pub async fn check_ticker_exists(ticker: &str) -> anyhow::Result<PrimaryInfo> {
    let url = format!(
        "https://query1.finance.yahoo.com/v10/finance/quoteSummary/{}?modules=price%2CassetProfile",
        ticker
    );

    let resp = match HttpClient::new()
        .get(&url)
        .header("User-Agent", USER_AGENT)
        .send()
        .await
    {
        Ok(resp) => resp,
        Err(err) => {
            return Err(anyhow::anyhow!(
                "Failed to reach Yahoo Finance for ticker {}: {}",
                ticker,
                err
            ));
        }
    };

    if !resp.status().is_success() {
        return Err(anyhow::anyhow!("Yahoo returned HTTP {} for ticker {}", resp.status(), ticker));
    }

    let json: Value = match resp.json().await {
        Ok(resp) => resp,
        Err(err) => {
            return Err(anyhow::anyhow!(
                "Failed to parse JSON response from Yahoo Finance {}: {}",
                ticker,
                err
            ));
        }
    };

    let result_array = json["quoteSummary"]["result"]
        .as_array()
        .ok_or_else(|| anyhow::anyhow!("Missing 'quoteSummary.result' in response"))?;

    if result_array.is_empty() || result_array[0].is_null() {
        return Err(anyhow::anyhow!("No quote data returned for ticker {}", ticker));
    }

    let quote = &result_array[0]["price"];
    let profile = &result_array[0]["assetProfile"];

    // === 1. Must be EQUITY ===
    let quote_type = quote["quoteType"]
        .as_str()
        .unwrap_or("")
        .to_ascii_uppercase();

    if quote_type != "EQUITY" {
        println!(" → Skipping {} (quoteType: {})", ticker, quote_type);
        return Err(anyhow::anyhow!("Not an equity security: {}", quote_type));
    }

    // === 2. Extract basic info ===
    let long_name = quote["longName"]
        .as_str()
        .or_else(|| quote["shortName"].as_str())
        .unwrap_or(ticker)
        .trim()
        .to_string();

    let currency = quote["currency"]
        .as_str()
        .unwrap_or("USD")
        .to_string();

    let exchange_mic = quote["exchange"]
        .as_str()
        .unwrap_or("")
        .to_string();

    if exchange_mic.is_empty() {
        return Err(anyhow::anyhow!("Missing exchange MIC for ticker {}", ticker));
    }

    // === 3. Extract ISIN (from assetProfile if available) ===
    let isin = profile["isin"]
        .as_str()
        .and_then(|s| if s.len() == 12 && s.chars().all(|c| c.is_ascii_alphanumeric()) { Some(s) } else { None })
        .unwrap_or("")
        .to_ascii_uppercase();

    // === 4. Final sanity check: reject obvious debt securities ===
    let name_upper = long_name.to_ascii_uppercase();
    if name_upper.contains(" BOND") ||
       name_upper.contains(" NOTE") ||
       name_upper.contains(" DEBENTURE") ||
       name_upper.contains(" PREFERRED") && !name_upper.contains(" STOCK") {
        return Err(anyhow::anyhow!("Security name suggests debt instrument: {}", long_name));
    }

    println!(
        " → Valid equity: {} | {} | {} | ISIN: {}",
        ticker,
        long_name,
        exchange_mic,
        if isin.is_empty() { "N/A" } else { &isin }
    );

    Ok(PrimaryInfo {
        isin,
        name: long_name,
        exchange_mic,
        currency,
    })
}
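// A network-dependent smoke-test sketch for check_ticker_exists; the ticker is a
// placeholder and the assertions assume Yahoo still reports AAPL as a USD equity.
#[tokio::test]
#[ignore] // hits the live quoteSummary endpoint; run with `cargo test -- --ignored`
async fn smoke_check_ticker_exists() {
    let info = check_ticker_exists("AAPL").await.expect("lookup failed");
    assert_eq!(info.currency, "USD");
    assert!(!info.exchange_mic.is_empty());
}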
/// Convert Yahoo's exchange name to MIC code (best effort)
fn exchange_name_to_mic(name: &str) -> String {
    match name {
        "NMS" | "NasdaqGS" | "NASDAQ" => "XNAS",
        "NYQ" | "NYSE" => "XNYS",
        "LSE" | "London" => "XLON",
        "FRA" | "Frankfurt" | "GER" | "XETRA" => "XFRA",
        "PAR" | "Paris" => "XPAR",
        "AMS" | "Amsterdam" => "XAMS",
        "MIL" | "Milan" => "XMIL",
        "JPX" | "Tokyo" => "XJPX",
        "HKG" | "Hong Kong" => "XHKG",
        "SHH" | "Shanghai" => "XSHG",
        "SHZ" | "Shenzhen" => "XSHE",
        "TOR" | "Toronto" => "XTSE",
        "ASX" | "Australia" => "XASX",
        "SAU" | "Saudi" => "XSAU",
        "SWX" | "Switzerland" => "XSWX",
        "BSE" | "Bombay" => "XBSE",
        "NSE" | "NSI" => "XNSE",
        "TAI" | "Taiwan" => "XTAI",
        "SAO" | "Sao Paulo" => "BVMF",
        "MCE" | "Madrid" => "XMAD",
        _ => name, // Fallback to name itself
    }.to_string()
}
/// Fetches earnings events for a ticker using a dedicated ScrapeTask.
///
/// This function creates and executes a ScrapeTask to navigate to the Yahoo Finance earnings calendar,
/// reject cookies, and extract the events.
///
/// # Arguments
/// * `ticker` - The stock ticker symbol.
///
/// # Returns
/// A vector of CompanyEvent structs on success.
///
/// # Errors
/// Returns an error if the task execution fails, e.g., chromedriver spawn or navigation issues.
pub async fn fetch_earnings_with_pool(
    ticker: &str,
    pool: &Arc<ChromeDriverPool>,
) -> anyhow::Result<Vec<CompanyEvent>> {
    let ticker = ticker.to_string();
    let url = format!("https://finance.yahoo.com/calendar/earnings?symbol={}", ticker);

    let ticker_cloned = ticker.clone();

    pool.execute(url, move |client| {
        let ticker = ticker_cloned.clone();
        Box::pin(async move {
            reject_yahoo_cookies(&client).await?;
            extract_earnings_events(&client, &ticker).await
        })
    }).await
}
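// A sketch of how a caller might drive fetch_earnings_with_pool; only pool.execute is
// taken from this file, and the pool construction is assumed to happen elsewhere.
async fn fetch_batch(pool: std::sync::Arc<ChromeDriverPool>) -> anyhow::Result<()> {
    for ticker in ["AAPL", "MSFT", "BMW.DE"] {
        match fetch_earnings_with_pool(ticker, &pool).await {
            Ok(events) => println!("{}: {} earnings rows", ticker, events.len()),
            Err(e) => eprintln!("{}: failed ({})", ticker, e),
        }
    }
    Ok(())
}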
/// Extracts earnings events from the currently loaded Yahoo Finance earnings calendar page.
///
/// This function assumes the client is already navigated to the correct URL (e.g.,
/// https://finance.yahoo.com/calendar/earnings?symbol={ticker}) and cookies are handled.
///
/// It waits for the earnings table, extracts rows, parses cells into CompanyEvent structs,
/// and handles date parsing, float parsing, and optional fields.
///
/// # Arguments
/// * `client` - The fantoccini Client with the page loaded.
/// * `ticker` - The stock ticker symbol for the events.
///
/// # Returns
/// A vector of CompanyEvent on success.
///
/// # Errors
/// Returns an error if:
/// - Table or elements not found.
/// - Date or float parsing fails.
/// - WebDriver operations fail.
///
/// # Examples
///
/// ```no_run
/// use fantoccini::Client;
/// use crate::corporate::scraper::extract_earnings_events;
///
/// #[tokio::main]
/// async fn main() -> Result<()> {
///     // Assume client is set up and navigated
///     let events = extract_earnings_events(&client, "AAPL").await?;
///     Ok(())
/// }
/// ```
pub async fn extract_earnings_events(client: &Client, ticker: &str) -> Result<Vec<CompanyEvent>> {
    // Wait for the table to load
    let table = client
        .wait()
        .for_element(Locator::Css(r#"table[data-test="cal-table"]"#))
        .await
        .map_err(|e| anyhow!("Failed to find earnings table: {}", e))?;

    // Find all rows in tbody
    let rows = table
        .find_all(Locator::Css("tbody tr"))
        .await
        .map_err(|e| anyhow!("Failed to find table rows: {}", e))?;

    let mut events = Vec::with_capacity(rows.len());

    for row in rows {
        let cells = row
            .find_all(Locator::Css("td"))
            .await
            .map_err(|e| anyhow!("Failed to find cells in row: {}", e))?;

        if cells.len() < 5 {
            continue; // Skip incomplete rows
        }

        // Extract and parse date
        let date_str = cells[0]
            .text()
            .await
            .map_err(|e| anyhow!("Failed to get date text: {}", e))?;
        let date = parse_yahoo_date(&date_str)
            .map_err(|e| anyhow!("Failed to parse date '{}': {}", date_str, e))?
            .format("%Y-%m-%d")
            .to_string();

        // Extract time, replace "Time Not Supplied" with empty
        let time = cells[1]
            .text()
            .await
            .map_err(|e| anyhow!("Failed to get time text: {}", e))?
            .replace("Time Not Supplied", "");

        // Extract period
        let period = cells[2]
            .text()
            .await
            .map_err(|e| anyhow!("Failed to get period text: {}", e))?;

        // Parse EPS forecast
        let eps_forecast_str = cells[3]
            .text()
            .await
            .map_err(|e| anyhow!("Failed to get EPS forecast text: {}", e))?;
        let eps_forecast = parse_float(&eps_forecast_str);

        // Parse EPS actual
        let eps_actual_str = cells[4]
            .text()
            .await
            .map_err(|e| anyhow!("Failed to get EPS actual text: {}", e))?;
        let eps_actual = parse_float(&eps_actual_str);

        // Parse surprise % if available
        let surprise_pct = if cells.len() > 5 {
            let surprise_str = cells[5]
                .text()
                .await
                .map_err(|e| anyhow!("Failed to get surprise text: {}", e))?;
            parse_float(&surprise_str)
        } else {
            None
        };

        events.push(CompanyEvent {
            ticker: ticker.to_string(),
            date,
            time,
            period,
            eps_forecast,
            eps_actual,
            revenue_forecast: None,
            revenue_actual: None,
            surprise_pct,
            source: "Yahoo".to_string(),
        });
    }

    if events.is_empty() {
        eprintln!("Warning: No earnings events extracted for ticker {}", ticker);
    } else {
        println!("Extracted {} earnings events for {}", events.len(), ticker);
    }

    Ok(events)
}
fn parse_price(v: Option<&Value>) -> f64 {
    v.and_then(|x| x.as_str())
        .and_then(|s| s.replace('$', "").replace(',', "").parse::<f64>().ok())
        .or_else(|| v.and_then(|x| x.as_f64()))
        .unwrap_or(0.0)
}

fn parse_volume(v: Option<&Value>) -> u64 {
    v.and_then(|x| x.as_str())
        .and_then(|s| s.replace(',', "").parse::<u64>().ok())
        .or_else(|| v.and_then(|x| x.as_u64()))
        .unwrap_or(0)
}
pub async fn fetch_daily_price_history(
    ticker: &str,
    start_str: &str,
    end_str: &str,
) -> anyhow::Result<Vec<CompanyPrice>> {
    let start = NaiveDate::parse_from_str(start_str, "%Y-%m-%d")?;
    let end = NaiveDate::parse_from_str(end_str, "%Y-%m-%d")? + Duration::days(1);

    let mut all_prices = Vec::new();
    let mut current = start;

    while current < end {
        let chunk_end = current + Duration::days(730);
        let actual_end = chunk_end.min(end);

        let period1 = current.and_hms_opt(0, 0, 0).unwrap().and_utc().timestamp();
        let period2 = actual_end.and_hms_opt(0, 0, 0).unwrap().and_utc().timestamp();

        println!(" Fetching {ticker} {} → {}", current, actual_end - Duration::days(1));

        let url = format!(
            "https://query1.finance.yahoo.com/v8/finance/chart/{ticker}?period1={period1}&period2={period2}&interval=1d&includeAdjustedClose=true"
        );

        let json: Value = HttpClient::new()
            .get(&url)
            .header("User-Agent", USER_AGENT)
            .send()
            .await?
            .json()
            .await?;

        let result = &json["chart"]["result"][0];
        let timestamps = result["timestamp"].as_array().ok_or_else(|| anyhow::anyhow!("No timestamps"))?;
        let quote = &result["indicators"]["quote"][0];
        let meta = &result["meta"];
        let currency = meta["currency"].as_str().unwrap_or("USD").to_string();

        let opens = quote["open"].as_array();
        let highs = quote["high"].as_array();
        let lows = quote["low"].as_array();
        let closes = quote["close"].as_array();
        let adj_closes = result["indicators"]["adjclose"][0]["adjclose"].as_array()
            .or_else(|| closes);
        let volumes = quote["volume"].as_array();

        for (i, ts_val) in timestamps.iter().enumerate() {
            let ts = ts_val.as_i64().unwrap_or(0);
            let dt: DateTime<Utc> = DateTime::from_timestamp(ts, 0).unwrap_or_default();
            let date_str = dt.format("%Y-%m-%d").to_string();

            if date_str < start_str.to_string() || date_str > end_str.to_string() {
                continue;
            }

            let open = parse_price(opens.and_then(|a| a.get(i)));
            let high = parse_price(highs.and_then(|a| a.get(i)));
            let low = parse_price(lows.and_then(|a| a.get(i)));
            let close = parse_price(closes.and_then(|a| a.get(i)));
            let adj_close = parse_price(adj_closes.and_then(|a| a.get(i)));
            let volume = parse_volume(volumes.and_then(|a| a.get(i)));

            all_prices.push(CompanyPrice {
                ticker: ticker.to_string(),
                date: date_str,
                time: "".to_string(),
                open,
                high,
                low,
                close,
                adj_close,
                volume,
                currency: currency.clone(),
            });
        }

        sleep(TokioDuration::from_millis(200)).await;
        current = actual_end;
    }

    all_prices.sort_by_key(|p| (p.date.clone(), p.time.clone()));
    all_prices.dedup_by(|a, b| a.date == b.date && a.time == b.time);

    println!(" Got {} daily bars for {ticker}", all_prices.len());
    Ok(all_prices)
}
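// fetch_daily_price_history walks the requested range in 730-day chunks and merges the
// bars; a minimal caller sketch (ticker and dates are placeholders):
async fn print_last_close() -> anyhow::Result<()> {
    let bars = fetch_daily_price_history("AAPL", "2020-01-01", "2023-12-31").await?;
    if let Some(last) = bars.last() {
        println!("{} closed at {} {} on {}", last.ticker, last.close, last.currency, last.date);
    }
    Ok(())
}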
pub async fn fetch_price_history_5min(
    ticker: &str,
    _start: &str,
    _end: &str,
) -> anyhow::Result<Vec<CompanyPrice>> {
    let now = Utc::now().timestamp();
    let period1 = now - 5184000; // 5,184,000 seconds = trailing 60 days
    let period2 = now;

    let url = format!(
        "https://query1.finance.yahoo.com/v8/finance/chart/{ticker}?period1={period1}&period2={period2}&interval=5m&includeAdjustedClose=true"
    );

    let json: Value = HttpClient::new()
        .get(&url)
        .header("User-Agent", USER_AGENT)
        .send()
        .await?
        .json()
        .await?;

    let result = &json["chart"]["result"][0];
    let timestamps = result["timestamp"].as_array().ok_or_else(|| anyhow::anyhow!("No timestamps"))?;
    let quote = &result["indicators"]["quote"][0];
    let meta = &result["meta"];
    let currency = meta["currency"].as_str().unwrap_or("USD").to_string();

    let mut prices = Vec::new();

    for (i, ts_val) in timestamps.iter().enumerate() {
        let ts = ts_val.as_i64().unwrap_or(0);
        let dt: DateTime<Utc> = DateTime::from_timestamp(ts, 0).unwrap_or_default();
        let date_str = dt.format("%Y-%m-%d").to_string();
        let time_str = dt.format("%H:%M:%S").to_string();

        let open = parse_price(quote["open"].as_array().and_then(|a| a.get(i)));
        let high = parse_price(quote["high"].as_array().and_then(|a| a.get(i)));
        let low = parse_price(quote["low"].as_array().and_then(|a| a.get(i)));
        let close = parse_price(quote["close"].as_array().and_then(|a| a.get(i)));
        let volume = parse_volume(quote["volume"].as_array().and_then(|a| a.get(i)));

        prices.push(CompanyPrice {
            ticker: ticker.to_string(),
            date: date_str,
            time: time_str,
            open,
            high,
            low,
            close,
            adj_close: close,
            volume,
            currency: currency.clone(),
        });
    }

    prices.sort_by_key(|p| (p.date.clone(), p.time.clone()));
    Ok(prices)
}
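// The 5-minute variant ignores its _start/_end arguments and always pulls the trailing
// 60-day window; a sketch combining both resolutions for one ticker (dates are placeholders):
async fn fetch_all_prices(ticker: &str) -> anyhow::Result<(Vec<CompanyPrice>, Vec<CompanyPrice>)> {
    let daily = fetch_daily_price_history(ticker, "2010-01-01", "2025-01-01").await?;
    let intraday = fetch_price_history_5min(ticker, "", "").await?;
    Ok((daily, intraday))
}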
/// Fetch the URL of the latest ISIN↔LEI mapping CSV from GLEIF
/// Overengineered; we could just use the static URL, but this shows how to scrape if needed
pub async fn _fetch_latest_gleif_isin_lei_mapping_url(client: &Client) -> anyhow::Result<String> {
@@ -670,66 +24,126 @@ pub async fn _fetch_latest_gleif_isin_lei_mapping_url(client: &Client) -> anyhow
|
|||||||
|
|
||||||
pub async fn download_isin_lei_csv() -> anyhow::Result<Option<String>> {
|
pub async fn download_isin_lei_csv() -> anyhow::Result<Option<String>> {
|
||||||
let url = "https://mapping.gleif.org/api/v2/isin-lei/9315e3e3-305a-4e71-b062-46714740fa8d/download";
|
let url = "https://mapping.gleif.org/api/v2/isin-lei/9315e3e3-305a-4e71-b062-46714740fa8d/download";
|
||||||
let zip_path = "data/gleif/isin_lei.zip";
|
|
||||||
let csv_path = "data/gleif/isin_lei.csv";
|
|
||||||
|
|
||||||
if let Err(e) = std::fs::create_dir_all("data") {
|
let paths = DataPaths::new(".")?;
|
||||||
println!("Failed to create data directory: {e}");
|
let gleif_cache_dir = paths.cache_gleif_dir();
|
||||||
|
|
||||||
|
if let Err(e) = std::fs::create_dir_all(&gleif_cache_dir) {
|
||||||
|
let msg = format!("Failed to create cache/gleif directory: {}", e);
|
||||||
|
logger::log_error(&msg).await;
|
||||||
return Ok(None);
|
return Ok(None);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Download ZIP
|
logger::log_info("Downloading ISIN/LEI mapping from GLEIF...").await;
|
||||||
let bytes = match reqwest::Client::builder()
|
|
||||||
|
let client = match reqwest::Client::builder()
|
||||||
.user_agent(USER_AGENT)
|
.user_agent(USER_AGENT)
|
||||||
.timeout(std::time::Duration::from_secs(30))
|
.timeout(std::time::Duration::from_secs(30))
|
||||||
.build()
|
.build()
|
||||||
.and_then(|c| Ok(c))
|
|
||||||
{
|
{
|
||||||
Ok(client) => match client.get(url).send().await {
|
Ok(c) => c,
|
||||||
Ok(resp) if resp.status().is_success() => match resp.bytes().await {
|
|
||||||
Ok(b) => b,
|
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
println!("Failed to read ZIP bytes: {e}");
|
logger::log_error(&format!("Failed to create HTTP client: {}", e)).await;
|
||||||
return Ok(None);
|
|
||||||
}
|
|
||||||
},
|
|
||||||
Ok(resp) => {
|
|
||||||
println!("Server returned HTTP {}", resp.status());
|
|
||||||
return Ok(None);
|
|
||||||
}
|
|
||||||
Err(e) => {
|
|
||||||
println!("Failed to download ISIN/LEI ZIP: {e}");
|
|
||||||
return Ok(None);
|
|
||||||
}
|
|
||||||
},
|
|
||||||
Err(e) => {
|
|
||||||
println!("Failed to create HTTP client: {e}");
|
|
||||||
return Ok(None);
|
return Ok(None);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
if let Err(e) = tokio::fs::write(zip_path, &bytes).await {
|
let resp = match client.get(url).send().await {
|
||||||
println!("Failed to write ZIP file: {e}");
|
Ok(r) if r.status().is_success() => r,
|
||||||
return Ok(None);
|
Ok(resp) => {
|
||||||
}
|
logger::log_error(&format!("Server returned HTTP {}", resp.status())).await;
|
||||||
|
|
||||||
// Extract CSV
|
|
||||||
let archive = match std::fs::File::open(zip_path)
|
|
||||||
.map(ZipArchive::new)
|
|
||||||
{
|
|
||||||
Ok(Ok(a)) => a,
|
|
||||||
Ok(Err(e)) => {
|
|
||||||
println!("Invalid ZIP: {e}");
|
|
||||||
return Ok(None);
|
return Ok(None);
|
||||||
}
|
}
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
println!("Cannot open ZIP file: {e}");
|
logger::log_error(&format!("Failed to download: {}", e)).await;
|
||||||
|
return Ok(None);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
let filename = resp
|
||||||
|
.headers()
|
||||||
|
.get("content-disposition")
|
||||||
|
.and_then(|h| h.to_str().ok())
|
||||||
|
.and_then(|s| s.split("filename=").nth(1).map(|f| f.trim_matches('"').to_string()))
|
||||||
|
.unwrap_or_else(|| "isin_lei.zip".to_string());
|
||||||
|
|
||||||
|
let parsed_filename = parse_gleif_filename(&filename);
|
||||||
|
logger::log_info(&format!("Downloaded: {} -> {}", filename, parsed_filename)).await;
|
||||||
|
|
||||||
|
// Extract date from filename
|
||||||
|
let mut date_str = String::new();
|
||||||
|
if let Some(start_idx) = parsed_filename.find("isin-lei-") {
|
||||||
|
let rest = &parsed_filename[start_idx + 9..];
|
||||||
|
if rest.len() >= 8 {
|
||||||
|
date_str = rest[0..8].to_string();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let date_dir = if !date_str.is_empty() {
|
||||||
|
let p = gleif_cache_dir.join(&date_str);
|
||||||
|
if let Err(e) = std::fs::create_dir_all(&p) {
|
||||||
|
logger::log_warn(&format!("Failed to create date directory: {}", e)).await;
|
||||||
|
None
|
||||||
|
} else {
|
||||||
|
Some(p)
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
};
|
||||||
|
|
||||||
|
let target_dir = date_dir.clone().unwrap_or_else(|| gleif_cache_dir.to_path_buf());
|
||||||
|
|
||||||
|
// Check for existing clean CSV
|
||||||
|
if let Some(ref ddir) = date_dir {
|
||||||
|
if let Ok(entries) = std::fs::read_dir(ddir) {
|
||||||
|
for entry in entries.flatten() {
|
||||||
|
if let Some(name) = entry.file_name().to_str() {
|
||||||
|
if name.to_lowercase().ends_with("_clean.csv") {
|
||||||
|
let path = ddir.join(name);
|
||||||
|
logger::log_info(&format!("Found existing clean CSV: {}", path.display())).await;
|
||||||
|
return Ok(Some(path.to_string_lossy().to_string()));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let csv_candidate = target_dir.join(parsed_filename.replace(".zip", ".csv"));
|
||||||
|
if csv_candidate.exists() {
|
||||||
|
logger::log_info(&format!("Found existing CSV: {}", csv_candidate.display())).await;
|
||||||
|
return Ok(Some(csv_candidate.to_string_lossy().to_string()));
|
||||||
|
}
|
||||||
|
|
||||||
|
let bytes = match resp.bytes().await {
|
||||||
|
Ok(b) => b,
|
||||||
|
Err(e) => {
|
||||||
|
logger::log_error(&format!("Failed to read bytes: {}", e)).await;
|
||||||
|
return Ok(None);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
let zip_path = target_dir.join(&parsed_filename);
|
||||||
|
let csv_path = target_dir.join(parsed_filename.replace(".zip", ".csv"));
|
||||||
|
|
||||||
|
if let Err(e) = tokio::fs::write(&zip_path, &bytes).await {
|
||||||
|
logger::log_error(&format!("Failed to write ZIP: {}", e)).await;
|
||||||
|
return Ok(None);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Extract CSV from ZIP
|
||||||
|
let archive = match std::fs::File::open(&zip_path).map(ZipArchive::new) {
|
||||||
|
Ok(Ok(a)) => a,
|
||||||
|
Ok(Err(e)) => {
|
||||||
|
logger::log_error(&format!("Invalid ZIP: {}", e)).await;
|
||||||
|
return Ok(None);
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
logger::log_error(&format!("Cannot open ZIP: {}", e)).await;
|
||||||
return Ok(None);
|
return Ok(None);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
let mut archive = archive;
|
let mut archive = archive;
|
||||||
|
|
||||||
let idx = match (0..archive.len()).find(|&i| {
|
let idx = match (0..archive.len()).find(|&i| {
|
||||||
archive.by_index(i)
|
archive.by_index(i)
|
||||||
.map(|f| f.name().ends_with(".csv"))
|
.map(|f| f.name().ends_with(".csv"))
|
||||||
@@ -737,7 +151,7 @@ pub async fn download_isin_lei_csv() -> anyhow::Result<Option<String>> {
|
|||||||
}) {
|
}) {
|
||||||
Some(i) => i,
|
Some(i) => i,
|
||||||
None => {
|
None => {
|
||||||
println!("ZIP did not contain a CSV file");
|
logger::log_error("ZIP contains no CSV").await;
|
||||||
return Ok(None);
|
return Ok(None);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
@@ -745,25 +159,44 @@ pub async fn download_isin_lei_csv() -> anyhow::Result<Option<String>> {
|
|||||||
let mut csv_file = match archive.by_index(idx) {
|
let mut csv_file = match archive.by_index(idx) {
|
||||||
Ok(f) => f,
|
Ok(f) => f,
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
println!("Failed to read CSV entry: {e}");
|
logger::log_error(&format!("Failed to read CSV: {}", e)).await;
|
||||||
return Ok(None);
|
return Ok(None);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
let mut csv_bytes = Vec::new();
|
let mut csv_bytes = Vec::new();
|
||||||
if let Err(e) = csv_file.read_to_end(&mut csv_bytes) {
|
if let Err(e) = csv_file.read_to_end(&mut csv_bytes) {
|
||||||
println!("Failed to extract CSV: {e}");
|
logger::log_error(&format!("Failed to extract: {}", e)).await;
|
||||||
return Ok(None);
|
return Ok(None);
|
||||||
}
|
}
|
||||||
|
|
||||||
if let Err(e) = tokio::fs::write(csv_path, &csv_bytes).await {
|
if let Err(e) = tokio::fs::write(&csv_path, &csv_bytes).await {
|
||||||
println!("Failed to save CSV file: {e}");
|
logger::log_error(&format!("Failed to save CSV: {}", e)).await;
|
||||||
return Ok(None);
|
return Ok(None);
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(Some(csv_path.to_string()))
|
logger::log_info(&format!("✓ CSV extracted: {:?}", csv_path)).await;
|
||||||
|
Ok(Some(csv_path.to_string_lossy().to_string()))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn parse_gleif_filename(filename: &str) -> String {
|
||||||
|
if let Some(start_idx) = filename.find("isin-lei-") {
|
||||||
|
let rest = &filename[start_idx + 9..];
|
||||||
|
|
||||||
|
if rest.len() >= 8 && rest[0..8].chars().all(|c| c.is_numeric()) {
|
||||||
|
let date_part = &rest[0..8];
|
||||||
|
if date_part.len() == 8 {
|
||||||
|
let year = &date_part[0..4];
|
||||||
|
let month = &date_part[4..6];
|
||||||
|
let day = &date_part[6..8];
|
||||||
|
let extension = if filename.ends_with(".zip") { ".zip" } else { ".csv" };
|
||||||
|
return format!("isin-lei-{}{}{}{}", day, month, year, extension);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
filename.to_string()
|
||||||
|
}
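// parse_gleif_filename only reorders the embedded date from YYYYMMDD to DDMMYYYY; the
// filenames below are hypothetical, patterned on the GLEIF naming handled above.
#[test]
fn gleif_filename_reorders_date() {
    assert_eq!(parse_gleif_filename("isin-lei-20240115.zip"), "isin-lei-15012024.zip");
    // Names without the expected date block pass through unchanged.
    assert_eq!(parse_gleif_filename("mapping.zip"), "mapping.zip");
}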
|
||||||
|
|
||||||
pub async fn load_isin_lei_csv() -> anyhow::Result<HashMap<String, Vec<String>>> {
|
pub async fn load_isin_lei_csv() -> anyhow::Result<HashMap<String, Vec<String>>> {
|
||||||
// 1. Download + extract the CSV (this is now async)
|
// 1. Download + extract the CSV (this is now async)
|
||||||
@@ -813,29 +246,3 @@ pub async fn load_isin_lei_csv() -> anyhow::Result<HashMap<String, Vec<String>>>
|
|||||||
|
|
||||||
Ok(map)
|
Ok(map)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub async fn reject_yahoo_cookies(client: &Client) -> anyhow::Result<()> {
    for _ in 0..10 {
        let clicked: bool = client
            .execute(
                r#"(() => {
                    const btn = document.querySelector('#consent-page .reject-all');
                    if (btn) {
                        btn.click();
                        return true;
                    }
                    return false;
                })()"#,
                vec![],
            )
            .await?
            .as_bool()
            .unwrap_or(false);

        if clicked { break; }
        sleep(TokioDuration::from_millis(500)).await;
    }

    println!("Rejected Yahoo cookies if button existed");
    Ok(())
}
|
|
||||||
@@ -1,237 +1,87 @@
|
|||||||
// src/corporate/storage.rs
|
// src/corporate/storage.rs
|
||||||
use super::{types::*, helpers::*};
|
use crate::util::directories::DataPaths;
|
||||||
use crate::config;
|
use crate::util::logger;
|
||||||
|
|
||||||
use tokio::fs;
|
use tokio::io::AsyncWriteExt;
|
||||||
use chrono::{Datelike, NaiveDate};
|
use std::collections::HashMap;
|
||||||
use std::collections::{HashMap, HashSet};
|
use std::path::{PathBuf, Path};
|
||||||
use std::path::{Path, PathBuf};
|
|
||||||
|
|
||||||
pub async fn load_existing_events() -> anyhow::Result<HashMap<String, CompanyEvent>> {
|
|
||||||
let mut map = HashMap::new();
|
|
||||||
let dir = std::path::Path::new("corporate_events");
|
|
||||||
if !dir.exists() {
|
|
||||||
return Ok(map);
|
|
||||||
}
|
|
||||||
|
|
||||||
let mut entries = fs::read_dir(dir).await?;
|
/// Lightweight index entry - only metadata, no full event data
|
||||||
while let Some(entry) = entries.next_entry().await? {
|
#[derive(Debug, Clone)]
|
||||||
let path = entry.path();
|
pub struct EventIndex {
|
||||||
if path.extension().and_then(|s| s.to_str()) == Some("json") {
|
pub key: String,
|
||||||
let name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
|
pub ticker: String,
|
||||||
if name.starts_with("events_") && name.len() == 17 {
|
pub date: String,
|
||||||
let content = fs::read_to_string(&path).await?;
|
pub file_path: PathBuf,
|
||||||
let events: Vec<CompanyEvent> = serde_json::from_str(&content)?;
|
|
||||||
for event in events {
|
|
||||||
map.insert(event_key(&event), event);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Ok(map)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub async fn save_optimized_events(events: HashMap<String, CompanyEvent>) -> anyhow::Result<()> {
|
/// Stream companies to JSONL incrementally
|
||||||
let dir = std::path::Path::new("corporate_events");
|
pub async fn save_companies_to_jsonl_streaming(
|
||||||
fs::create_dir_all(dir).await?;
|
paths: &DataPaths,
|
||||||
|
companies_iter: impl Iterator<Item = (String, HashMap<String, String>)>,
|
||||||
|
) -> anyhow::Result<usize> {
|
||||||
|
let file_path = paths.data_dir().join("companies.jsonl");
|
||||||
|
|
||||||
let mut entries = fs::read_dir(dir).await?;
|
if let Some(parent) = file_path.parent() {
|
||||||
while let Some(entry) = entries.next_entry().await? {
|
tokio::fs::create_dir_all(parent).await?;
|
||||||
let path = entry.path();
|
}
|
||||||
let name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
|
|
||||||
if name.starts_with("events_") && path.extension().map(|e| e == "json").unwrap_or(false) {
|
let mut file = tokio::fs::File::create(&file_path).await?;
|
||||||
fs::remove_file(&path).await?;
|
let mut count = 0;
|
||||||
|
|
||||||
|
for (name, securities) in companies_iter {
|
||||||
|
let line = serde_json::json!({
|
||||||
|
"name": name,
|
||||||
|
"securities": securities
|
||||||
|
});
|
||||||
|
|
||||||
|
file.write_all(line.to_string().as_bytes()).await?;
|
||||||
|
file.write_all(b"\n").await?;
|
||||||
|
count += 1;
|
||||||
|
|
||||||
|
if count % 100 == 0 {
|
||||||
|
tokio::task::yield_now().await;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
let mut sorted: Vec<_> = events.into_values().collect();
|
logger::log_info(&format!("Saved {} companies to JSONL", count)).await;
|
||||||
sorted.sort_by_key(|e| (e.ticker.clone(), e.date.clone()));
|
Ok(count)
|
||||||
|
|
||||||
let mut by_month: HashMap<String, Vec<CompanyEvent>> = HashMap::new();
|
|
||||||
for e in sorted {
|
|
||||||
if let Ok(d) = NaiveDate::parse_from_str(&e.date, "%Y-%m-%d") {
|
|
||||||
let key = format!("{}-{:02}", d.year(), d.month());
|
|
||||||
by_month.entry(key).or_default().push(e);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
for (month, list) in by_month {
|
|
||||||
let path = dir.join(format!("events_{}.json", month));
|
|
||||||
fs::write(&path, serde_json::to_string_pretty(&list)?).await?;
|
|
||||||
}
|
|
||||||
Ok(())
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub async fn save_changes(changes: &[CompanyEventChange]) -> anyhow::Result<()> {
|
/// Stream read companies from JSONL
|
||||||
if changes.is_empty() { return Ok(()); }
|
pub async fn stream_companies_from_jsonl<F>(
|
||||||
let dir = std::path::Path::new("corporate_event_changes");
|
path: &Path,
|
||||||
fs::create_dir_all(dir).await?;
|
mut callback: F
|
||||||
|
) -> anyhow::Result<usize>
|
||||||
|
where
|
||||||
|
F: FnMut(String, HashMap<String, String>) -> anyhow::Result<()>,
|
||||||
|
{
|
||||||
|
if !path.exists() {
|
||||||
|
return Ok(0);
|
||||||
|
}
|
||||||
|
|
||||||
let mut by_month: HashMap<String, Vec<CompanyEventChange>> = HashMap::new();
|
let content = tokio::fs::read_to_string(path).await?;
|
||||||
for c in changes {
|
let mut count = 0;
|
||||||
if let Ok(d) = NaiveDate::parse_from_str(&c.date, "%Y-%m-%d") {
|
|
||||||
let key = format!("{}-{:02}", d.year(), d.month());
|
for line in content.lines() {
|
||||||
by_month.entry(key).or_default().push(c.clone());
|
if line.trim().is_empty() {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
let entry: serde_json::Value = serde_json::from_str(line)?;
|
||||||
|
let name = entry["name"].as_str().unwrap_or("").to_string();
|
||||||
|
let securities: HashMap<String, String> = serde_json::from_value(
|
||||||
|
entry["securities"].clone()
|
||||||
|
)?;
|
||||||
|
|
||||||
|
callback(name, securities)?;
|
||||||
|
count += 1;
|
||||||
|
|
||||||
|
if count % 100 == 0 {
|
||||||
|
tokio::task::yield_now().await;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for (month, list) in by_month {
|
Ok(count)
|
||||||
let path = dir.join(format!("changes_{}.json", month));
|
|
||||||
let mut all = if path.exists() {
|
|
||||||
let s = fs::read_to_string(&path).await?;
|
|
||||||
serde_json::from_str(&s).unwrap_or_default()
|
|
||||||
} else { vec![] };
|
|
||||||
all.extend(list);
|
|
||||||
fs::write(&path, serde_json::to_string_pretty(&all)?).await?;
|
|
||||||
}
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
pub async fn save_prices_for_ticker(ticker: &str, timeframe: &str, mut prices: Vec<CompanyPrice>) -> anyhow::Result<()> {
|
|
||||||
let base_dir = Path::new("corporate_prices");
|
|
||||||
let company_dir = base_dir.join(ticker.replace(".", "_"));
|
|
||||||
let timeframe_dir = company_dir.join(timeframe);
|
|
||||||
|
|
||||||
fs::create_dir_all(&timeframe_dir).await?;
|
|
||||||
let path = timeframe_dir.join("prices.json");
|
|
||||||
|
|
||||||
prices.sort_by_key(|p| (p.date.clone(), p.time.clone()));
|
|
||||||
|
|
||||||
let json = serde_json::to_string_pretty(&prices)?;
|
|
||||||
fs::write(&path, json).await?;
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn get_company_dir(lei: &str) -> PathBuf {
|
|
||||||
PathBuf::from("corporate_prices").join(lei)
|
|
||||||
}
|
|
||||||
|
|
||||||
pub async fn ensure_company_dirs(isin: &str) -> anyhow::Result<()> {
|
|
||||||
let base = get_company_dir(isin);
|
|
||||||
let paths = [
|
|
||||||
base.clone(),
|
|
||||||
base.join("5min"),
|
|
||||||
base.join("daily"),
|
|
||||||
base.join("aggregated").join("5min"),
|
|
||||||
base.join("aggregated").join("daily"),
|
|
||||||
];
|
|
||||||
for p in paths {
|
|
||||||
fs::create_dir_all(&p).await?;
|
|
||||||
}
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
pub async fn save_available_exchanges(isin: &str, exchanges: Vec<AvailableExchange>) -> anyhow::Result<()> {
|
|
||||||
let dir = get_company_dir(isin);
|
|
||||||
fs::create_dir_all(&dir).await?;
|
|
||||||
let path = dir.join("available_exchanges.json");
|
|
||||||
fs::write(&path, serde_json::to_string_pretty(&exchanges)?).await?;
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
pub async fn load_available_exchanges(lei: &str) -> anyhow::Result<Vec<AvailableExchange>> {
|
|
||||||
let path = get_company_dir(lei).join("available_exchanges.json");
|
|
||||||
if path.exists() {
|
|
||||||
let content = fs::read_to_string(&path).await?;
|
|
||||||
Ok(serde_json::from_str(&content)?)
|
|
||||||
} else {
|
|
||||||
Ok(vec![])
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub async fn save_prices_by_source(
|
|
||||||
lei: &str,
|
|
||||||
source_ticker: &str,
|
|
||||||
timeframe: &str,
|
|
||||||
prices: Vec<CompanyPrice>,
|
|
||||||
) -> anyhow::Result<()> {
|
|
||||||
let source_safe = source_ticker.replace(".", "_").replace("/", "_");
|
|
||||||
let dir = get_company_dir(lei).join(timeframe).join(&source_safe);
|
|
||||||
fs::create_dir_all(&dir).await?;
|
|
||||||
let path = dir.join("prices.json");
|
|
||||||
let mut prices = prices;
|
|
||||||
prices.sort_by_key(|p| (p.date.clone(), p.time.clone()));
|
|
||||||
fs::write(&path, serde_json::to_string_pretty(&prices)?).await?;
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Update available_exchanges.json with fetch results
|
|
||||||
pub async fn update_available_exchange(
|
|
||||||
isin: &str,
|
|
||||||
ticker: &str,
|
|
||||||
exchange_mic: &str,
|
|
||||||
has_daily: bool,
|
|
||||||
has_5min: bool,
|
|
||||||
) -> anyhow::Result<()> {
|
|
||||||
let mut exchanges = load_available_exchanges(isin).await?;
|
|
||||||
|
|
||||||
if let Some(entry) = exchanges.iter_mut().find(|e| e.ticker == ticker) {
|
|
||||||
// Update existing entry
|
|
||||||
entry.record_success(has_daily, has_5min);
|
|
||||||
} else {
|
|
||||||
// Create new entry - need to get currency from somewhere
|
|
||||||
// Try to infer from the ticker or use a default
|
|
||||||
let currency = infer_currency_from_ticker(ticker);
|
|
||||||
let mut new_entry = AvailableExchange::new(
|
|
||||||
ticker.to_string(),
|
|
||||||
exchange_mic.to_string(),
|
|
||||||
currency,
|
|
||||||
);
|
|
||||||
new_entry.record_success(has_daily, has_5min);
|
|
||||||
exchanges.push(new_entry);
|
|
||||||
}
|
|
||||||
|
|
||||||
save_available_exchanges(isin, exchanges).await
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Add a newly discovered exchange before fetching
|
|
||||||
///
|
|
||||||
/// # Arguments
|
|
||||||
/// * `isin` - The ISIN associated with the exchange.
|
|
||||||
/// * `figi_info` - The FigiInfo containing ticker, mic_code, and currency.
|
|
||||||
///
|
|
||||||
/// # Returns
|
|
||||||
/// Ok(()) on success.
|
|
||||||
///
|
|
||||||
/// # Errors
|
|
||||||
/// Returns an error if loading or saving available exchanges fails.
|
|
||||||
pub async fn add_discovered_exchange(
|
|
||||||
isin: &str,
|
|
||||||
figi_info: &FigiInfo,
|
|
||||||
) -> anyhow::Result<()> {
|
|
||||||
let mut exchanges = load_available_exchanges(isin).await?;
|
|
||||||
|
|
||||||
// Only add if not already present
|
|
||||||
if !exchanges.iter().any(|e| e.ticker == figi_info.ticker && e.exchange_mic == figi_info.mic_code) {
|
|
||||||
let new_entry = AvailableExchange::new(
|
|
||||||
figi_info.ticker.clone(),
|
|
||||||
figi_info.mic_code.clone(),
|
|
||||||
figi_info.currency.clone(),
|
|
||||||
);
|
|
||||||
exchanges.push(new_entry);
|
|
||||||
save_available_exchanges(isin, exchanges).await?;
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Infer currency from ticker suffix
|
|
||||||
fn infer_currency_from_ticker(ticker: &str) -> String {
|
|
||||||
if ticker.ends_with(".L") { return "GBP".to_string(); }
|
|
||||||
if ticker.ends_with(".PA") { return "EUR".to_string(); }
|
|
||||||
if ticker.ends_with(".DE") { return "EUR".to_string(); }
|
|
||||||
if ticker.ends_with(".AS") { return "EUR".to_string(); }
|
|
||||||
if ticker.ends_with(".MI") { return "EUR".to_string(); }
|
|
||||||
if ticker.ends_with(".SW") { return "CHF".to_string(); }
|
|
||||||
if ticker.ends_with(".T") { return "JPY".to_string(); }
|
|
||||||
if ticker.ends_with(".HK") { return "HKD".to_string(); }
|
|
||||||
if ticker.ends_with(".SS") { return "CNY".to_string(); }
|
|
||||||
if ticker.ends_with(".SZ") { return "CNY".to_string(); }
|
|
||||||
if ticker.ends_with(".TO") { return "CAD".to_string(); }
|
|
||||||
if ticker.ends_with(".AX") { return "AUD".to_string(); }
|
|
||||||
if ticker.ends_with(".SA") { return "BRL".to_string(); }
|
|
||||||
if ticker.ends_with(".MC") { return "EUR".to_string(); }
|
|
||||||
if ticker.ends_with(".BO") || ticker.ends_with(".NS") { return "INR".to_string(); }
|
|
||||||
|
|
||||||
"USD".to_string() // Default
|
|
||||||
}
|
}
|
||||||
@@ -1,10 +1,174 @@
|
|||||||
use std::collections::HashMap;
|
|
||||||
|
|
||||||
// src/corporate/types.rs
|
// src/corporate/types.rs
|
||||||
|
use std::collections::HashMap;
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
|
pub struct ChartData {
|
||||||
|
pub symbol: String,
|
||||||
|
pub quotes: Vec<Quote>,
|
||||||
|
pub timestamp: i64,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
|
pub struct Quote {
|
||||||
|
pub timestamp: i64,
|
||||||
|
pub open: Option<f64>,
|
||||||
|
pub high: Option<f64>,
|
||||||
|
pub low: Option<f64>,
|
||||||
|
pub close: Option<f64>,
|
||||||
|
pub volume: Option<u64>,
|
||||||
|
pub adjusted_close: Option<f64>,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Figi Info based on API calls [https://www.openfigi.com/]
|
||||||
|
/// # Attributes
|
||||||
|
/// isin: ISIN belonging to this legal entity from lei
|
||||||
|
///
|
||||||
|
/// # Comments
|
||||||
|
/// Use Mapping the Object List onto Figi Properties
|
||||||
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
|
pub struct FigiData {
|
||||||
|
pub isin: String,
|
||||||
|
pub figi: String,
|
||||||
|
pub name: String,
|
||||||
|
pub ticker: String,
|
||||||
|
pub exch_code: String,
|
||||||
|
#[serde(rename = "compositeFIGI")]
|
||||||
|
pub composite_figi: String,
|
||||||
|
#[serde(rename = "securityType")]
|
||||||
|
pub security_type: String,
|
||||||
|
#[serde(rename = "marketSector")]
|
||||||
|
pub market_sector: String,
|
||||||
|
#[serde(rename = "shareClassFIGI")]
|
||||||
|
pub share_class_figi: String,
|
||||||
|
#[serde(rename = "securityType2")]
|
||||||
|
pub security_type2: String,
|
||||||
|
#[serde(rename = "securityDescription")]
|
||||||
|
pub security_description: String,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Company Info
|
||||||
|
/// # Attributes
|
||||||
|
/// * Name as primary key (for one institution) -> might have to changed when first FigiInfo is coming in
|
||||||
|
/// * ISIN as the most liquid / preferred traded security (used for fallback)
|
||||||
|
/// * securities: Grouped by ISIN, filtered for Common Stock only
|
||||||
|
/// * isin_tickers_map: Map of ISINs to their associated tickers across platforms
|
||||||
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
|
pub struct CompanyData{
|
||||||
|
pub name: String,
|
||||||
|
pub primary_isin: String,
|
||||||
|
pub securities: HashMap<String, Vec<FigiData>>, // ISIN -> Vec<FigiInfo>
|
||||||
|
pub yahoo_company_data: Option<Vec<YahooCompanyData>>,
|
||||||
|
pub isin_tickers_map: Option<HashMap<String, Vec<String>>>, // ISIN -> Tickers
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
|
pub struct YahooCompanyData {
|
||||||
|
pub ticker: String,
|
||||||
|
pub sector: Option<String>,
|
||||||
|
pub exchange: Option<String>,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
|
pub struct WarrantData {
|
||||||
|
pub company_name: String, // key in CompanyData
|
||||||
|
pub warrants: HashMap<String, WarrantDetails>, // underlying company name -> Warrant
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Warrant Data
|
||||||
|
///
|
||||||
|
/// Information for Warrant securities fetched out of Name in FigiData
|
||||||
|
/// example1: "name": "VONTOBE-PW26 LEONARDO SPA",
|
||||||
|
/// issued by VONTOBEL Put Warrant for underlying company LEONARDO SPA
|
||||||
|
/// example2: "BAYER H-CW25 L'OREAL",
|
||||||
|
/// for other formats with only one company instead of two, underlying and issuing company are the same; leave issuer_company_name NULL
|
||||||
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
|
pub struct WarrantDetails {
|
||||||
|
pub company_name: String, // key in CompanyData, key for WarrantDetails
|
||||||
|
pub issuer_company_name: Option<String>, // key in CompanyData
|
||||||
|
pub warrant_type: String, // "put" or "call"
|
||||||
|
pub warrants: HashMap<String, Vec<FigiData>>, // ISIN -> Vec<FigiData> (grouped by ISIN)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
|
pub struct OptionData {
|
||||||
|
pub company_name: String, // key in CompanyData
|
||||||
|
pub expiration_dates: Vec<i64>,
|
||||||
|
pub strikes: Vec<f64>,
|
||||||
|
pub option: Vec<OptionChain>,
|
||||||
|
pub timestamp: i64,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
|
pub struct OptionChain {
|
||||||
|
pub expiration_date: i64,
|
||||||
|
pub calls: Vec<OptionContract>,
|
||||||
|
pub puts: Vec<OptionContract>,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
|
pub struct OptionContract {
|
||||||
|
pub strike: f64,
|
||||||
|
pub last_price: Option<f64>,
|
||||||
|
pub bid: Option<f64>,
|
||||||
|
pub ask: Option<f64>,
|
||||||
|
pub volume: Option<u64>,
|
||||||
|
pub open_interest: Option<u64>,
|
||||||
|
pub implied_volatility: Option<f64>,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Bond parsed details from ticker/description
|
||||||
|
///
|
||||||
|
/// Parses bond information from ticker format:
|
||||||
|
/// Corporate: "WTFC 4.3 01/12/26 0003"
|
||||||
|
/// Government: "SLOVAK 1.5225 05/10/28 4Y"
|
||||||
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
|
pub struct BondDetails {
|
||||||
|
pub coupon_rate: Option<f64>, // 4.3, 1.5225
|
||||||
|
pub maturity_date: Option<String>, // "2026-01-12", "2028-05-10"
|
||||||
|
pub is_floating: bool, // true if "Float" in description
|
||||||
|
pub is_zero_coupon: bool, // true if coupon is 0
|
||||||
|
pub tenor_years: Option<u32>, // Parsed from maturity or inferred
|
||||||
|
pub series_identifier: Option<String>, // "0003", "4Y", "144A", "REGS", etc.
|
||||||
|
}
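// The ticker formats quoted above ("WTFC 4.3 01/12/26 0003", "SLOVAK 1.5225 05/10/28 4Y")
// map onto BondDetails roughly as sketched below; this is an illustrative parser, not the
// crate's actual implementation.
fn parse_bond_ticker_sketch(ticker: &str) -> BondDetails {
    use chrono::NaiveDate;

    let parts: Vec<&str> = ticker.split_whitespace().collect();
    // Second token is the coupon ("4.3", "1.5225"); missing or non-numeric means None.
    let coupon_rate = parts.get(1).and_then(|s| s.parse::<f64>().ok());
    // Third token is the maturity in MM/DD/YY, normalised to YYYY-MM-DD.
    let maturity_date = parts
        .get(2)
        .and_then(|s| NaiveDate::parse_from_str(s, "%m/%d/%y").ok())
        .map(|d| d.format("%Y-%m-%d").to_string());

    BondDetails {
        coupon_rate,
        maturity_date,
        is_floating: ticker.to_uppercase().contains("FLOAT"),
        is_zero_coupon: coupon_rate == Some(0.0),
        tenor_years: None, // could be derived from the maturity in a later pass
        series_identifier: parts.get(3).map(|s| s.to_string()),
    }
}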
|
||||||
|
|
||||||
|
/// Corporate Bond Info
|
||||||
|
///
|
||||||
|
/// Information for corporate bonds grouped by issuer
|
||||||
|
/// Example: "name": "LIBERTYVILLE BK & TRUST"
|
||||||
|
/// ticker: "WTFC 4.3 01/12/26 0003"
|
||||||
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
|
pub struct CorporateBondData {
|
||||||
|
pub underlying_company_name: String, // key - company name issuing the bond
|
||||||
|
pub bonds: HashMap<String, Vec<FigiData>>, // ISIN -> Vec<FigiInfo> (grouped by ISIN)
|
||||||
|
#[serde(skip_serializing_if = "HashMap::is_empty", default)]
|
||||||
|
pub bond_details: HashMap<String, BondDetails>, // ISIN -> parsed bond details
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Government Bond Info
|
||||||
|
///
|
||||||
|
/// Information for government bonds grouped by issuer (country/municipality)
|
||||||
|
/// Example: "name": "SLOVAK REPUBLIC"
|
||||||
|
/// ticker: "SLOVAK 1.5225 05/10/28 4Y"
|
||||||
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
|
pub struct GovernmentBondData {
|
||||||
|
pub issuer_name: String, // key - government entity name
|
||||||
|
pub issuer_type: String, // "sovereign", "municipal", "state", "province", etc.
|
||||||
|
pub bonds: HashMap<String, Vec<FigiData>>, // ISIN -> Vec<FigiInfo> (grouped by ISIN)
|
||||||
|
#[serde(skip_serializing_if = "HashMap::is_empty", default)]
|
||||||
|
pub bond_details: HashMap<String, BondDetails>, // ISIN -> parsed bond details
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
|
pub struct ExchangeData {
|
||||||
|
pub mic: String,
|
||||||
|
pub ticker: String,
|
||||||
|
#[serde(default)]
|
||||||
|
pub currency: String,
|
||||||
|
}
|
||||||
|
|
||||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
|
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
|
||||||
pub struct CompanyEvent {
|
pub struct CompanyEventData {
|
||||||
pub ticker: String,
|
pub ticker: String,
|
||||||
pub date: String, // YYYY-MM-DD
|
pub date: String, // YYYY-MM-DD
|
||||||
pub time: String, // "AMC", "BMO", "TAS", or ""
|
pub time: String, // "AMC", "BMO", "TAS", or ""
|
||||||
@@ -18,21 +182,7 @@ pub struct CompanyEvent {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
pub struct CompanyPrice {
|
pub struct CompanyEventChangeData {
|
||||||
pub ticker: String,
|
|
||||||
pub date: String, // YYYY-MM-DD
|
|
||||||
pub time: String, // HH:MM:SS for intraday, "" for daily
|
|
||||||
pub open: f64,
|
|
||||||
pub high: f64,
|
|
||||||
pub low: f64,
|
|
||||||
pub close: f64,
|
|
||||||
pub adj_close: f64,
|
|
||||||
pub volume: u64,
|
|
||||||
pub currency: String,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
||||||
pub struct CompanyEventChange {
|
|
||||||
pub ticker: String,
|
pub ticker: String,
|
||||||
pub date: String,
|
pub date: String,
|
||||||
pub field_changed: String, // "time", "eps_forecast", "eps_actual", "new_event"
|
pub field_changed: String, // "time", "eps_forecast", "eps_actual", "new_event"
|
||||||
@@ -40,124 +190,3 @@ pub struct CompanyEventChange {
|
|||||||
pub new_value: String,
|
pub new_value: String,
|
||||||
pub detected_at: String,
|
pub detected_at: String,
|
||||||
}
|
}
|
/// Figi Info based on API calls [https://www.openfigi.com/]
/// # Attributes
/// isin: ISIN belonging to this legal entity from lei
///
/// # Comments
/// Use Mapping the Object List onto Figi Properties
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FigiInfo {
    pub isin: String,
    pub figi: String,
    pub name: String,
    pub ticker: String,
    pub mic_code: String,
    pub currency: String,
    pub compositeFIGI: String,
    pub securityType: String,
    pub marketSector: String,
    pub shareClassFIGI: String,
    pub securityType2: String,
    pub securityDescription: String,
}

/// Company Meta Data
/// # Attributes
/// * lei: Structuring the companies by legal dependencies [LEI -> Vec<ISIN>]
/// * figi: metadata with ISIN as key
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CompanyMetadata {
    pub lei: String,
    pub figi: Option<Vec<FigiInfo>>,
}

/// Company Info
/// # Attributes
/// * Name as primary key (for one institution) -> might have to be changed when the first FigiInfo comes in
/// * ISIN as the most liquid / preferred traded security (used for fallback)
/// * securities: Grouped by ISIN, filtered for Common Stock only
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CompanyInfo {
    pub name: String,
    pub primary_isin: String,
    pub securities: HashMap<String, Vec<FigiInfo>>, // ISIN -> Vec<FigiInfo>
}

/// Warrant Info
///
/// Information for warrant securities extracted from the name field in FigiInfo
/// example1: "name": "VONTOBE-PW26 LEONARDO SPA",
/// issued by VONTOBEL, a put warrant on the underlying company LEONARDO SPA
/// example2: "BAYER H-CW25 L'OREAL",
/// For formats with only one company instead of two, the underlying and issuing company are the same; leave issuer_company_name NULL
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct WarrantInfo {
    pub underlying_company_name: String, // key in CompanyInfo, key for WarrantInfo
    pub issuer_company_name: Option<String>, // key in CompanyInfo
    pub warrant_type: String, // "put" or "call"
    pub warrants: HashMap<String, Vec<FigiInfo>>, // ISIN -> Vec<FigiInfo> (grouped by ISIN)
}

/// Option Info
///
/// Information for option securities extracted from the name field in FigiInfo
/// example1: "name": "December 25 Calls on ALPHA GA",
/// issued by NULL, a call option on the underlying company ALPHA GA
/// For formats with only one company instead of two, the underlying and issuing company are the same; leave issuer_company_name NULL
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OptionInfo {
    pub underlying_company_name: String, // key in CompanyInfo, key for OptionInfo
    pub issuer_company_name: Option<String>, // key in CompanyInfo
    pub option_type: String, // "put" or "call"
    pub options: HashMap<String, Vec<FigiInfo>>, // ISIN -> Vec<FigiInfo> (grouped by ISIN)
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PrimaryInfo {
    pub isin: String,
    pub name: String,
    pub exchange_mic: String,
    pub currency: String,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AvailableExchange {
    pub exchange_mic: String,
    pub ticker: String,
    pub has_daily: bool,
    pub has_5min: bool,
    pub last_successful_fetch: Option<String>, // YYYY-MM-DD
    #[serde(default)]
    pub currency: String,
    #[serde(default)]
    pub discovered_at: Option<String>, // When this exchange was first discovered
    #[serde(default)]
    pub fetch_count: u32, // How many times successfully fetched
}

impl AvailableExchange {
    pub fn new(ticker: String, exchange_mic: String, currency: String) -> Self {
        Self {
            exchange_mic,
            ticker,
            has_daily: false,
            has_5min: false,
            last_successful_fetch: None,
            currency,
            discovered_at: Some(chrono::Local::now().format("%Y-%m-%d").to_string()),
            fetch_count: 0,
        }
    }

    pub fn record_success(&mut self, has_daily: bool, has_5min: bool) {
        let today = chrono::Local::now().format("%Y-%m-%d").to_string();

        self.has_daily |= has_daily;
        self.has_5min |= has_5min;
        self.last_successful_fetch = Some(today);
        self.fetch_count += 1;
    }
}
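A short usage sketch of the `AvailableExchange` helper defined above (it assumes the struct and `chrono` are in scope as in this module); `record_success` only ever widens the `has_daily`/`has_5min` flags and bumps the fetch counter:

```rust
fn track_exchange_example() {
    // new(ticker, exchange_mic, currency) - note the ticker comes first.
    let mut ex = AvailableExchange::new("AAPL".to_string(), "XNAS".to_string(), "USD".to_string());
    assert_eq!(ex.fetch_count, 0);
    assert!(!ex.has_daily && !ex.has_5min);

    // First fetch only found daily candles.
    ex.record_success(true, false);
    // A later fetch found 5-minute data; the daily flag is kept (|=), not reset.
    ex.record_success(false, true);

    assert!(ex.has_daily && ex.has_5min);
    assert_eq!(ex.fetch_count, 2);
    assert!(ex.last_successful_fetch.is_some()); // today's date, YYYY-MM-DD
}
```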
@@ -1,146 +1,129 @@
// src/corporate/update.rs
use super::{scraper::*, update_openfigi::*};
use crate::config::Config;
use crate::check_shutdown;
use crate::corporate::update_companies::update_companies;
use crate::corporate::update_companies_cleanse::{companies_yahoo_cleansed_low_profile, companies_yahoo_cleansed_no_data};
use crate::corporate::update_companies_enrich::{enrich_companies_with_events, enrich_companies_with_chart, enrich_companies_with_option};
use crate::corporate::collect_exchanges::collect_and_save_exchanges;
use crate::economic::yahoo_update_forex::collect_fx_rates;
use crate::util::directories::DataPaths;
use crate::util::logger;
use crate::scraper::webdriver::ChromeDriverPool;
use crate::scraper::yahoo::YahooClientPool;
use crate::scraper::openfigi::load_figi_type_lists;

use std::result::Result::Ok;
use std::sync::Arc;
use std::sync::atomic::AtomicBool;

/// Main corporate update entry point with shutdown awareness
pub async fn run_full_update(
    config: &Config,
    pool: &Arc<ChromeDriverPool>,
    shutdown_flag: &Arc<AtomicBool>,
) -> anyhow::Result<()> {
    logger::log_info("=== Corporate Update ===").await;

    let paths = DataPaths::new(".")?;

    check_shutdown!(shutdown_flag);

    logger::log_info("Step 1: Downloading GLEIF CSV...").await;
    let gleif_csv_path = match download_isin_lei_csv().await? {
        Some(p) => {
            logger::log_info(&format!("  ✓ GLEIF CSV at: {}", p)).await;
            p
        }
        None => {
            logger::log_warn("  ✗ Could not obtain GLEIF CSV").await;
            return Ok(());
        }
    };

    check_shutdown!(shutdown_flag);

    logger::log_info("Step 2: Loading OpenFIGI metadata...").await;
    load_figi_type_lists(&paths).await.ok();
    logger::log_info("  ✓ OpenFIGI metadata loaded").await;

    check_shutdown!(shutdown_flag);

    logger::log_info("Step 3: Checking LEI-FIGI mapping status...").await;
    let all_mapped = update_lei_mapping(&paths, &gleif_csv_path, None).await?;

    if !all_mapped {
        logger::log_warn("  ⚠ Some LEIs failed to map - continuing with partial data").await;
    } else {
        logger::log_info("  ✓ All LEIs successfully mapped").await;
    }

    check_shutdown!(shutdown_flag);

    logger::log_info("Step 4: Building securities map (streaming)...").await;
    update_securities(&paths).await?;
    logger::log_info("  ✓ Securities map updated").await;

    check_shutdown!(shutdown_flag);

    logger::log_info("Step 5: Building companies.jsonl with Yahoo Data...").await;
    let count = update_companies(&paths, pool, shutdown_flag, config, &None).await?;
    logger::log_info(&format!("  ✓ Saved {} companies", count)).await;

    check_shutdown!(shutdown_flag);

    logger::log_info("Step 6: Cleansing companies with missing essential data...").await;
    let cleansed_count = companies_yahoo_cleansed_no_data(&paths).await?;
    logger::log_info(&format!("  ✓ {} companies found on Yahoo ready for further use in companies_yahoo.jsonl", cleansed_count)).await;

    check_shutdown!(shutdown_flag);

    let proxy_pool = pool.get_proxy_pool()
        .ok_or_else(|| anyhow::anyhow!("ChromeDriverPool must be created with VPN proxy rotation enabled"))?;

    logger::log_info("Creating YahooClientPool with proxy rotation...").await;
    let yahoo_pool = Arc::new(YahooClientPool::new(proxy_pool, config, None).await?);
    logger::log_info(&format!("✓ YahooClientPool ready with {} clients", yahoo_pool.num_clients().await)).await;

    check_shutdown!(shutdown_flag);

    logger::log_info("Step 7: Cleansing companies with too low profile (with abort-safe persistence)...").await;
    let cleansed_count = companies_yahoo_cleansed_low_profile(&paths, config, yahoo_pool.clone(), shutdown_flag).await?;
    logger::log_info(&format!("  ✓ {} companies with sufficient profile ready for analytics", cleansed_count)).await;

    check_shutdown!(shutdown_flag);

    logger::log_info("Step 8: Enriching companies with Yahoo Events (with abort-safe persistence)...").await;
    let enriched_count = enrich_companies_with_events(&paths, config, yahoo_pool.clone(), shutdown_flag).await?;
    logger::log_info(&format!("  ✓ {} companies enriched with event data", enriched_count)).await;

    check_shutdown!(shutdown_flag);

    logger::log_info("Step 9: Enriching companies with Yahoo Options (with abort-safe persistence)...").await;
    let options_count = enrich_companies_with_option(&paths, config, yahoo_pool.clone(), shutdown_flag).await?;
    logger::log_info(&format!("  ✓ {} companies enriched with options data", options_count)).await;

    check_shutdown!(shutdown_flag);

    logger::log_info("Step 10: Enriching companies with Yahoo Chart (with abort-safe persistence)...").await;
    let chart_count = enrich_companies_with_chart(&paths, config, yahoo_pool.clone(), shutdown_flag).await?;
    logger::log_info(&format!("  ✓ {} companies enriched with chart data", chart_count)).await;

    check_shutdown!(shutdown_flag);

    logger::log_info("Step 11: Collecting FX rates...").await;
    let fx_count = collect_fx_rates(&paths, config, yahoo_pool.clone(), shutdown_flag).await?;
    logger::log_info(&format!("  ✓ Collected {} FX rates", fx_count)).await;

    check_shutdown!(shutdown_flag);

    logger::log_info("Step 12: Collecting exchange information...").await;
    let exchange_count = collect_and_save_exchanges(&paths).await?;
    logger::log_info(&format!("  ✓ Collected {} exchanges", exchange_count)).await;

    logger::log_info("=== Corporate update complete ===").await;
    Ok(())
}
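`check_shutdown!` is imported from the crate root and its definition is not part of this diff. As an assumption, it behaves like the early-return guard sketched below over the shared `AtomicBool`; the real macro may differ (for example by logging or by returning an error instead of `Ok(())`):

```rust
// Hypothetical sketch only - the actual macro lives at the crate root and is not shown here.
#[macro_export]
macro_rules! check_shutdown {
    ($flag:expr) => {
        if $flag.load(std::sync::atomic::Ordering::SeqCst) {
            // Abort the current step early; the caller's Result<()> absorbs the early return.
            return Ok(());
        }
    };
}
```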
// Removed in this change: the batch change-detection helpers previously kept in update.rs.
pub struct ProcessResult {
    pub changes: Vec<CompanyEventChange>,
}

pub fn process_batch(
    new_events: &[CompanyEvent],
    existing: &mut HashMap<String, CompanyEvent>,
    today: &str,
) -> ProcessResult {
    let mut changes = Vec::new();

    for new in new_events {
        let key = event_key(new);

        if let Some(old) = existing.get(&key) {
            changes.extend(detect_changes(old, new, today));
            existing.insert(key, new.clone());
            continue;
        }

        // Check for time change on same date
        let date_key = format!("{}|{}", new.ticker, new.date);
        let mut found_old = None;
        for (k, e) in existing.iter() {
            if format!("{}|{}", e.ticker, e.date) == date_key && k != &key {
                found_old = Some((k.clone(), e.clone()));
                break;
            }
        }

        if let Some((old_key, old_event)) = found_old {
            if new.date.as_str() > today {
                changes.push(CompanyEventChange {
                    ticker: new.ticker.clone(),
                    date: new.date.clone(),
                    field_changed: "time".to_string(),
                    old_value: old_event.time.clone(),
                    new_value: new.time.clone(),
                    detected_at: Local::now().format("%Y-%m-%d %H:%M:%S").to_string(),
                });
            }
            existing.remove(&old_key);
        }

        existing.insert(key, new.clone());
    }

    ProcessResult { changes }
}
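`process_batch` above looks events up by their full key first and then falls back to a `ticker|date` match to spot a rescheduled time slot. A stripped-down, standalone illustration of that matching idea, using a simplified `Event` type rather than the crate's `CompanyEvent`/`event_key`:

```rust
use std::collections::HashMap;

#[derive(Clone, Debug)]
struct Event { ticker: String, date: String, time: String }

/// Returns Some((old_time, new_time)) when an event for the same ticker/date
/// already exists under a different time slot.
fn time_change(existing: &HashMap<String, Event>, new: &Event) -> Option<(String, String)> {
    let date_key = format!("{}|{}", new.ticker, new.date);
    existing.values()
        .find(|e| format!("{}|{}", e.ticker, e.date) == date_key && e.time != new.time)
        .map(|old| (old.time.clone(), new.time.clone()))
}

fn main() {
    let mut existing = HashMap::new();
    existing.insert(
        "AAPL|2026-01-28|AMC".to_string(),
        Event { ticker: "AAPL".into(), date: "2026-01-28".into(), time: "AMC".into() },
    );
    let moved = Event { ticker: "AAPL".into(), date: "2026-01-28".into(), time: "BMO".into() };
    assert_eq!(time_change(&existing, &moved), Some(("AMC".into(), "BMO".into())));
}
```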
907 src/corporate/update_companies.rs Normal file
@@ -0,0 +1,907 @@
// src/corporate/update_companies.rs
use super::{types::*, yahoo_company_extraction::*, helpers::*};
use crate::util::directories::DataPaths;
use crate::util::integrity::{DataStage, StateManager, file_reference};
use crate::util::logger;
use crate::scraper::webdriver::ChromeDriverPool;
use crate::scraper::hard_reset::perform_hard_reset;
use crate::corporate::checkpoint_helpers;
use crate::config::Config;

use tokio::sync::mpsc;
use tokio::io::AsyncWriteExt;
use tokio::fs::OpenOptions;
use tokio::time::sleep;
use std::collections::HashMap;
use std::sync::Arc;
use std::sync::atomic::{AtomicBool, Ordering};
use std::time::Duration;
use futures::stream::{FuturesUnordered, StreamExt};
use anyhow::{anyhow, Result};

/// Represents a write command to be serialized through the log writer
enum LogCommand {
    Write(CompanyData),
    Checkpoint,
    Shutdown,
}

/// Result from processing a single company
struct CompanyProcessResult {
    company: CompanyData,
    is_update: bool,
}

/// Check if a company needs Yahoo data processing.
/// Returns true if the company has incomplete data (needs processing).
fn company_needs_processing(
    company_name: &str,
    company_info: &CompanyData,
    existing_companies: &HashMap<String, CompanyData>,
) -> bool {
    // If the company is not in the existing data at all, it definitely needs processing
    let Some(existing_entry) = existing_companies.get(company_name) else {
        return true;
    };

    // Collect all ISINs this company should have
    let mut required_isins = std::collections::HashSet::new();
    for figi_infos in company_info.securities.values() {
        for figi_info in figi_infos {
            if !figi_info.isin.is_empty() {
                required_isins.insert(figi_info.isin.clone());
            }
        }
    }

    // Check each required ISIN
    for isin in required_isins {
        // Check if this ISIN exists in the company's ticker map
        if let Some(map) = &existing_entry.isin_tickers_map {
            if let Some(tickers) = map.get(&isin) {
                // Check if this ISIN has valid Yahoo data
                let has_valid_yahoo = tickers.iter().any(|t| {
                    t.starts_with("YAHOO:") &&
                    t != "YAHOO:ERROR" // Error marker means needs retry
                    // "YAHOO:NO_RESULTS" is treated as valid (legitimately not found)
                });

                // If there is no valid Yahoo data for this ISIN, the company needs processing
                if !has_valid_yahoo {
                    return true;
                }
            } else {
                // ISIN not in map at all, needs processing
                return true;
            }
        } else {
            // No isin_tickers_map at all, needs processing
            return true;
        }
    }

    // All ISINs have valid Yahoo data, skip this company
    false
}
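The update function that follows recovers its state by reading the checkpoint file first and then replaying the append-only log, with later log entries overwriting checkpoint entries of the same key. A minimal synchronous sketch of that recovery idea (simplified record type and plain `std::fs`, unlike the async tokio code used in this module):

```rust
use std::collections::HashMap;
use std::path::Path;
use serde::{Deserialize, Serialize};

#[derive(Debug, Serialize, Deserialize)]
struct Company { name: String, status: String }

fn load_with_log(checkpoint: &Path, log: &Path) -> HashMap<String, Company> {
    let mut state = HashMap::new();
    // Checkpoint first, then the log: replayed entries win because they are inserted last.
    for path in [checkpoint, log] {
        let Ok(content) = std::fs::read_to_string(path) else { continue };
        for line in content.lines() {
            // Skip blank or truncated lines (a crash can leave a partial last line).
            if line.trim().is_empty() || !line.ends_with('}') { continue; }
            if let Ok(c) = serde_json::from_str::<Company>(line) {
                state.insert(c.name.clone(), c);
            }
        }
    }
    state
}
```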
/// Abort-safe incremental JSONL persistence with proper hard reset handling
|
||||||
|
pub async fn update_companies(
|
||||||
|
paths: &DataPaths,
|
||||||
|
pool: &Arc<ChromeDriverPool>,
|
||||||
|
shutdown_flag: &Arc<AtomicBool>,
|
||||||
|
config: &Config,
|
||||||
|
monitoring: &Option<crate::monitoring::MonitoringHandle>,
|
||||||
|
) -> anyhow::Result<usize> {
|
||||||
|
// Configuration constants
|
||||||
|
const CHECKPOINT_INTERVAL: usize = 50;
|
||||||
|
const FSYNC_BATCH_SIZE: usize = 10;
|
||||||
|
const FSYNC_INTERVAL_SECS: u64 = 10;
|
||||||
|
const CONCURRENCY_LIMIT: usize = 100;
|
||||||
|
|
||||||
|
// Wrap pool in mutex for potential replacement
|
||||||
|
let pool_mutex = Arc::new(tokio::sync::Mutex::new(Arc::clone(pool)));
|
||||||
|
|
||||||
|
// Synchronization for hard reset
|
||||||
|
let reset_in_progress = Arc::new(tokio::sync::Mutex::new(false));
|
||||||
|
|
||||||
|
let securities_path = paths.figi_securities_dir();
|
||||||
|
let securities_checkpoint = securities_path.join("common_stocks.jsonl");
|
||||||
|
let securities_log = securities_path.join("common_stocks.log.jsonl");
|
||||||
|
|
||||||
|
if !securities_checkpoint.exists() {
|
||||||
|
logger::log_warn("No common_stocks.jsonl found").await;
|
||||||
|
return Ok(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Load securities from checkpoint and replay log
|
||||||
|
logger::log_info("Loading common stocks from JSONL checkpoint and log...").await;
|
||||||
|
let securities = load_securities_from_jsonl(&securities_checkpoint, &securities_log).await?;
|
||||||
|
logger::log_info(&format!("Loaded {} companies from common stocks", securities.len())).await;
|
||||||
|
|
||||||
|
let companies_path = paths.data_dir().join("companies.jsonl");
|
||||||
|
let log_path = paths.data_dir().join("companies_updates.log");
|
||||||
|
|
||||||
|
if let Some(parent) = companies_path.parent() {
|
||||||
|
tokio::fs::create_dir_all(parent).await?;
|
||||||
|
}
|
||||||
|
|
||||||
|
let manager = StateManager::new(paths.integrity_dir()).await?;
|
||||||
|
let content_reference = file_reference(&companies_path);
|
||||||
|
let step_name = "corporate_companies_update";
|
||||||
|
let data_stage = DataStage::Data;
|
||||||
|
|
||||||
|
if manager.is_step_valid(step_name).await? {
|
||||||
|
logger::log_info(" Companies data already built and valid").await;
|
||||||
|
return Ok(securities.len());
|
||||||
|
}
|
||||||
|
logger::log_info(" Companies data incomplete or missing, proceeding with update").await;
|
||||||
|
let entry: crate::util::integrity::StateEntry = manager.create_entry(step_name.to_string(), content_reference, data_stage).await?;
|
||||||
|
|
||||||
|
// === RECOVERY PHASE: Load checkpoint + replay log ===
|
||||||
|
let existing_companies = checkpoint_helpers::load_checkpoint_with_log(
|
||||||
|
&companies_path,
|
||||||
|
&log_path,
|
||||||
|
"companies.jsonl"
|
||||||
|
).await?;
|
||||||
|
|
||||||
|
// === SETUP LOG WRITER TASK ===
|
||||||
|
let (write_tx, mut write_rx) = mpsc::channel::<LogCommand>(1000);
|
||||||
|
|
||||||
|
let log_file_init = OpenOptions::new()
|
||||||
|
.create(true)
|
||||||
|
.append(true)
|
||||||
|
.open(&log_path)
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
let companies_path_clone = companies_path.clone();
|
||||||
|
let log_path_clone = log_path.clone();
|
||||||
|
let existing_companies_writer = Arc::new(tokio::sync::Mutex::new(existing_companies.clone()));
|
||||||
|
|
||||||
|
// Clone the Arc for the writer task (Arc clone is cheap, just increments ref count)
|
||||||
|
let existing_companies_writer_for_task = Arc::clone(&existing_companies_writer);
|
||||||
|
|
||||||
|
let write_tx_for_writer = write_tx.clone();
|
||||||
|
let writer_task = tokio::spawn(async move {
|
||||||
|
let mut log_file = log_file_init;
|
||||||
|
let mut writes_since_fsync = 0;
|
||||||
|
let mut last_fsync = std::time::Instant::now();
|
||||||
|
let mut updates_since_checkpoint = 0;
|
||||||
|
let mut count = 0;
|
||||||
|
let mut new_count = 0;
|
||||||
|
let mut updated_count = 0;
|
||||||
|
|
||||||
|
while let Some(cmd) = write_rx.recv().await {
|
||||||
|
match cmd {
|
||||||
|
LogCommand::Write(company) => {
|
||||||
|
// Write to log
|
||||||
|
let line = serde_json::to_string(&company).unwrap();
|
||||||
|
if let Err(e) = log_file.write_all(line.as_bytes()).await {
|
||||||
|
logger::log_error(&format!("Failed to write to log: {}", e)).await;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if let Err(e) = log_file.write_all(b"\n").await {
|
||||||
|
logger::log_error(&format!("Failed to write newline: {}", e)).await;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
writes_since_fsync += 1;
|
||||||
|
updates_since_checkpoint += 1;
|
||||||
|
count += 1;
|
||||||
|
|
||||||
|
// Update in-memory state
|
||||||
|
let mut existing_companies = existing_companies_writer_for_task.lock().await;
|
||||||
|
let is_update = existing_companies.contains_key(&company.name);
|
||||||
|
existing_companies.insert(company.name.clone(), company);
|
||||||
|
drop(existing_companies);
|
||||||
|
|
||||||
|
if is_update {
|
||||||
|
updated_count += 1;
|
||||||
|
} else {
|
||||||
|
new_count += 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Batched + time-based fsync
|
||||||
|
let should_fsync = writes_since_fsync >= FSYNC_BATCH_SIZE
|
||||||
|
|| last_fsync.elapsed().as_secs() >= FSYNC_INTERVAL_SECS;
|
||||||
|
|
||||||
|
if should_fsync {
|
||||||
|
if let Err(e) = log_file.flush().await {
|
||||||
|
logger::log_error(&format!("Failed to flush: {}", e)).await;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if let Err(e) = log_file.sync_data().await {
|
||||||
|
logger::log_error(&format!("Failed to fsync: {}", e)).await;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
writes_since_fsync = 0;
|
||||||
|
last_fsync = std::time::Instant::now();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
LogCommand::Checkpoint => {
|
||||||
|
if let Err(e) = log_file.flush().await {
|
||||||
|
logger::log_error(&format!("Failed to flush before checkpoint: {}", e)).await;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if let Err(e) = log_file.sync_data().await {
|
||||||
|
logger::log_error(&format!("Failed to fsync before checkpoint: {}", e)).await;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
let existing_companies = existing_companies_writer_for_task.lock().await;
|
||||||
|
let companies_vec: Vec<_> = existing_companies.values().cloned().collect();
|
||||||
|
drop(existing_companies);
|
||||||
|
|
||||||
|
let temp_path = companies_path_clone.with_extension("tmp");
|
||||||
|
match tokio::fs::File::create(&temp_path).await {
|
||||||
|
Ok(mut temp_file) => {
|
||||||
|
let mut checkpoint_ok = true;
|
||||||
|
for company in &companies_vec {
|
||||||
|
if let Ok(line) = serde_json::to_string(company) {
|
||||||
|
if temp_file.write_all(line.as_bytes()).await.is_err() ||
|
||||||
|
temp_file.write_all(b"\n").await.is_err() {
|
||||||
|
checkpoint_ok = false;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if checkpoint_ok {
|
||||||
|
if temp_file.flush().await.is_ok() &&
|
||||||
|
temp_file.sync_data().await.is_ok() {
|
||||||
|
drop(temp_file);
|
||||||
|
|
||||||
|
if tokio::fs::rename(&temp_path, &companies_path_clone).await.is_ok() {
|
||||||
|
if tokio::fs::remove_file(&log_path_clone).await.is_ok() {
|
||||||
|
logger::log_info(&format!(
|
||||||
|
"✓ Checkpoint created ({} companies), log cleared",
|
||||||
|
companies_vec.len()
|
||||||
|
)).await;
|
||||||
|
|
||||||
|
if let Ok(new_log) = OpenOptions::new()
|
||||||
|
.create(true)
|
||||||
|
.append(true)
|
||||||
|
.open(&log_path_clone)
|
||||||
|
.await {
|
||||||
|
log_file = new_log;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
logger::log_error(&format!("Failed to create checkpoint temp file: {}", e)).await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
updates_since_checkpoint = 0;
|
||||||
|
}
|
||||||
|
LogCommand::Shutdown => {
|
||||||
|
logger::log_info("Writer shutting down...").await;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Periodic checkpoint trigger
|
||||||
|
if updates_since_checkpoint >= CHECKPOINT_INTERVAL {
|
||||||
|
let _ = write_tx.send(LogCommand::Checkpoint).await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Final fsync
|
||||||
|
let _ = log_file.flush().await;
|
||||||
|
let _ = log_file.sync_data().await;
|
||||||
|
|
||||||
|
logger::log_info(&format!(
|
||||||
|
"Writer finished: {} total ({} new, {} updated)",
|
||||||
|
count, new_count, updated_count
|
||||||
|
)).await;
|
||||||
|
|
||||||
|
(count, new_count, updated_count)
|
||||||
|
});
|
||||||
|
|
||||||
|
// === MAIN PROCESSING LOOP ===
|
||||||
|
let total = securities.len();
|
||||||
|
logger::log_info(&format!("Processing {} companies with concurrency limit {}", total, CONCURRENCY_LIMIT)).await;
|
||||||
|
|
||||||
|
let mut tasks = FuturesUnordered::new();
|
||||||
|
|
||||||
|
// Build initial pending list with proper filtering
|
||||||
|
let mut pending: Vec<(String, CompanyData)> = securities.iter()
|
||||||
|
.filter(|(name, info)| company_needs_processing(name, info, &existing_companies))
|
||||||
|
.map(|(name, info)| (name.clone(), info.clone()))
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
logger::log_info(&format!(
|
||||||
|
"Initial scan: {} companies need processing ({} already complete)",
|
||||||
|
pending.len(),
|
||||||
|
total - pending.len()
|
||||||
|
)).await;
|
||||||
|
|
||||||
|
let mut processed = 0;
|
||||||
|
let mut hard_reset_count = 0;
|
||||||
|
|
||||||
|
// Spawn initial batch
|
||||||
|
for _ in 0..CONCURRENCY_LIMIT.min(pending.len()) {
|
||||||
|
if let Some((name, company_info)) = pending.pop() {
|
||||||
|
let current_pool = {
|
||||||
|
let pool_guard = pool_mutex.lock().await;
|
||||||
|
Arc::clone(&*pool_guard)
|
||||||
|
};
|
||||||
|
|
||||||
|
let existing = existing_companies.get(&name).cloned();
|
||||||
|
let shutdown_flag_clone = Arc::clone(shutdown_flag);
|
||||||
|
|
||||||
|
let task = tokio::spawn(async move {
|
||||||
|
process_single_company_validated(
|
||||||
|
name,
|
||||||
|
company_info,
|
||||||
|
existing,
|
||||||
|
¤t_pool,
|
||||||
|
&shutdown_flag_clone,
|
||||||
|
).await
|
||||||
|
});
|
||||||
|
|
||||||
|
tasks.push(task);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Process results and spawn new tasks
|
||||||
|
while let Some(task_result) = tasks.next().await {
|
||||||
|
// Check for shutdown
|
||||||
|
if shutdown_flag.load(Ordering::SeqCst) {
|
||||||
|
logger::log_warn("Shutdown signal received, stopping processing").await;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
match task_result {
|
||||||
|
Ok(Ok(Some(result))) => {
|
||||||
|
// Success: send to writer
|
||||||
|
let _ = write_tx_for_writer.send(LogCommand::Write(result.company)).await;
|
||||||
|
processed += 1;
|
||||||
|
|
||||||
|
// Log progress every 100 companies
|
||||||
|
if processed % 100 == 0 {
|
||||||
|
logger::log_info(&format!(
|
||||||
|
"Progress: {}/{} companies processed ({} resets)",
|
||||||
|
processed,
|
||||||
|
total,
|
||||||
|
hard_reset_count
|
||||||
|
)).await;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Spawn next task if available
|
||||||
|
if let Some((name, company_info)) = pending.pop() {
|
||||||
|
let current_pool = {
|
||||||
|
let pool_guard = pool_mutex.lock().await;
|
||||||
|
Arc::clone(&*pool_guard)
|
||||||
|
};
|
||||||
|
|
||||||
|
let existing = existing_companies.get(&name).cloned();
|
||||||
|
let shutdown_flag_clone = Arc::clone(shutdown_flag);
|
||||||
|
|
||||||
|
let task = tokio::spawn(async move {
|
||||||
|
process_single_company_validated(
|
||||||
|
name,
|
||||||
|
company_info,
|
||||||
|
existing,
|
||||||
|
¤t_pool,
|
||||||
|
&shutdown_flag_clone,
|
||||||
|
).await
|
||||||
|
});
|
||||||
|
|
||||||
|
tasks.push(task);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(Ok(None)) => {
|
||||||
|
// No result (shutdown or skip)
|
||||||
|
processed += 1;
|
||||||
|
|
||||||
|
if let Some((name, company_info)) = pending.pop() {
|
||||||
|
let current_pool = {
|
||||||
|
let pool_guard = pool_mutex.lock().await;
|
||||||
|
Arc::clone(&*pool_guard)
|
||||||
|
};
|
||||||
|
|
||||||
|
let existing = existing_companies.get(&name).cloned();
|
||||||
|
let shutdown_flag_clone = Arc::clone(shutdown_flag);
|
||||||
|
|
||||||
|
let task = tokio::spawn(async move {
|
||||||
|
process_single_company_validated(
|
||||||
|
name,
|
||||||
|
company_info,
|
||||||
|
existing,
|
||||||
|
¤t_pool,
|
||||||
|
&shutdown_flag_clone,
|
||||||
|
).await
|
||||||
|
});
|
||||||
|
|
||||||
|
tasks.push(task);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(Err(e)) => {
|
||||||
|
let error_msg = e.to_string();
|
||||||
|
|
||||||
|
if error_msg.contains("HARD_RESET_REQUIRED") {
|
||||||
|
// Check if reset already in progress (race condition protection)
|
||||||
|
let mut reset_lock = reset_in_progress.lock().await;
|
||||||
|
if *reset_lock {
|
||||||
|
logger::log_info("Hard reset already in progress, skipping duplicate").await;
|
||||||
|
processed += 1;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
*reset_lock = true;
|
||||||
|
drop(reset_lock); // Release lock during reset
|
||||||
|
|
||||||
|
logger::log_error("🔴 HARD RESET THRESHOLD REACHED - INITIATING RESET SEQUENCE").await;
|
||||||
|
logger::log_warn("Draining active tasks before hard reset...").await;
|
||||||
|
|
||||||
|
// Save remaining pending count
|
||||||
|
let remaining_count = pending.len();
|
||||||
|
|
||||||
|
// Stop spawning new tasks
|
||||||
|
pending.clear();
|
||||||
|
|
||||||
|
// Wait for all active tasks to complete
|
||||||
|
let mut drained = 0;
|
||||||
|
while let Some(_) = tasks.next().await {
|
||||||
|
drained += 1;
|
||||||
|
if drained % 10 == 0 {
|
||||||
|
logger::log_info(&format!("Drained {} tasks...", drained)).await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
logger::log_info(&format!(
|
||||||
|
"All tasks drained ({} active). {} companies need reprocessing.",
|
||||||
|
drained,
|
||||||
|
remaining_count
|
||||||
|
)).await;
|
||||||
|
|
||||||
|
// Perform the actual hard reset
|
||||||
|
match perform_hard_reset(&pool_mutex, config, paths, monitoring, shutdown_flag).await {
|
||||||
|
Ok(()) => {
|
||||||
|
logger::log_info("✅ Hard reset completed successfully").await;
|
||||||
|
hard_reset_count += 1;
|
||||||
|
|
||||||
|
// Reset the error counter
|
||||||
|
{
|
||||||
|
let pool_guard = pool_mutex.lock().await;
|
||||||
|
let current_pool = Arc::clone(&*pool_guard);
|
||||||
|
current_pool.get_reset_controller().reset();
|
||||||
|
}
|
||||||
|
logger::log_info("✓ Error counter cleared").await;
|
||||||
|
|
||||||
|
// Rebuild pending list by checking which companies need processing
|
||||||
|
logger::log_info("Rebuilding pending queue with proper Yahoo data checks...").await;
|
||||||
|
|
||||||
|
// Get current state of written companies
|
||||||
|
let current_existing = {
|
||||||
|
let companies = existing_companies_writer.lock().await;
|
||||||
|
companies.clone()
|
||||||
|
};
|
||||||
|
|
||||||
|
// Reload all securities from disk (checkpoint + log)
|
||||||
|
logger::log_info("Reloading securities from JSONL...").await;
|
||||||
|
let all_securities = load_securities_from_jsonl(&securities_checkpoint, &securities_log).await?;
|
||||||
|
logger::log_info(&format!("Reloaded {} companies", all_securities.len())).await;
|
||||||
|
|
||||||
|
// Build pending list: only companies that need processing
|
||||||
|
pending = all_securities.iter()
|
||||||
|
.filter(|(name, info)| company_needs_processing(name, info, ¤t_existing))
|
||||||
|
.map(|(name, info)| (name.clone(), info.clone()))
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
logger::log_info(&format!(
|
||||||
|
"Restarting with {} remaining companies (out of {} total)",
|
||||||
|
pending.len(),
|
||||||
|
total
|
||||||
|
)).await;
|
||||||
|
|
||||||
|
// Only continue if there's work to do
|
||||||
|
if pending.is_empty() {
|
||||||
|
logger::log_info("All companies have complete data, exiting").await;
|
||||||
|
|
||||||
|
// Clear reset flag
|
||||||
|
let mut reset_lock = reset_in_progress.lock().await;
|
||||||
|
*reset_lock = false;
|
||||||
|
drop(reset_lock);
|
||||||
|
|
||||||
|
break; // Exit main loop
|
||||||
|
}
|
||||||
|
|
||||||
|
// Respawn initial batch with NEW pool
|
||||||
|
for _ in 0..CONCURRENCY_LIMIT.min(pending.len()) {
|
||||||
|
if let Some((name, company_info)) = pending.pop() {
|
||||||
|
let current_pool = {
|
||||||
|
let pool_guard = pool_mutex.lock().await;
|
||||||
|
Arc::clone(&*pool_guard)
|
||||||
|
};
|
||||||
|
|
||||||
|
let existing = existing_companies.get(&name).cloned();
|
||||||
|
let shutdown_flag_clone = Arc::clone(shutdown_flag);
|
||||||
|
|
||||||
|
let task = tokio::spawn(async move {
|
||||||
|
process_single_company_validated(
|
||||||
|
name,
|
||||||
|
company_info,
|
||||||
|
existing,
|
||||||
|
¤t_pool,
|
||||||
|
&shutdown_flag_clone,
|
||||||
|
).await
|
||||||
|
});
|
||||||
|
|
||||||
|
tasks.push(task);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Clear reset flag
|
||||||
|
let mut reset_lock = reset_in_progress.lock().await;
|
||||||
|
*reset_lock = false;
|
||||||
|
drop(reset_lock);
|
||||||
|
|
||||||
|
// ✅ Continue processing (don't spawn duplicate task)
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
Err(reset_err) => {
|
||||||
|
logger::log_error(&format!("Hard reset failed: {}", reset_err)).await;
|
||||||
|
|
||||||
|
// Clear reset flag
|
||||||
|
let mut reset_lock = reset_in_progress.lock().await;
|
||||||
|
*reset_lock = false;
|
||||||
|
drop(reset_lock);
|
||||||
|
|
||||||
|
// Exit if hard reset fails
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// Regular error
|
||||||
|
logger::log_warn(&format!("Company processing error: {}", error_msg)).await;
|
||||||
|
processed += 1;
|
||||||
|
|
||||||
|
// Spawn next task
|
||||||
|
if let Some((name, company_info)) = pending.pop() {
|
||||||
|
let current_pool = {
|
||||||
|
let pool_guard = pool_mutex.lock().await;
|
||||||
|
Arc::clone(&*pool_guard)
|
||||||
|
};
|
||||||
|
|
||||||
|
let existing = existing_companies.get(&name).cloned();
|
||||||
|
let shutdown_flag_clone = Arc::clone(shutdown_flag);
|
||||||
|
|
||||||
|
let task = tokio::spawn(async move {
|
||||||
|
process_single_company_validated(
|
||||||
|
name,
|
||||||
|
company_info,
|
||||||
|
existing,
|
||||||
|
¤t_pool,
|
||||||
|
&shutdown_flag_clone,
|
||||||
|
).await
|
||||||
|
});
|
||||||
|
|
||||||
|
tasks.push(task);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
// Task panic
|
||||||
|
logger::log_error(&format!("Task panic: {}", e)).await;
|
||||||
|
processed += 1;
|
||||||
|
|
||||||
|
// Spawn next task
|
||||||
|
if let Some((name, company_info)) = pending.pop() {
|
||||||
|
let current_pool = {
|
||||||
|
let pool_guard = pool_mutex.lock().await;
|
||||||
|
Arc::clone(&*pool_guard)
|
||||||
|
};
|
||||||
|
|
||||||
|
let existing = existing_companies.get(&name).cloned();
|
||||||
|
let shutdown_flag_clone = Arc::clone(shutdown_flag);
|
||||||
|
|
||||||
|
let task = tokio::spawn(async move {
|
||||||
|
process_single_company_validated(
|
||||||
|
name,
|
||||||
|
company_info,
|
||||||
|
existing,
|
||||||
|
¤t_pool,
|
||||||
|
&shutdown_flag_clone,
|
||||||
|
).await
|
||||||
|
});
|
||||||
|
|
||||||
|
tasks.push(task);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
logger::log_info("Main processing loop completed").await;
|
||||||
|
|
||||||
|
// Signal writer to finish
|
||||||
|
let _ = write_tx_for_writer.send(LogCommand::Checkpoint).await;
|
||||||
|
let _ = write_tx_for_writer.send(LogCommand::Shutdown).await;
|
||||||
|
drop(write_tx_for_writer);
|
||||||
|
|
||||||
|
// Wait for writer to finish
|
||||||
|
let (final_count, final_new, final_updated) = writer_task.await
|
||||||
|
.unwrap_or((0, 0, 0));
|
||||||
|
|
||||||
|
logger::log_info(&format!(
|
||||||
|
"✅ Completed: {} total companies ({} new, {} updated, {} hard resets)",
|
||||||
|
final_count, final_new, final_updated, hard_reset_count
|
||||||
|
)).await;
|
||||||
|
|
||||||
|
// Track completion with:
|
||||||
|
// - Content reference: All output JSONL files
|
||||||
|
// - Data stage: Data (7-day TTL) - Securities data relatively stable
|
||||||
|
// - Dependencies: LEI-FIGI mapping must be valid
|
||||||
|
|
||||||
|
// Check for shutdown BEFORE marking complete
|
||||||
|
if shutdown_flag.load(Ordering::SeqCst) {
|
||||||
|
logger::log_warn("Shutdown detected during company update - marking as invalid for retry").await;
|
||||||
|
manager.mark_invalid(
|
||||||
|
entry,
|
||||||
|
format!("Invalid: processed {} of {} companies before shutdown", final_count, total),
|
||||||
|
).await?;
|
||||||
|
} else {
|
||||||
|
// Only mark complete if we got here without shutdown
|
||||||
|
manager.mark_valid(entry).await?;
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(final_count)
|
||||||
|
}
|
||||||
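The writer task above batches durability: it flushes and fsyncs the log only after `FSYNC_BATCH_SIZE` writes or `FSYNC_INTERVAL_SECS` seconds, whichever comes first, and periodically rewrites a checkpoint. The fsync policy in isolation looks roughly like this synchronous sketch with `std::fs` (the real code uses the tokio writer shown above):

```rust
use std::fs::File;
use std::io::Write;
use std::time::{Duration, Instant};

struct BatchedLog {
    file: File,
    writes_since_sync: usize,
    last_sync: Instant,
}

impl BatchedLog {
    const BATCH_SIZE: usize = 10;
    const MAX_INTERVAL: Duration = Duration::from_secs(10);

    fn append(&mut self, line: &str) -> std::io::Result<()> {
        self.file.write_all(line.as_bytes())?;
        self.file.write_all(b"\n")?;
        self.writes_since_sync += 1;

        // Durability is amortized: sync after a batch of writes or after a time budget.
        if self.writes_since_sync >= Self::BATCH_SIZE || self.last_sync.elapsed() >= Self::MAX_INTERVAL {
            self.file.flush()?;
            self.file.sync_data()?;
            self.writes_since_sync = 0;
            self.last_sync = Instant::now();
        }
        Ok(())
    }
}
```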
|
|
||||||
|
/// Loads CompanyInfo securities from checkpoint and log JSONL files
|
||||||
|
async fn load_securities_from_jsonl(
|
||||||
|
checkpoint_path: &std::path::Path,
|
||||||
|
log_path: &std::path::Path,
|
||||||
|
) -> anyhow::Result<HashMap<String, CompanyData>> {
|
||||||
|
let mut securities: HashMap<String, CompanyData> = HashMap::new();
|
||||||
|
|
||||||
|
// Load checkpoint
|
||||||
|
if checkpoint_path.exists() {
|
||||||
|
let content = tokio::fs::read_to_string(checkpoint_path).await?;
|
||||||
|
|
||||||
|
for (line_num, line) in content.lines().enumerate() {
|
||||||
|
if line.trim().is_empty() || !line.ends_with('}') {
|
||||||
|
continue; // Skip incomplete lines
|
||||||
|
}
|
||||||
|
|
||||||
|
match serde_json::from_str::<CompanyData>(line) {
|
||||||
|
Ok(company_info) => {
|
||||||
|
securities.insert(company_info.name.clone(), company_info);
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
logger::log_warn(&format!(
|
||||||
|
"Skipping invalid line {} in checkpoint: {}",
|
||||||
|
line_num + 1, e
|
||||||
|
)).await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Replay log (overwrites checkpoint entries if they exist)
|
||||||
|
if log_path.exists() {
|
||||||
|
let content = tokio::fs::read_to_string(log_path).await?;
|
||||||
|
|
||||||
|
for (line_num, line) in content.lines().enumerate() {
|
||||||
|
if line.trim().is_empty() || !line.ends_with('}') {
|
||||||
|
continue; // Skip incomplete lines
|
||||||
|
}
|
||||||
|
|
||||||
|
match serde_json::from_str::<CompanyData>(line) {
|
||||||
|
Ok(company_info) => {
|
||||||
|
securities.insert(company_info.name.clone(), company_info);
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
logger::log_warn(&format!(
|
||||||
|
"Skipping invalid line {} in log: {}",
|
||||||
|
line_num + 1, e
|
||||||
|
)).await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(securities)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Scrape with retry, validation, and shutdown awareness
|
||||||
|
async fn scrape_with_retry(
|
||||||
|
pool: &Arc<ChromeDriverPool>,
|
||||||
|
isin: &str,
|
||||||
|
max_retries: u32,
|
||||||
|
shutdown_flag: &Arc<AtomicBool>,
|
||||||
|
) -> Result<Option<YahooCompanyData>> {
|
||||||
|
let mut retries = 0;
|
||||||
|
|
||||||
|
loop {
|
||||||
|
// Check shutdown before each attempt
|
||||||
|
if shutdown_flag.load(Ordering::SeqCst) {
|
||||||
|
return Err(anyhow!("Aborted due to shutdown"));
|
||||||
|
}
|
||||||
|
|
||||||
|
if pool.should_perform_hard_reset() {
|
||||||
|
logger::log_error("HARD_RESET_REQUIRED detected before scrape attempt").await;
|
||||||
|
return Err(anyhow!("HARD_RESET_REQUIRED"));
|
||||||
|
}
|
||||||
|
|
||||||
|
match scrape_company_details_by_isin(pool, isin, shutdown_flag).await {
|
||||||
|
Ok(result) => return Ok(result),
|
||||||
|
Err(e) => {
|
||||||
|
// Check if this is a hard reset required error
|
||||||
|
let error_msg = e.to_string();
|
||||||
|
if error_msg.contains("HARD_RESET_REQUIRED") {
|
||||||
|
logger::log_error(&format!(
|
||||||
|
"Hard reset required error for ISIN {}, propagating immediately",
|
||||||
|
isin
|
||||||
|
)).await;
|
||||||
|
return Err(e); // Propagate immediately, don't retry
|
||||||
|
}
|
||||||
|
|
||||||
|
if retries >= max_retries {
|
||||||
|
logger::log_error(&format!(
|
||||||
|
"All {} retries exhausted for ISIN {}: {}",
|
||||||
|
max_retries, isin, e
|
||||||
|
)).await;
|
||||||
|
return Err(e);
|
||||||
|
}
|
||||||
|
|
||||||
|
let backoff_ms = 1000 * 2u64.pow(retries);
|
||||||
|
let jitter_ms = random_range(0, 500);
|
||||||
|
let total_delay = backoff_ms + jitter_ms;
|
||||||
|
|
||||||
|
logger::log_warn(&format!(
|
||||||
|
"Retry {}/{} for ISIN {} after {}ms: {}",
|
||||||
|
retries + 1, max_retries, isin, total_delay, e
|
||||||
|
)).await;
|
||||||
|
|
||||||
|
sleep(Duration::from_millis(total_delay)).await;
|
||||||
|
retries += 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
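`scrape_with_retry` above backs off exponentially (1s, 2s, 4s, ...) and adds up to 500 ms of jitter via the crate's `random_range` helper. The delay schedule on its own, using a simple time-seeded jitter in place of that helper (illustrative only):

```rust
use std::time::{Duration, SystemTime, UNIX_EPOCH};

/// Delay before retry attempt `retries` (0-based): 1s * 2^retries plus 0..500 ms of jitter.
fn backoff_delay(retries: u32) -> Duration {
    let backoff_ms = 1000u64 * 2u64.pow(retries);
    // Cheap jitter source for the sketch; the real code uses the crate's random_range helper.
    let nanos = SystemTime::now().duration_since(UNIX_EPOCH).unwrap_or_default().subsec_nanos() as u64;
    let jitter_ms = nanos % 500;
    Duration::from_millis(backoff_ms + jitter_ms)
}

fn main() {
    for attempt in 0..3 {
        println!("attempt {} -> wait {:?}", attempt, backoff_delay(attempt));
    }
}
```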
|
|
||||||
|
/// Process single company with validation and shutdown checks
|
||||||
|
async fn process_single_company_validated(
|
||||||
|
name: String,
|
||||||
|
company_info: CompanyData,
|
||||||
|
existing_entry: Option<CompanyData>,
|
||||||
|
pool: &Arc<ChromeDriverPool>,
|
||||||
|
shutdown_flag: &Arc<AtomicBool>,
|
||||||
|
) -> anyhow::Result<Option<CompanyProcessResult>> {
|
||||||
|
// Check shutdown at start
|
||||||
|
if shutdown_flag.load(Ordering::SeqCst) {
|
||||||
|
logger::log_warn(&format!("Shutdown detected, skipping company: {}", name)).await;
|
||||||
|
return Ok(None);
|
||||||
|
}
|
||||||
|
|
||||||
|
let is_update = existing_entry.is_some();
|
||||||
|
|
||||||
|
let mut isin_tickers_map: HashMap<String, Vec<String>> =
|
||||||
|
existing_entry
|
||||||
|
.as_ref()
|
||||||
|
.and_then(|e| e.isin_tickers_map.clone())
|
||||||
|
.unwrap_or_default();
|
||||||
|
|
||||||
|
// Collect unique ISIN-ticker pairs
|
||||||
|
let mut unique_isin_ticker_pairs: HashMap<String, Vec<String>> = HashMap::new();
|
||||||
|
|
||||||
|
for figi_infos in company_info.securities.values() {
|
||||||
|
for figi_info in figi_infos {
|
||||||
|
if !figi_info.isin.is_empty() {
|
||||||
|
let tickers = unique_isin_ticker_pairs
|
||||||
|
.entry(figi_info.isin.clone())
|
||||||
|
.or_insert_with(Vec::new);
|
||||||
|
|
||||||
|
if !figi_info.ticker.is_empty() && !tickers.contains(&figi_info.ticker) {
|
||||||
|
tickers.push(figi_info.ticker.clone());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Process each ISIN independently with per-ISIN status checking
|
||||||
|
for (isin, figi_tickers) in unique_isin_ticker_pairs {
|
||||||
|
// Check shutdown before each ISIN
|
||||||
|
if shutdown_flag.load(Ordering::SeqCst) {
|
||||||
|
logger::log_warn(&format!(
|
||||||
|
"Shutdown detected while processing company: {}",
|
||||||
|
name
|
||||||
|
)).await;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
let tickers = isin_tickers_map
|
||||||
|
.entry(isin.clone())
|
||||||
|
.or_insert_with(Vec::new);
|
||||||
|
|
||||||
|
for figi_ticker in figi_tickers {
|
||||||
|
if !tickers.contains(&figi_ticker) {
|
||||||
|
tickers.push(figi_ticker);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check if THIS SPECIFIC ISIN has valid Yahoo data (not ERROR)
|
||||||
|
let has_valid_yahoo = tickers.iter().any(|t| {
|
||||||
|
t.starts_with("YAHOO:") && t != "YAHOO:ERROR"
|
||||||
|
// Note: YAHOO:NO_RESULTS is valid (legitimately not found)
|
||||||
|
});
|
||||||
|
|
||||||
|
if !has_valid_yahoo {
|
||||||
|
logger::log_info(&format!("Fetching Yahoo details for {} (ISIN: {})", name, isin)).await;
|
||||||
|
tickers.retain(|t| !t.starts_with("YAHOO:"));
|
||||||
|
|
||||||
|
match scrape_with_retry(pool, &isin, 3, shutdown_flag).await {
|
||||||
|
Ok(Some(details)) => {
|
||||||
|
logger::log_info(&format!(
|
||||||
|
"✓ Found Yahoo ticker {} for ISIN {} (company: {})",
|
||||||
|
details.ticker, isin, name
|
||||||
|
)).await;
|
||||||
|
|
||||||
|
tickers.push(format!("YAHOO:{}", details.ticker));
|
||||||
|
},
|
||||||
|
Ok(None) => {
|
||||||
|
logger::log_warn(&format!("◯ No search results for ISIN {} (company: {})", isin, name)).await;
|
||||||
|
tickers.push("YAHOO:NO_RESULTS".to_string());
|
||||||
|
},
|
||||||
|
Err(e) => {
|
||||||
|
if shutdown_flag.load(Ordering::SeqCst) {
|
||||||
|
logger::log_warn(&format!("Shutdown during scrape for ISIN {}", isin)).await;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check if this is a hard reset required error
|
||||||
|
let error_msg = e.to_string();
|
||||||
|
if error_msg.contains("HARD_RESET_REQUIRED") {
|
||||||
|
logger::log_error(&format!(
|
||||||
|
"Hard reset required during ISIN {} processing, propagating error",
|
||||||
|
isin
|
||||||
|
)).await;
|
||||||
|
return Err(e); // ← CRITICAL: Propagate immediately
|
||||||
|
}
|
||||||
|
|
||||||
|
logger::log_warn(&format!(
|
||||||
|
"✗ Yahoo lookup error for ISIN {} (company: {}): {}",
|
||||||
|
isin, name, e
|
||||||
|
)).await;
|
||||||
|
|
||||||
|
// Mark this ISIN as failed to enable retry
|
||||||
|
tickers.push("YAHOO:ERROR".to_string());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Final shutdown check before returning result
|
||||||
|
if shutdown_flag.load(Ordering::SeqCst) {
|
||||||
|
logger::log_warn(&format!(
|
||||||
|
"Shutdown detected, discarding incomplete result for: {}",
|
||||||
|
name
|
||||||
|
)).await;
|
||||||
|
return Ok(None);
|
||||||
|
}
|
||||||
|
|
||||||
|
if pool.should_perform_hard_reset() {
|
||||||
|
logger::log_error("HARD_RESET_REQUIRED detected during company processing").await;
|
||||||
|
return Err(anyhow!("HARD_RESET_REQUIRED"));
|
||||||
|
}
|
||||||
|
|
||||||
|
if !isin_tickers_map.is_empty() {
|
||||||
|
let company_entry = CompanyData {
|
||||||
|
name: name.clone(),
|
||||||
|
primary_isin: company_info.primary_isin.clone(),
|
||||||
|
securities: company_info.securities.clone(),
|
||||||
|
yahoo_company_data: company_info.yahoo_company_data.clone(),
|
||||||
|
isin_tickers_map: Some(isin_tickers_map),
|
||||||
|
};
|
||||||
|
|
||||||
|
Ok(Some(CompanyProcessResult {
|
||||||
|
company: company_entry,
|
||||||
|
is_update,
|
||||||
|
}))
|
||||||
|
} else {
|
||||||
|
logger::log_warn(&format!("No ISINs found for company: {}", name)).await;
|
||||||
|
Ok(None)
|
||||||
|
}
|
||||||
|
}
|
||||||
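Both update passes encode the per-ISIN lookup status inline in the ticker list: `YAHOO:<symbol>` for a resolved symbol, `YAHOO:NO_RESULTS` for a legitimate miss, and `YAHOO:ERROR` for a transient failure that should be retried. A small helper expressing that convention (a sketch; the code above matches on the strings directly):

```rust
#[derive(Debug, PartialEq)]
enum YahooStatus<'a> {
    Resolved(&'a str), // "YAHOO:AAPL" -> Resolved("AAPL")
    NoResults,         // "YAHOO:NO_RESULTS" - valid terminal state, no retry needed
    Error,             // "YAHOO:ERROR" - transient, eligible for reprocessing
    NotAYahooMarker,
}

fn classify(ticker: &str) -> YahooStatus<'_> {
    match ticker {
        "YAHOO:NO_RESULTS" => YahooStatus::NoResults,
        "YAHOO:ERROR" => YahooStatus::Error,
        t if t.starts_with("YAHOO:") => YahooStatus::Resolved(&t["YAHOO:".len()..]),
        _ => YahooStatus::NotAYahooMarker,
    }
}

fn main() {
    assert_eq!(classify("YAHOO:AAPL"), YahooStatus::Resolved("AAPL"));
    assert_eq!(classify("YAHOO:ERROR"), YahooStatus::Error);
    assert_eq!(classify("US0378331005"), YahooStatus::NotAYahooMarker);
}
```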
911 src/corporate/update_companies_cleanse.rs Normal file
@@ -0,0 +1,911 @@
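The cleansing passes in this file, like `update_companies` above, are gated through the project's `StateManager`: if the step is still marked valid the cached output is reused, otherwise an entry is created, the work runs, and the entry is marked valid (or invalid on shutdown). Schematically, reusing the calls as they appear in this diff (their exact signatures belong to this crate's `util::integrity` module, so treat this as a sketch):

```rust
// Sketch of the validity gate used by the steps below (async context assumed).
async fn gated_step(paths: &DataPaths, output: &std::path::Path) -> anyhow::Result<usize> {
    let manager = StateManager::new(paths.integrity_dir()).await?;
    let reference = file_reference(output);

    // Already built and still within its TTL: reuse the cached output.
    if manager.is_step_valid("example_step").await? {
        return Ok(0);
    }

    let entry = manager.create_entry("example_step".to_string(), reference, DataStage::Data).await?;
    let produced = 0; // ... do the actual work and count results here ...
    manager.mark_valid(entry).await?;
    Ok(produced)
}
```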
|
// src/corporate/update_companies_cleanse.rs
|
||||||
|
use super::{helpers::*, types::*};
|
||||||
|
use crate::config::Config;
|
||||||
|
use crate::corporate::checkpoint_helpers;
|
||||||
|
use crate::util::directories::DataPaths;
|
||||||
|
use crate::util::integrity::{DataStage, StateManager, file_reference};
|
||||||
|
use crate::util::logger;
|
||||||
|
use crate::scraper::yahoo::{YahooClientPool, QuoteSummaryModule};
|
||||||
|
|
||||||
|
use std::result::Result::Ok;
|
||||||
|
use chrono::{Utc};
|
||||||
|
use std::collections::HashMap;
|
||||||
|
use std::sync::Arc;
|
||||||
|
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
|
||||||
|
use tokio::fs::{File, OpenOptions};
|
||||||
|
use tokio::io::{AsyncBufReadExt, AsyncWriteExt, BufReader};
|
||||||
|
use futures::stream::{FuturesUnordered, StreamExt};
|
||||||
|
use tokio::sync::mpsc;
|
||||||
|
|
||||||
|
/// Result of processing a single company
|
||||||
|
#[derive(Debug, Clone)]
|
||||||
|
pub enum CompanyProcessResult {
|
||||||
|
Valid(CompanyData),
|
||||||
|
FilteredLowCap { name: String, market_cap: f64 },
|
||||||
|
FilteredNoPrice { name: String },
|
||||||
|
Failed { company: CompanyData, error: String, is_transient: bool },
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Represents a write command to be serialized through the log writer
|
||||||
|
enum LogCommand {
|
||||||
|
Write(CompanyData),
|
||||||
|
Checkpoint,
|
||||||
|
Shutdown,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/// Cleansing function to remove companies with missing essential yahoo data for integrity
|
||||||
|
pub async fn companies_yahoo_cleansed_no_data(paths: &DataPaths) -> Result<usize, anyhow::Error> {
|
||||||
|
let data_path = paths.data_dir();
|
||||||
|
|
||||||
|
let input_path = data_path.join("companies.jsonl");
|
||||||
|
let output_path = data_path.join("companies_yahoo.jsonl");
|
||||||
|
|
||||||
|
if !input_path.exists() {
|
||||||
|
logger::log_warn("companies.jsonl not found, skipping cleansing").await;
|
||||||
|
return Ok(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
let manager = StateManager::new(paths.integrity_dir()).await?;
|
||||||
|
let step_name = "yahoo_companies_cleansed_no_data";
|
||||||
|
let content_reference = file_reference(&output_path);
|
||||||
|
|
||||||
|
if manager.is_step_valid(step_name).await? {
|
||||||
|
let output_content = tokio::fs::read_to_string(&output_path).await?;
|
||||||
|
let count = output_content.lines()
|
||||||
|
.filter(|line| !line.trim().is_empty())
|
||||||
|
.count();
|
||||||
|
|
||||||
|
logger::log_info(&format!(" ✓ Found {} companies in companies_yahoo.jsonl", count)).await;
|
||||||
|
return Ok(count);
|
||||||
|
}
|
||||||
|
let entry = manager.create_entry(
|
||||||
|
step_name.to_string(),
|
||||||
|
content_reference.clone(),
|
||||||
|
DataStage::Data,
|
||||||
|
).await?;
|
||||||
|
|
||||||
|
logger::log_info(" Cleansing companies with missing Yahoo data...").await;
|
||||||
|
|
||||||
|
logger::log_info(&format!(" Reading from: {:?}", input_path)).await;
|
||||||
|
logger::log_info(&format!(" Writing to: {:?}", output_path)).await;
|
||||||
|
|
||||||
|
let file = File::open(&input_path).await?;
|
||||||
|
let reader = BufReader::new(file);
|
||||||
|
let mut lines = reader.lines();
|
||||||
|
|
||||||
|
let mut output_file = File::create(&output_path).await?;
|
||||||
|
let mut valid_count = 0;
|
||||||
|
let mut removed_count = 0;
|
||||||
|
let mut total_count = 0;
|
||||||
|
|
||||||
|
while let Some(line) = lines.next_line().await? {
|
||||||
|
if line.trim().is_empty() {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
total_count += 1;
|
||||||
|
|
||||||
|
let company: CompanyData = match serde_json::from_str(&line) {
|
||||||
|
Ok(c) => c,
|
||||||
|
Err(e) => {
|
||||||
|
logger::log_warn(&format!(" Failed to parse company on line {}: {}", total_count, e)).await;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
let has_valid_yahoo = company.isin_tickers_map
|
||||||
|
.as_ref()
|
||||||
|
.map(|map| {
|
||||||
|
map.values()
|
||||||
|
.flatten()
|
||||||
|
.any(|ticker| {
|
||||||
|
ticker.starts_with("YAHOO:")
|
||||||
|
&& ticker != "YAHOO:NO_RESULTS"
|
||||||
|
&& ticker != "YAHOO:ERROR"
|
||||||
|
})
|
||||||
|
})
|
||||||
|
.unwrap_or(false);
|
||||||
|
|
||||||
|
if has_valid_yahoo {
|
||||||
|
let json_line = serde_json::to_string(&company)?;
|
||||||
|
output_file.write_all(json_line.as_bytes()).await?;
|
||||||
|
output_file.write_all(b"\n").await?;
|
||||||
|
valid_count += 1;
|
||||||
|
} else {
|
||||||
|
removed_count += 1;
|
||||||
|
if removed_count <= 5 {
|
||||||
|
logger::log_info(&format!(" Removed company '{}' (no valid Yahoo ticker)", company.name)).await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if total_count % 1000 == 0 {
|
||||||
|
logger::log_info(&format!(" Processed {} companies...", total_count)).await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
output_file.flush().await?;
|
||||||
|
|
||||||
|
logger::log_info(&format!(
|
||||||
|
" ✓ Cleansing complete: {} total → {} valid, {} removed",
|
||||||
|
total_count, valid_count, removed_count
|
||||||
|
)).await;
|
||||||
|
|
||||||
|
// Track completion with:
|
||||||
|
// - Content reference: All event directories
|
||||||
|
// - Data stage: Data (7-day TTL by default)
|
||||||
|
// - Dependencies: Depends on cleaned companies data
|
||||||
|
manager.mark_valid(entry).await?;
|
||||||
|
|
||||||
|
Ok(valid_count)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Yahoo Low Profile Cleansing WITH ABORT-SAFE INCREMENTAL PERSISTENCE
///
/// # Features
/// - Graceful shutdown (abort-safe)
/// - Task panic isolation (tasks fail independently)
/// - Crash-safe persistence (checkpoint + log with fsync)
/// - Smart skip logic (only process incomplete data)
/// - Uses a pending queue instead of a retry mechanism
/// - Reuses companies_updates.log for persistence
///
/// # Persistence Strategy
/// - Checkpoint: companies_yahoo_cleaned.jsonl (atomic state)
/// - Log: companies_updates.log (append-only updates)
/// - On restart: load the checkpoint, then replay the log
/// - Periodic checkpoints (every 50 companies)
/// - Batched fsync (every 10 writes or 10 seconds)
pub async fn companies_yahoo_cleansed_low_profile(
    paths: &DataPaths,
    _config: &Config,
    yahoo_pool: Arc<YahooClientPool>,
    shutdown_flag: &Arc<AtomicBool>,
) -> anyhow::Result<usize> {
    // Configuration constants
    const CHECKPOINT_INTERVAL: usize = 50;
    const FSYNC_BATCH_SIZE: usize = 10;
    const FSYNC_INTERVAL_SECS: u64 = 10;
    const CONCURRENCY_LIMIT: usize = 50; // Limit parallel validation tasks

    let data_path = paths.data_dir();

    // File paths (reusing companies_updates.log)
    let input_path = data_path.join("companies_yahoo.jsonl");
    let checkpoint_path = data_path.join("companies_yahoo_cleaned.jsonl");
    let log_path = data_path.join("companies_updates.log");

    // Check input exists
    if !input_path.exists() {
        logger::log_warn(" companies_yahoo.jsonl not found, skipping low profile cleansing").await;
        return Ok(0);
    }

let manager = StateManager::new(paths.integrity_dir()).await?;
|
||||||
|
let step_name = "yahoo_companies_cleansed_low_profile";
|
||||||
|
let content_reference = file_reference(&checkpoint_path);
|
||||||
|
|
||||||
|
if manager.is_step_valid(step_name).await? {
|
||||||
|
let checkpoint_content = tokio::fs::read_to_string(&checkpoint_path).await?;
|
||||||
|
let count = checkpoint_content.lines()
|
||||||
|
.filter(|line| !line.trim().is_empty())
|
||||||
|
.count();
|
||||||
|
|
||||||
|
logger::log_info(&format!(" ✓ Found {} companies in companies_yahoo_cleaned.jsonl", count)).await;
|
||||||
|
return Ok(count);
|
||||||
|
}
|
||||||
|
let entry = manager.create_entry(
|
||||||
|
step_name.to_string(),
|
||||||
|
content_reference.clone(),
|
||||||
|
DataStage::Data,
|
||||||
|
).await?;
|
||||||
|
|
||||||
|
logger::log_info(" Cleansing companies with low Yahoo profile...").await;
|
||||||
|
|
||||||
|
// === RECOVERY PHASE: Load checkpoint + replay log ===
|
||||||
|
let mut existing_companies: HashMap<String, CompanyData> = HashMap::new();
|
||||||
|
let mut processed_names: std::collections::HashSet<String> = std::collections::HashSet::new();
|
||||||
|
|
||||||
|
if checkpoint_path.exists() {
|
||||||
|
logger::log_info("Loading checkpoint from companies_yahoo_cleaned.jsonl...").await;
|
||||||
|
let checkpoint_content = tokio::fs::read_to_string(&checkpoint_path).await?;
|
||||||
|
|
||||||
|
for line in checkpoint_content.lines() {
|
||||||
|
if line.trim().is_empty() || !line.ends_with('}') {
|
||||||
|
continue; // Skip incomplete lines
|
||||||
|
}
|
||||||
|
|
||||||
|
match serde_json::from_str::<CompanyData>(line) {
|
||||||
|
Ok(company) => {
|
||||||
|
processed_names.insert(company.name.clone());
|
||||||
|
existing_companies.insert(company.name.clone(), company);
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
logger::log_warn(&format!("Skipping invalid checkpoint line: {}", e)).await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
logger::log_info(&format!("Loaded checkpoint with {} companies", existing_companies.len())).await;
|
||||||
|
}
|
||||||
|
|
||||||
|
if log_path.exists() {
|
||||||
|
logger::log_info("Replaying update log...").await;
|
||||||
|
let log_content = tokio::fs::read_to_string(&log_path).await?;
|
||||||
|
let mut replayed = 0;
|
||||||
|
|
||||||
|
for line in log_content.lines() {
|
||||||
|
if line.trim().is_empty() || !line.ends_with('}') {
|
||||||
|
continue; // Skip incomplete lines
|
||||||
|
}
|
||||||
|
|
||||||
|
match serde_json::from_str::<CompanyData>(line) {
|
||||||
|
Ok(company) => {
|
||||||
|
processed_names.insert(company.name.clone());
|
||||||
|
existing_companies.insert(company.name.clone(), company);
|
||||||
|
replayed += 1;
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
logger::log_warn(&format!("Skipping invalid log line: {}", e)).await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if replayed > 0 {
|
||||||
|
logger::log_info(&format!("Replayed {} updates from log", replayed)).await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// === LOAD INPUT COMPANIES ===
|
||||||
|
logger::log_info(&format!("Loading companies from: {:?}", input_path)).await;
|
||||||
|
let input_companies = load_companies_from_jsonl(&input_path).await?;
|
||||||
|
logger::log_info(&format!("Loaded {} companies from input", input_companies.len())).await;
|
||||||
|
|
||||||
|
// === BUILD PENDING LIST (smart skip logic) ===
|
||||||
|
let mut pending: Vec<CompanyData> = input_companies
|
||||||
|
.into_iter()
|
||||||
|
.filter(|company| company_needs_processing(company, &existing_companies))
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
logger::log_info(&format!(
|
||||||
|
"Initial scan: {} companies need processing ({} already complete)",
|
||||||
|
pending.len(),
|
||||||
|
existing_companies.len()
|
||||||
|
)).await;
|
||||||
|
|
||||||
|
// === CONSOLIDATE LOG BEFORE EARLY EXIT ===
|
||||||
|
if pending.is_empty() {
|
||||||
|
logger::log_info(" ✓ All companies already processed").await;
|
||||||
|
|
||||||
|
// Consolidate log into checkpoint before exiting
|
||||||
|
if checkpoint_helpers::log_has_content(&log_path).await {
|
||||||
|
checkpoint_helpers::consolidate_checkpoint(&checkpoint_path, &log_path, &existing_companies).await?;
|
||||||
|
}
|
||||||
|
|
||||||
|
return Ok(existing_companies.len());
|
||||||
|
}
|
||||||
|
|
||||||
|
// === SETUP LOG WRITER TASK ===
|
||||||
|
let (write_tx, mut write_rx) = mpsc::channel::<LogCommand>(1000);
|
||||||
|
|
||||||
|
let log_file_init = OpenOptions::new()
|
||||||
|
.create(true)
|
||||||
|
.append(true)
|
||||||
|
.open(&log_path)
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
let checkpoint_path_clone = checkpoint_path.clone();
|
||||||
|
let log_path_clone = log_path.clone();
|
||||||
|
let existing_companies_writer = Arc::new(tokio::sync::Mutex::new(existing_companies.clone()));
|
||||||
|
let existing_companies_writer_for_task = Arc::clone(&existing_companies_writer);
|
||||||
|
|
||||||
|
let write_tx_for_writer = write_tx.clone();
|
||||||
|
let writer_task = tokio::spawn(async move {
|
||||||
|
let mut log_file = log_file_init;
|
||||||
|
let mut writes_since_fsync = 0;
|
||||||
|
let mut last_fsync = std::time::Instant::now();
|
||||||
|
let mut updates_since_checkpoint = 0;
|
||||||
|
let mut count = 0;
|
||||||
|
let mut new_count = 0;
|
||||||
|
let mut updated_count = 0;
|
||||||
|
|
||||||
|
while let Some(cmd) = write_rx.recv().await {
|
||||||
|
match cmd {
|
||||||
|
LogCommand::Write(company) => {
|
||||||
|
// Write to log
|
||||||
|
let line = serde_json::to_string(&company).unwrap();
|
||||||
|
if let Err(e) = log_file.write_all(line.as_bytes()).await {
|
||||||
|
logger::log_error(&format!("Failed to write to log: {}", e)).await;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if let Err(e) = log_file.write_all(b"\n").await {
|
||||||
|
logger::log_error(&format!("Failed to write newline: {}", e)).await;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
writes_since_fsync += 1;
|
||||||
|
updates_since_checkpoint += 1;
|
||||||
|
count += 1;
|
||||||
|
|
||||||
|
// Update in-memory state
|
||||||
|
let mut existing_companies = existing_companies_writer_for_task.lock().await;
|
||||||
|
let is_update = existing_companies.contains_key(&company.name);
|
||||||
|
existing_companies.insert(company.name.clone(), company);
|
||||||
|
drop(existing_companies);
|
||||||
|
|
||||||
|
if is_update {
|
||||||
|
updated_count += 1;
|
||||||
|
} else {
|
||||||
|
new_count += 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Batched + time-based fsync
|
||||||
|
let should_fsync = writes_since_fsync >= FSYNC_BATCH_SIZE
|
||||||
|
|| last_fsync.elapsed().as_secs() >= FSYNC_INTERVAL_SECS;
|
||||||
|
|
||||||
|
if should_fsync {
|
||||||
|
if let Err(e) = log_file.flush().await {
|
||||||
|
logger::log_error(&format!("Failed to flush: {}", e)).await;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if let Err(e) = log_file.sync_data().await {
|
||||||
|
logger::log_error(&format!("Failed to fsync: {}", e)).await;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
writes_since_fsync = 0;
|
||||||
|
last_fsync = std::time::Instant::now();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
LogCommand::Checkpoint => {
|
||||||
|
if let Err(e) = log_file.flush().await {
|
||||||
|
logger::log_error(&format!("Failed to flush before checkpoint: {}", e)).await;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if let Err(e) = log_file.sync_data().await {
|
||||||
|
logger::log_error(&format!("Failed to fsync before checkpoint: {}", e)).await;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
let existing_companies = existing_companies_writer_for_task.lock().await;
|
||||||
|
let companies_vec: Vec<_> = existing_companies.values().cloned().collect();
|
||||||
|
drop(existing_companies);
|
||||||
|
|
||||||
|
let temp_path = checkpoint_path_clone.with_extension("tmp");
|
||||||
|
match tokio::fs::File::create(&temp_path).await {
|
||||||
|
Ok(mut temp_file) => {
|
||||||
|
let mut checkpoint_ok = true;
|
||||||
|
for company in &companies_vec {
|
||||||
|
if let Ok(line) = serde_json::to_string(company) {
|
||||||
|
if temp_file.write_all(line.as_bytes()).await.is_err() ||
|
||||||
|
temp_file.write_all(b"\n").await.is_err() {
|
||||||
|
checkpoint_ok = false;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if checkpoint_ok {
|
||||||
|
if temp_file.flush().await.is_ok() &&
|
||||||
|
temp_file.sync_data().await.is_ok() {
|
||||||
|
drop(temp_file);
|
||||||
|
|
||||||
|
if tokio::fs::rename(&temp_path, &checkpoint_path_clone).await.is_ok() {
|
||||||
|
if tokio::fs::remove_file(&log_path_clone).await.is_ok() {
|
||||||
|
logger::log_info(&format!(
|
||||||
|
"✓ Checkpoint created ({} companies), log cleared",
|
||||||
|
companies_vec.len()
|
||||||
|
)).await;
|
||||||
|
|
||||||
|
if let Ok(new_log) = OpenOptions::new()
|
||||||
|
.create(true)
|
||||||
|
.append(true)
|
||||||
|
.open(&log_path_clone)
|
||||||
|
.await {
|
||||||
|
log_file = new_log;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
logger::log_error(&format!("Failed to create checkpoint temp file: {}", e)).await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
updates_since_checkpoint = 0;
|
||||||
|
}
|
||||||
|
LogCommand::Shutdown => {
|
||||||
|
logger::log_info("Writer shutting down...").await;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Periodic checkpoint trigger
|
||||||
|
if updates_since_checkpoint >= CHECKPOINT_INTERVAL {
|
||||||
|
let _ = write_tx_for_writer.send(LogCommand::Checkpoint).await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Final fsync
|
||||||
|
let _ = log_file.flush().await;
|
||||||
|
let _ = log_file.sync_data().await;
|
||||||
|
|
||||||
|
logger::log_info(&format!(
|
||||||
|
"Writer finished: {} total ({} new, {} updated)",
|
||||||
|
count, new_count, updated_count
|
||||||
|
)).await;
|
||||||
|
|
||||||
|
(count, new_count, updated_count)
|
||||||
|
});
|
||||||
|
|
||||||
|
// Wrap paths in Arc for safe sharing across tasks
|
||||||
|
let paths = Arc::new((*paths).clone());
|
||||||
|
|
||||||
|
// === MAIN PROCESSING LOOP WITH TASK PANIC ISOLATION ===
|
||||||
|
let total = pending.len();
|
||||||
|
let mut tasks = FuturesUnordered::new();
|
||||||
|
|
||||||
|
// Counters
|
||||||
|
let processed = Arc::new(AtomicUsize::new(0));
|
||||||
|
let valid_count = Arc::new(AtomicUsize::new(0));
|
||||||
|
let filtered_low_cap = Arc::new(AtomicUsize::new(0));
|
||||||
|
let filtered_no_price = Arc::new(AtomicUsize::new(0));
|
||||||
|
let failed_count = Arc::new(AtomicUsize::new(0));
|
||||||
|
|
||||||
|
// Spawn initial batch
|
||||||
|
for _ in 0..CONCURRENCY_LIMIT.min(pending.len()) {
|
||||||
|
if let Some(company) = pending.pop() {
|
||||||
|
spawn_validation_task(
|
||||||
|
company,
|
||||||
|
&yahoo_pool,
|
||||||
|
&paths,
|
||||||
|
&write_tx,
|
||||||
|
shutdown_flag,
|
||||||
|
&processed,
|
||||||
|
&valid_count,
|
||||||
|
&filtered_low_cap,
|
||||||
|
&filtered_no_price,
|
||||||
|
&failed_count,
|
||||||
|
total,
|
||||||
|
&mut tasks,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Process results and spawn new tasks (with task panic isolation)
|
||||||
|
while let Some(task_result) = tasks.next().await {
|
||||||
|
// Check for shutdown
|
||||||
|
if shutdown_flag.load(Ordering::SeqCst) {
|
||||||
|
logger::log_warn("Shutdown signal received, stopping processing").await;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
match task_result {
|
||||||
|
Ok(Ok(_)) => {
|
||||||
|
// Success - spawn next task
|
||||||
|
if let Some(company) = pending.pop() {
|
||||||
|
spawn_validation_task(
|
||||||
|
company,
|
||||||
|
&yahoo_pool,
|
||||||
|
&paths,
|
||||||
|
&write_tx,
|
||||||
|
shutdown_flag,
|
||||||
|
&processed,
|
||||||
|
&valid_count,
|
||||||
|
&filtered_low_cap,
|
||||||
|
&filtered_no_price,
|
||||||
|
&failed_count,
|
||||||
|
total,
|
||||||
|
&mut tasks,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(Err(e)) => {
|
||||||
|
// Processing error
|
||||||
|
logger::log_error(&format!("Company processing error: {}", e)).await;
|
||||||
|
|
||||||
|
if let Some(company) = pending.pop() {
|
||||||
|
spawn_validation_task(
|
||||||
|
company,
|
||||||
|
&yahoo_pool,
|
||||||
|
&paths,
|
||||||
|
&write_tx,
|
||||||
|
shutdown_flag,
|
||||||
|
&processed,
|
||||||
|
&valid_count,
|
||||||
|
&filtered_low_cap,
|
||||||
|
&filtered_no_price,
|
||||||
|
&failed_count,
|
||||||
|
total,
|
||||||
|
&mut tasks,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
// Task panic (isolated - doesn't crash entire process)
|
||||||
|
logger::log_error(&format!("Task panic: {}", e)).await;
|
||||||
|
|
||||||
|
if let Some(company) = pending.pop() {
|
||||||
|
spawn_validation_task(
|
||||||
|
company,
|
||||||
|
&yahoo_pool,
|
||||||
|
&paths,
|
||||||
|
&write_tx,
|
||||||
|
shutdown_flag,
|
||||||
|
&processed,
|
||||||
|
&valid_count,
|
||||||
|
&filtered_low_cap,
|
||||||
|
&filtered_no_price,
|
||||||
|
&failed_count,
|
||||||
|
total,
|
||||||
|
&mut tasks,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
logger::log_info("Main processing loop completed").await;
|
||||||
|
|
||||||
|
// Signal writer to finish
|
||||||
|
let _ = write_tx.send(LogCommand::Checkpoint).await;
|
||||||
|
let _ = write_tx.send(LogCommand::Shutdown).await;
|
||||||
|
drop(write_tx);
|
||||||
|
|
||||||
|
// Wait for writer to finish
|
||||||
|
let (final_count, final_new, final_updated) = writer_task.await
|
||||||
|
.unwrap_or((0, 0, 0));
|
||||||
|
|
||||||
|
let final_valid = valid_count.load(Ordering::SeqCst);
|
||||||
|
let final_filtered_low_cap = filtered_low_cap.load(Ordering::SeqCst);
|
||||||
|
let final_filtered_no_price = filtered_no_price.load(Ordering::SeqCst);
|
||||||
|
let final_failed = failed_count.load(Ordering::SeqCst);
|
||||||
|
|
||||||
|
logger::log_info(&format!(
|
||||||
|
"✅ Completed: {} total companies ({} new, {} updated)",
|
||||||
|
final_count, final_new, final_updated
|
||||||
|
)).await;
|
||||||
|
logger::log_info(&format!(
|
||||||
|
" Valid: {}, Filtered (low cap): {}, Filtered (no price): {}, Failed: {}",
|
||||||
|
final_valid, final_filtered_low_cap, final_filtered_no_price, final_failed
|
||||||
|
)).await;
|
||||||
|
|
||||||
|
// === VERIFY AND RECREATE FINAL OUTPUT ===
|
||||||
|
logger::log_info("Verifying final output integrity...").await;
|
||||||
|
|
||||||
|
let final_companies_map = existing_companies_writer.lock().await;
|
||||||
|
let expected_count = final_companies_map.len();
|
||||||
|
|
||||||
|
// Always write final consolidated checkpoint
|
||||||
|
let temp_checkpoint = checkpoint_path.with_extension("tmp");
|
||||||
|
let mut temp_file = File::create(&temp_checkpoint).await?;
|
||||||
|
|
||||||
|
for company in final_companies_map.values() {
|
||||||
|
let json_line = serde_json::to_string(company)?;
|
||||||
|
temp_file.write_all(json_line.as_bytes()).await?;
|
||||||
|
temp_file.write_all(b"\n").await?;
|
||||||
|
}
|
||||||
|
|
||||||
|
temp_file.flush().await?;
|
||||||
|
temp_file.sync_data().await?;
|
||||||
|
drop(temp_file);
|
||||||
|
|
||||||
|
tokio::fs::rename(&temp_checkpoint, &checkpoint_path).await?;
|
||||||
|
drop(final_companies_map);
|
||||||
|
|
||||||
|
// Clear log since everything is in checkpoint
|
||||||
|
if log_path.exists() {
|
||||||
|
tokio::fs::remove_file(&log_path).await.ok();
|
||||||
|
}
|
||||||
|
|
||||||
|
logger::log_info(&format!("✓ Final output: {} companies in {:?}", expected_count, checkpoint_path)).await;
|
||||||
|
|
||||||
|
// Shutdown Yahoo pool
|
||||||
|
yahoo_pool.shutdown().await?;
|
||||||
|
|
||||||
|
    // Mark the step valid only after a clean (non-aborted) run:
    // - Content reference: the consolidated checkpoint file
    // - Data stage: Data (7-day TTL by default)
    if !shutdown_flag.load(Ordering::SeqCst) {
        manager.mark_valid(entry).await?;
    }

    Ok(final_count)
}
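
// The recovery path in this step boils down to: load the checkpoint, then replay the
// append-only update log, letting the last write for a given key win. A minimal,
// self-contained sketch of that idea, using a toy tab-separated format in place of
// the real JSONL/CompanyData records:
#[cfg(test)]
mod checkpoint_replay_sketch {
    use std::collections::HashMap;

    /// Rebuild state from checkpoint first, then log; incomplete/blank lines are skipped.
    fn recover(checkpoint: &str, log: &str) -> HashMap<String, String> {
        let mut state = HashMap::new();
        for line in checkpoint.lines().chain(log.lines()) {
            if line.trim().is_empty() { continue; }
            if let Some((name, payload)) = line.split_once('\t') {
                state.insert(name.to_string(), payload.to_string()); // last write wins
            }
        }
        state
    }

    #[test]
    fn replayed_log_overrides_checkpoint() {
        let checkpoint = "acme\tv1\nglobex\tv1";
        let log = "acme\tv2"; // update appended after the last checkpoint
        let state = recover(checkpoint, log);
        assert_eq!(state["acme"], "v2");
        assert_eq!(state["globex"], "v1");
    }
}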
|
||||||
|
|
||||||
|
/// Helper function to spawn a validation task (reduces code duplication)
|
||||||
|
fn spawn_validation_task(
|
||||||
|
company: CompanyData,
|
||||||
|
yahoo_pool: &Arc<YahooClientPool>,
|
||||||
|
paths: &Arc<DataPaths>,
|
||||||
|
write_tx: &mpsc::Sender<LogCommand>,
|
||||||
|
shutdown_flag: &Arc<AtomicBool>,
|
||||||
|
processed: &Arc<AtomicUsize>,
|
||||||
|
valid_count: &Arc<AtomicUsize>,
|
||||||
|
filtered_low_cap: &Arc<AtomicUsize>,
|
||||||
|
filtered_no_price: &Arc<AtomicUsize>,
|
||||||
|
failed_count: &Arc<AtomicUsize>,
|
||||||
|
total: usize,
|
||||||
|
tasks: &mut FuturesUnordered<tokio::task::JoinHandle<anyhow::Result<Option<()>>>>,
|
||||||
|
) {
|
||||||
|
let yahoo_pool_clone = Arc::clone(yahoo_pool);
|
||||||
|
let paths_clone = Arc::clone(paths);
|
||||||
|
let shutdown_flag_clone = Arc::clone(shutdown_flag);
|
||||||
|
let write_tx_clone = write_tx.clone();
|
||||||
|
let processed_clone = Arc::clone(processed);
|
||||||
|
let valid_count_clone = Arc::clone(valid_count);
|
||||||
|
let filtered_low_cap_clone = Arc::clone(filtered_low_cap);
|
||||||
|
let filtered_no_price_clone = Arc::clone(filtered_no_price);
|
||||||
|
let failed_count_clone = Arc::clone(failed_count);
|
||||||
|
|
||||||
|
let task = tokio::spawn(async move {
|
||||||
|
// Check shutdown at start
|
||||||
|
if shutdown_flag_clone.load(Ordering::SeqCst) {
|
||||||
|
return Ok::<_, anyhow::Error>(None);
|
||||||
|
}
|
||||||
|
|
||||||
|
let result = process_company_with_validation(
|
||||||
|
&company,
|
||||||
|
&yahoo_pool_clone,
|
||||||
|
&*paths_clone,
|
||||||
|
).await;
|
||||||
|
|
||||||
|
match result {
|
||||||
|
CompanyProcessResult::Valid(validated_company) => {
|
||||||
|
// Send to writer
|
||||||
|
let _ = write_tx_clone.send(LogCommand::Write(validated_company)).await;
|
||||||
|
valid_count_clone.fetch_add(1, Ordering::SeqCst);
|
||||||
|
}
|
||||||
|
CompanyProcessResult::FilteredLowCap { name, market_cap } => {
|
||||||
|
filtered_low_cap_clone.fetch_add(1, Ordering::SeqCst);
|
||||||
|
if filtered_low_cap_clone.load(Ordering::SeqCst) <= 10 {
|
||||||
|
logger::log_info(&format!(" Filtered {} - low market cap: {:.0} EUR", name, market_cap)).await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
CompanyProcessResult::FilteredNoPrice { name } => {
|
||||||
|
filtered_no_price_clone.fetch_add(1, Ordering::SeqCst);
|
||||||
|
if filtered_no_price_clone.load(Ordering::SeqCst) <= 10 {
|
||||||
|
logger::log_info(&format!(" Filtered {} - no recent price data", name)).await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
CompanyProcessResult::Failed { company: failed_company, error, is_transient: _ } => {
|
||||||
|
failed_count_clone.fetch_add(1, Ordering::SeqCst);
|
||||||
|
logger::log_warn(&format!(" Failed to process '{}': {}", failed_company.name, error)).await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Progress reporting
|
||||||
|
let current = processed_clone.fetch_add(1, Ordering::SeqCst) + 1;
|
||||||
|
if current % 100 == 0 {
|
||||||
|
logger::log_info(&format!(
|
||||||
|
"Progress: {}/{} ({} valid, {} low cap, {} no price, {} failed)",
|
||||||
|
current, total,
|
||||||
|
valid_count_clone.load(Ordering::SeqCst),
|
||||||
|
filtered_low_cap_clone.load(Ordering::SeqCst),
|
||||||
|
filtered_no_price_clone.load(Ordering::SeqCst),
|
||||||
|
failed_count_clone.load(Ordering::SeqCst)
|
||||||
|
)).await;
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(None::<()>)
|
||||||
|
});
|
||||||
|
|
||||||
|
tasks.push(task);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Process a single company with full error categorization
|
||||||
|
async fn process_company_with_validation(
|
||||||
|
company: &CompanyData,
|
||||||
|
yahoo_pool: &Arc<YahooClientPool>,
|
||||||
|
paths: &DataPaths,
|
||||||
|
) -> CompanyProcessResult {
|
||||||
|
// Extract Yahoo ticker
|
||||||
|
let ticker = match extract_first_yahoo_ticker(company) {
|
||||||
|
Some(t) => t,
|
||||||
|
None => {
|
||||||
|
return CompanyProcessResult::Failed {
|
||||||
|
company: company.clone(),
|
||||||
|
error: "No valid Yahoo ticker found".to_string(),
|
||||||
|
is_transient: false, // Permanent - no ticker means no data
|
||||||
|
};
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// Fetch core modules from Yahoo
|
||||||
|
let summary = match yahoo_pool.get_quote_summary(
|
||||||
|
&ticker,
|
||||||
|
&QuoteSummaryModule::core_modules(),
|
||||||
|
).await {
|
||||||
|
Ok(s) => s,
|
||||||
|
Err(e) => {
|
||||||
|
let error_msg = e.to_string();
|
||||||
|
let is_transient = is_transient_error(&error_msg);
|
||||||
|
|
||||||
|
return CompanyProcessResult::Failed {
|
||||||
|
company: company.clone(),
|
||||||
|
error: format!("API error fetching summary: {}", error_msg),
|
||||||
|
is_transient,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// Validate market cap
|
||||||
|
let market_cap = extract_market_cap(&summary);
|
||||||
|
if market_cap < 100_000_000.0 {
|
||||||
|
return CompanyProcessResult::FilteredLowCap {
|
||||||
|
name: company.name.clone(),
|
||||||
|
market_cap,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
// Validate recent price activity
|
||||||
|
let has_recent_price = match check_recent_price_activity(yahoo_pool, &ticker).await {
|
||||||
|
Ok(has) => has,
|
||||||
|
Err(e) => {
|
||||||
|
let error_msg = e.to_string();
|
||||||
|
let is_transient = is_transient_error(&error_msg);
|
||||||
|
|
||||||
|
return CompanyProcessResult::Failed {
|
||||||
|
company: company.clone(),
|
||||||
|
error: format!("API error fetching price history: {}", error_msg),
|
||||||
|
is_transient,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
if !has_recent_price {
|
||||||
|
return CompanyProcessResult::FilteredNoPrice {
|
||||||
|
name: company.name.clone(),
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
// Save core data
|
||||||
|
if let Err(e) = save_company_core_data(paths, &company.name, &summary).await {
|
||||||
|
logger::log_warn(&format!(
|
||||||
|
" Failed to save core data for {}: {}",
|
||||||
|
company.name, e
|
||||||
|
)).await;
|
||||||
|
}
|
||||||
|
|
||||||
|
CompanyProcessResult::Valid(company.clone())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Determine if an error is transient (should retry) or permanent (skip)
fn is_transient_error(error: &str) -> bool {
    let error_lower = error.to_lowercase();

    // Transient errors (network, rate limiting, timeouts)
    let transient_patterns = [
        "timeout",
        "timed out",
        "connection",
        "network",
        "rate limit",
        "too many requests",
        "429",
        "503",
        "502",
        "500",
        "temporarily",
        "unavailable",
    ];

    for pattern in &transient_patterns {
        if error_lower.contains(pattern) {
            return true;
        }
    }

    // Permanent errors (invalid ticker, no data, parsing errors)
    let permanent_patterns = [
        "404",
        "not found",
        "invalid",
        "no data",
        "parse error",
        "400",
        "401",
        "403",
    ];

    for pattern in &permanent_patterns {
        if error_lower.contains(pattern) {
            return false;
        }
    }

    // Default: treat unknown errors as transient (safer to retry)
    true
}

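// With the patterns above, a rate-limit or network failure is retried while a missing
// symbol is skipped for good. A small sketch of the expected classification:
#[cfg(test)]
mod transient_error_sketch {
    use super::is_transient_error;

    #[test]
    fn classifies_common_errors() {
        assert!(is_transient_error("HTTP 429: too many requests"));
        assert!(is_transient_error("connection reset by peer"));
        assert!(!is_transient_error("HTTP 404: symbol not found"));
        assert!(!is_transient_error("invalid ticker"));
    }
}
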
fn extract_market_cap(summary: &crate::scraper::yahoo::QuoteSummary) -> f64 {
    let price_module = match summary.modules.get("price") {
        Some(m) => m,
        None => return 0.0,
    };

    let market_cap_raw = price_module
        .get("marketCap")
        .and_then(|v| v.get("raw"))
        .and_then(|v| v.as_f64())
        .unwrap_or(0.0);

    let currency = price_module
        .get("currency")
        .and_then(|v| v.as_str())
        .unwrap_or("USD");

    // Approximate, hard-coded FX rates used to normalize the market cap into EUR
    let market_cap_eur = match currency {
        "EUR" => market_cap_raw,
        "USD" => market_cap_raw * 0.92,
        "GBP" => market_cap_raw * 1.17,
        "JPY" => market_cap_raw * 0.0061,
        "CHF" => market_cap_raw * 1.05,
        _ => market_cap_raw * 0.92,
    };

    market_cap_eur
}

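// A quick worked example of the conversion above: a marketCap "raw" value of
// 150_000_000_000 reported in USD comes out at roughly 138_000_000_000 EUR at the
// fixed 0.92 rate, well above the 100_000_000 EUR cut-off applied in
// process_company_with_validation.
#[cfg(test)]
mod market_cap_conversion_sketch {
    #[test]
    fn usd_to_eur_at_fixed_rate() {
        let raw_usd: f64 = 150_000_000_000.0;
        let eur = raw_usd * 0.92; // same fixed rate as extract_market_cap
        assert!((eur - 138_000_000_000.0).abs() < 1.0);
        assert!(eur >= 100_000_000.0);
    }
}
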
async fn check_recent_price_activity(
    yahoo_pool: &Arc<YahooClientPool>,
    ticker: &str,
) -> anyhow::Result<bool> {
    let now = Utc::now().timestamp();
    let one_year_ago = now - (365 * 24 * 60 * 60);
    let sixty_days_ago = now - (60 * 24 * 60 * 60);

    // Fetch daily candles for the last 60 days
    let chart_data = yahoo_pool.get_chart_data(
        ticker,
        "1d",
        sixty_days_ago,
        now,
    ).await?;

    if chart_data.quotes.is_empty() {
        return Ok(false);
    }

    let most_recent_timestamp = chart_data.quotes
        .iter()
        .map(|q| q.timestamp)
        .max()
        .unwrap_or(0);

    // Note: the request only covers the last 60 days, so any returned quote
    // trivially satisfies the one-year bound checked here.
    Ok(most_recent_timestamp >= one_year_ago)
}

async fn save_company_core_data(
    paths: &DataPaths,
    company_name: &str,
    summary: &crate::scraper::yahoo::QuoteSummary,
) -> anyhow::Result<()> {
    use tokio::fs;

    let safe_name = sanitize_company_name(company_name);

    let company_dir = paths.corporate_dir().join(&safe_name).join("core");
    fs::create_dir_all(&company_dir).await?;

    let data_path = company_dir.join("data.jsonl");
    let json_line = serde_json::to_string(summary)?;

    let mut file = fs::File::create(&data_path).await?;
    file.write_all(json_line.as_bytes()).await?;
    file.write_all(b"\n").await?;
    file.flush().await?;

    Ok(())
}

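// The per-company layout written above is <corporate_dir>/<sanitized name>/core/data.jsonl.
// A small sketch of the path construction, assuming a hypothetical corporate_dir of
// "data/corporate" and a sanitizer that lowercases and replaces spaces with underscores:
#[cfg(test)]
mod core_data_path_sketch {
    use std::path::PathBuf;

    #[test]
    fn builds_expected_layout() {
        let corporate_dir = PathBuf::from("data/corporate"); // hypothetical base
        let safe_name = "acme_corp";                         // e.g. sanitize_company_name("Acme Corp")
        let data_path = corporate_dir.join(safe_name).join("core").join("data.jsonl");
        assert_eq!(data_path, PathBuf::from("data/corporate/acme_corp/core/data.jsonl"));
    }
}
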
/// Check if a company needs processing (validation check)
fn company_needs_processing(
    company: &CompanyData,
    existing_companies: &HashMap<String, CompanyData>,
) -> bool {
    // If company exists in cleaned output, skip it
    !existing_companies.contains_key(&company.name)
}
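
// Taken together with the recovery map, this is how the pending queue is built:
// everything from the input that is not already in the cleaned checkpoint.
// A tiny sketch of that filter using a toy record type in place of CompanyData:
#[cfg(test)]
mod pending_queue_sketch {
    use std::collections::HashMap;

    #[derive(Clone)]
    struct Rec { name: String }

    fn needs_processing(rec: &Rec, existing: &HashMap<String, Rec>) -> bool {
        !existing.contains_key(&rec.name)
    }

    #[test]
    fn only_unprocessed_records_remain_pending() {
        let done = Rec { name: "Acme Corp".into() };
        let todo = Rec { name: "Globex".into() };

        let mut existing = HashMap::new();
        existing.insert(done.name.clone(), done.clone());

        let pending: Vec<_> = vec![done, todo]
            .into_iter()
            .filter(|r| needs_processing(r, &existing))
            .collect();

        assert_eq!(pending.len(), 1);
        assert_eq!(pending[0].name, "Globex");
    }
}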
|
||||||
1070
src/corporate/update_companies_enrich.rs
Normal file
File diff suppressed because it is too large
1606
src/corporate/update_openfigi.rs
Normal file
File diff suppressed because it is too large
229
src/corporate/yahoo_company_extraction.js
Normal file
@@ -0,0 +1,229 @@
|
|||||||
|
// yahoo_company_extraction.js
|
||||||
|
// JavaScript extraction script for Yahoo Finance company details
|
||||||
|
// Used to extract ticker, sector, and exchange from Yahoo Finance search results
|
||||||
|
// Only ticker is mandatory - sector and exchange are optional fields
|
||||||
|
|
||||||
|
// Example selectors:
|
||||||
|
// with results:
|
||||||
|
// document.querySelector("#main-content-wrapper > section > section.container.yf-1omxedn > div.tableContainer.yf-1omxedn > div > table")
|
||||||
|
// document.querySelector("#\\30 > td:nth-child(1) > span > div > a")
|
||||||
|
// document.querySelector("#\\30 > td:nth-child(2) > span > div")
|
||||||
|
// document.querySelector("#\\30 > td:nth-child(3) > span > div")
|
||||||
|
// document.querySelector("#\\30 > td:nth-child(4) > span > div > a")
|
||||||
|
// document.querySelector("#\\30 > td:nth-child(5) > span > div")
|
||||||
|
// document.querySelector("#\\30 > td:nth-child(6) > span > div")
|
||||||
|
// row with no result:
|
||||||
|
// document.querySelector("#\\32 > td:nth-child(4) > span > p")
|
||||||
|
// no results:
|
||||||
|
// document.querySelector("#main-content-wrapper > section > div.noData.yf-1omxedn")
|
||||||
|
|
||||||
|
// Using a wrapper to ensure the result is properly captured
|
||||||
|
var extractionResult = (function() {
|
||||||
|
try {
|
||||||
|
// Check for "No results found" message using very flexible selector
|
||||||
|
const noDataElement = document.querySelector('[class*="noData"]') ||
|
||||||
|
document.querySelector('[class*="error"]') ||
|
||||||
|
(document.body.innerText && document.body.innerText.includes('No results'));
|
||||||
|
if (noDataElement) {
|
||||||
|
return { status: 'no_results', ticker: null, sector: null, exchange: null };
|
||||||
|
}
|
||||||
|
|
||||||
|
// Find the results table using most flexible selector possible
|
||||||
|
// Try multiple strategies to find the table
|
||||||
|
const table = document.querySelector('table') ||
|
||||||
|
document.querySelector('[role="table"]') ||
|
||||||
|
document.querySelector('.table') ||
|
||||||
|
document.querySelector('#main-content-wrapper > section > section[class*="container"] > div[class*="tableContainer"] > div > table');
|
||||||
|
if (!table) {
|
||||||
|
return { status: 'no_results', ticker: null, sector: null, exchange: null };
|
||||||
|
}
|
||||||
|
|
||||||
|
// Find all rows in tbody
|
||||||
|
const allRows = table.querySelectorAll('tbody tr');
|
||||||
|
if (!allRows || allRows.length === 0) {
|
||||||
|
return { status: 'no_results', ticker: null, sector: null, exchange: null };
|
||||||
|
}
|
||||||
|
|
||||||
|
// Helper function to safely extract text content
|
||||||
|
function extractText(element) {
|
||||||
|
if (!element) return '';
|
||||||
|
const text = element.textContent.trim();
|
||||||
|
return text;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Helper function to check if a cell actually contains data
|
||||||
|
// Multiple indicators are used to determine if data is present
|
||||||
|
function hasValidData(cellElement) {
|
||||||
|
if (!cellElement) return false;
|
||||||
|
|
||||||
|
// Indicator 1: Check if the cell contains a <p> tag (Yahoo uses this for "no data")
|
||||||
|
const pTag = cellElement.querySelector('p');
|
||||||
|
if (pTag) return false;
|
||||||
|
|
||||||
|
// Indicator 2: Check the direct child structure
|
||||||
|
// Valid data cells have: td > span > div or td > span > div > a
|
||||||
|
// Invalid data cells have: td > span > p
|
||||||
|
const span = cellElement.querySelector('span');
|
||||||
|
if (span) {
|
||||||
|
const directChildren = Array.from(span.children);
|
||||||
|
// If the only or first child is a <p>, it's likely "no data"
|
||||||
|
if (directChildren.length > 0 && directChildren[0].tagName === 'P') {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Indicator 3: Check text content
|
||||||
|
const text = extractText(cellElement);
|
||||||
|
if (!text) return false;
|
||||||
|
const normalized = text.toLowerCase().trim();
|
||||||
|
|
||||||
|
// Common "no data" indicators
|
||||||
|
const noDataIndicators = [
|
||||||
|
'-',
|
||||||
|
'n/a',
|
||||||
|
'na',
|
||||||
|
'none',
|
||||||
|
'not available',
|
||||||
|
'no data',
|
||||||
|
'--',
|
||||||
|
'—', // em dash
|
||||||
|
'–', // en dash
|
||||||
|
];
|
||||||
|
|
||||||
|
if (noDataIndicators.includes(normalized)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Indicator 4: Check for common CSS classes that indicate empty state
|
||||||
|
const classIndicators = ['empty', 'no-data', 'na', 'null', 'undefined'];
|
||||||
|
const classList = cellElement.className || '';
|
||||||
|
for (const indicator of classIndicators) {
|
||||||
|
if (classList.includes(indicator)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Indicator 5: Check if cell has an anchor tag (usually indicates real data)
|
||||||
|
const hasLink = cellElement.querySelector('a') !== null;
|
||||||
|
|
||||||
|
// Indicator 6: Check if there's actual substantial content
|
||||||
|
// If text is very short (1-2 chars) and not alphanumeric, it's likely not real data
|
||||||
|
if (text.length <= 2 && !/[a-zA-Z0-9]/.test(text)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// If we passed all checks, consider it valid data
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Helper function to extract and normalize data from a cell
|
||||||
|
function extractCellData(cellElement) {
|
||||||
|
if (!cellElement) return null;
|
||||||
|
if (!hasValidData(cellElement)) return null;
|
||||||
|
|
||||||
|
const text = extractText(cellElement);
|
||||||
|
return text || null;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Helper function to extract and normalize data from a row
|
||||||
|
function extractRowData(row) {
|
||||||
|
// Extract ticker from column 1 (td:nth-child(1))
|
||||||
|
const tickerCell = row.querySelector('td:nth-child(1)');
|
||||||
|
const ticker = extractCellData(tickerCell);
|
||||||
|
|
||||||
|
// Extract sector from column 4 (td:nth-child(4))
|
||||||
|
const sectorCell = row.querySelector('td:nth-child(4)');
|
||||||
|
const sector = extractCellData(sectorCell);
|
||||||
|
|
||||||
|
// Extract exchange from column 6 (td:nth-child(6))
|
||||||
|
const exchangeCell = row.querySelector('td:nth-child(6)');
|
||||||
|
const exchange = extractCellData(exchangeCell);
|
||||||
|
|
||||||
|
return { ticker, sector, exchange };
|
||||||
|
}
|
||||||
|
|
||||||
|
// Helper function to count non-null fields (data completeness counter)
|
||||||
|
function countValidFields(data) {
|
||||||
|
let count = 0;
|
||||||
|
if (data.ticker) count++;
|
||||||
|
if (data.sector) count++;
|
||||||
|
if (data.exchange) count++;
|
||||||
|
return count;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Helper function to score a row (prioritize rows with more complete data)
|
||||||
|
function scoreRow(data) {
|
||||||
|
let score = 0;
|
||||||
|
|
||||||
|
// Ticker is mandatory and gets highest weight
|
||||||
|
if (data.ticker) score += 100;
|
||||||
|
|
||||||
|
// Sector and exchange are nice-to-have
|
||||||
|
if (data.sector) score += 10;
|
||||||
|
if (data.exchange) score += 10;
|
||||||
|
|
||||||
|
return score;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Extract data from all rows and find the one with most complete data
|
||||||
|
let bestRow = null;
|
||||||
|
let maxScore = -1;
|
||||||
|
let rowIndex = 0;
|
||||||
|
|
||||||
|
for (const row of allRows) {
|
||||||
|
const data = extractRowData(row);
|
||||||
|
const score = scoreRow(data);
|
||||||
|
|
||||||
|
// Select row with highest score (most complete data)
|
||||||
|
// If tied, first row wins
|
||||||
|
if (score > maxScore) {
|
||||||
|
bestRow = data;
|
||||||
|
maxScore = score;
|
||||||
|
bestRow.rowIndex = rowIndex;
|
||||||
|
bestRow.validFieldCount = countValidFields(data);
|
||||||
|
bestRow.score = score;
|
||||||
|
}
|
||||||
|
|
||||||
|
rowIndex++;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Ticker is mandatory - return error status if not found
|
||||||
|
if (!bestRow || !bestRow.ticker) {
|
||||||
|
return {
|
||||||
|
status: 'error',
|
||||||
|
error_message: 'No ticker found in any row',
|
||||||
|
ticker: null,
|
||||||
|
sector: null,
|
||||||
|
exchange: null
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
// Return success with ticker (mandatory) and optional sector/exchange
|
||||||
|
// Include metadata about which row was selected and how many valid fields it had
|
||||||
|
return {
|
||||||
|
status: 'found',
|
||||||
|
ticker: bestRow.ticker,
|
||||||
|
sector: bestRow.sector,
|
||||||
|
exchange: bestRow.exchange,
|
||||||
|
metadata: {
|
||||||
|
selectedRowIndex: bestRow.rowIndex,
|
||||||
|
validFieldCount: bestRow.validFieldCount,
|
||||||
|
score: bestRow.score,
|
||||||
|
totalRows: allRows.length
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
} catch (error) {
|
||||||
|
// Only catch unexpected errors during extraction
|
||||||
|
return {
|
||||||
|
status: 'error',
|
||||||
|
error_message: error.toString(),
|
||||||
|
ticker: null,
|
||||||
|
sector: null,
|
||||||
|
exchange: null
|
||||||
|
};
|
||||||
|
}
|
||||||
|
})();
|
||||||
|
|
||||||
|
// Return the result explicitly
|
||||||
|
return extractionResult;
|
||||||
468
src/corporate/yahoo_company_extraction.rs
Normal file
@@ -0,0 +1,468 @@
|
|||||||
|
// src/corporate/yahoo.rs
|
||||||
|
use super::{types::*, helpers::*, page_validation::*};
|
||||||
|
use crate::{scraper::webdriver::*, util::{directories::DataPaths}};
|
||||||
|
use crate::logger;
|
||||||
|
use fantoccini::{Client, Locator};
|
||||||
|
use rand::Rng;
|
||||||
|
use serde::{Deserialize, Serialize};
|
||||||
|
use tokio::time::{Duration as TokioDuration, sleep, timeout};
|
||||||
|
use std::{sync::Arc, sync::atomic::{AtomicBool, Ordering}};
|
||||||
|
use anyhow::{anyhow, Result};
|
||||||
|
|
||||||
|
const YAHOO_COMPANY_EXTRACTION_JS: &str = include_str!("yahoo_company_extraction.js");
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
|
pub enum YahooTickerResult {
|
||||||
|
Found(String),
|
||||||
|
NotFound,
|
||||||
|
NoResults,
|
||||||
|
AmbiguousResults,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Deserialize)]
|
||||||
|
pub struct ExtractionMetadata {
|
||||||
|
#[serde(rename = "selectedRowIndex")]
|
||||||
|
pub selected_row_index: usize,
|
||||||
|
#[serde(rename = "validFieldCount")]
|
||||||
|
pub valid_field_count: usize,
|
||||||
|
#[serde(rename = "totalRows")]
|
||||||
|
pub total_rows: usize,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Deserialize)]
|
||||||
|
pub struct ExtractionResult {
|
||||||
|
status: String,
|
||||||
|
ticker: Option<String>,
|
||||||
|
sector: Option<String>,
|
||||||
|
exchange: Option<String>,
|
||||||
|
#[serde(default)]
|
||||||
|
error_message: Option<String>,
|
||||||
|
#[serde(default)]
|
||||||
|
metadata: Option<ExtractionMetadata>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl YahooTickerResult {
|
||||||
|
pub fn to_tagged_string(&self) -> String {
|
||||||
|
match self {
|
||||||
|
YahooTickerResult::Found(ticker) => format!("YAHOO:{}", ticker),
|
||||||
|
YahooTickerResult::NotFound => "YAHOO:NOT_FOUND".to_string(),
|
||||||
|
YahooTickerResult::NoResults => "YAHOO:NO_RESULTS".to_string(),
|
||||||
|
YahooTickerResult::AmbiguousResults => "YAHOO:AMBIGUOUS".to_string(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn is_found(&self) -> bool {
|
||||||
|
matches!(self, YahooTickerResult::Found(_))
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn get_ticker(&self) -> Option<&str> {
|
||||||
|
match self {
|
||||||
|
YahooTickerResult::Found(ticker) => Some(ticker),
|
||||||
|
_ => None,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
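
// The tagged strings produced above are what the cleansing steps later match on
// (the "YAHOO:" prefix check and the "YAHOO:NO_RESULTS" sentinel). A short usage sketch:
#[cfg(test)]
mod tagged_ticker_sketch {
    use super::YahooTickerResult;

    #[test]
    fn tagging_round_trip() {
        let found = YahooTickerResult::Found("AAPL".to_string());
        assert_eq!(found.to_tagged_string(), "YAHOO:AAPL");
        assert!(found.is_found());
        assert_eq!(found.get_ticker(), Some("AAPL"));

        assert_eq!(YahooTickerResult::NoResults.to_tagged_string(), "YAHOO:NO_RESULTS");
        assert_eq!(YahooTickerResult::NotFound.to_tagged_string(), "YAHOO:NOT_FOUND");
    }
}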
|
||||||
|
|
||||||
|
/// Scrape company details with full validation and shutdown support
|
||||||
|
pub async fn scrape_company_details_by_isin(
|
||||||
|
pool: &Arc<ChromeDriverPool>,
|
||||||
|
isin: &str,
|
||||||
|
shutdown_flag: &Arc<AtomicBool>,
|
||||||
|
) -> anyhow::Result<Option<YahooCompanyData>> {
|
||||||
|
// Check shutdown before starting
|
||||||
|
if shutdown_flag.load(Ordering::SeqCst) {
|
||||||
|
logger::log_warn(&format!("Shutdown detected, skipping ISIN: {}", isin)).await;
|
||||||
|
return Ok(None);
|
||||||
|
}
|
||||||
|
|
||||||
|
if pool.should_perform_hard_reset() {
|
||||||
|
logger::log_warn("HARD_RESET_REQUIRED detected before starting ISIN scrape").await;
|
||||||
|
return Err(anyhow!("HARD_RESET_REQUIRED"));
|
||||||
|
}
|
||||||
|
|
||||||
|
let isin_owned = isin.to_string();
|
||||||
|
let shutdown_clone = Arc::clone(shutdown_flag);
|
||||||
|
let url = format!("https://finance.yahoo.com/lookup/?s={}", isin);
|
||||||
|
|
||||||
|
pool.execute(url.clone(), move |client| {
|
||||||
|
let isin = isin_owned.clone();
|
||||||
|
let shutdown = shutdown_clone.clone();
|
||||||
|
|
||||||
|
Box::pin(async move {
|
||||||
|
// Check shutdown during task execution
|
||||||
|
if shutdown.load(Ordering::SeqCst) {
|
||||||
|
return Err(anyhow!("Task aborted due to shutdown"));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Random delay
|
||||||
|
let delay = rand::rng().random_range(800..1500);
|
||||||
|
sleep(TokioDuration::from_millis(delay)).await;
|
||||||
|
|
||||||
|
// Reject cookies
|
||||||
|
reject_yahoo_cookies(&client).await?;
|
||||||
|
|
||||||
|
// Check shutdown again
|
||||||
|
if shutdown.load(Ordering::SeqCst) {
|
||||||
|
return Err(anyhow!("Task aborted due to shutdown"));
|
||||||
|
}
|
||||||
|
|
||||||
|
// CRITICAL: Validate navigation succeeded
|
||||||
|
let expected_fragment = format!("lookup/?s={}", isin);
|
||||||
|
match verify_navigation(&client, &expected_fragment, 5).await {
|
||||||
|
Ok(_) => {
|
||||||
|
logger::log_info(&format!("✓ Navigation validated for ISIN: {}", isin)).await;
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
logger::log_error(&format!(
|
||||||
|
"Navigation verification failed for ISIN {}: {}",
|
||||||
|
isin, e
|
||||||
|
)).await;
|
||||||
|
// Clear browser state before returning error
|
||||||
|
clear_browser_state(&client).await.ok();
|
||||||
|
return Err(e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Additional content validation - look for table or noData element anywhere on page
|
||||||
|
let page_ready: bool = client
|
||||||
|
.execute(
|
||||||
|
r#"
|
||||||
|
// Try multiple selector strategies
|
||||||
|
const table = document.querySelector('table') ||
|
||||||
|
document.querySelector('[role="table"]') ||
|
||||||
|
document.querySelector('.table');
|
||||||
|
const noData = document.querySelector('[class*="noData"]') ||
|
||||||
|
document.querySelector('[class*="error"]') ||
|
||||||
|
document.body.innerText.includes('No results');
|
||||||
|
const hasContent = !!(table || noData);
|
||||||
|
console.log('Page ready check - table:', !!table, 'noData:', !!noData, 'hasContent:', hasContent);
|
||||||
|
return hasContent;
|
||||||
|
"#,
|
||||||
|
vec![],
|
||||||
|
)
|
||||||
|
.await?
|
||||||
|
.as_bool()
|
||||||
|
.unwrap_or(false);
|
||||||
|
|
||||||
|
if !page_ready {
|
||||||
|
logger::log_error(&format!(
|
||||||
|
"Page content not ready for ISIN {} - neither table nor no-data element found",
|
||||||
|
isin
|
||||||
|
)).await;
|
||||||
|
clear_browser_state(&client).await.ok();
|
||||||
|
return Err(anyhow!("Page content not ready"));
|
||||||
|
}
|
||||||
|
|
||||||
|
logger::log_info(&format!("✓ Page content validated for ISIN: {}", isin)).await;
|
||||||
|
|
||||||
|
// Check shutdown before extraction
|
||||||
|
if shutdown.load(Ordering::SeqCst) {
|
||||||
|
return Err(anyhow!("Task aborted due to shutdown"));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Random delay before extraction
|
||||||
|
let delay = rand::rng().random_range(800..1500);
|
||||||
|
sleep(TokioDuration::from_millis(delay)).await;
|
||||||
|
|
||||||
|
// Now safe to extract
|
||||||
|
extract_company_details_validated(&client, &isin).await
|
||||||
|
})
|
||||||
|
}).await
|
||||||
|
}
|
||||||
|
|
||||||
|
/// UPDATED: Extract with additional URL validation
|
||||||
|
async fn extract_company_details_validated(
|
||||||
|
client: &Client,
|
||||||
|
isin: &str,
|
||||||
|
) -> Result<Option<YahooCompanyData>> {
|
||||||
|
// Double-check URL is still correct before extraction
|
||||||
|
let current_url = client.current_url().await?;
|
||||||
|
if !current_url.as_str().contains(isin) {
|
||||||
|
logger::log_error(&format!(
|
||||||
|
"URL mismatch before extraction: expected ISIN '{}' in URL, got '{}'",
|
||||||
|
isin,
|
||||||
|
current_url.as_str()
|
||||||
|
)).await;
|
||||||
|
clear_browser_state(client).await.ok();
|
||||||
|
return Err(anyhow!("URL mismatch - possible stale page"));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Run extraction
|
||||||
|
let result = extract_company_details(client, isin).await?;
|
||||||
|
|
||||||
|
// Validate extraction result
|
||||||
|
if let Some(ref details) = result {
|
||||||
|
logger::log_info(&format!(
|
||||||
|
"✓ Extracted ticker '{}' for ISIN {} (sector: {:?}, exchange: {:?})",
|
||||||
|
details.ticker, isin, details.sector, details.exchange
|
||||||
|
)).await;
|
||||||
|
} else {
|
||||||
|
logger::log_info(&format!(
|
||||||
|
"No ticker found for ISIN {} (legitimately not found)",
|
||||||
|
isin
|
||||||
|
)).await;
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(result)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn extract_company_details(
|
||||||
|
client: &Client,
|
||||||
|
_isin: &str,
|
||||||
|
) -> Result<Option<YahooCompanyData>> {
|
||||||
|
// Wait for page to load - look for either the table or the no-data element using simple selectors
|
||||||
|
let wait_result: Result<Result<bool, anyhow::Error>> = timeout(
|
||||||
|
TokioDuration::from_secs(30),
|
||||||
|
async {
|
||||||
|
for _ in 0..60 {
|
||||||
|
let has_content: bool = client
|
||||||
|
.execute(
|
||||||
|
r#"
|
||||||
|
// Use flexible selectors that don't depend on exact DOM structure
|
||||||
|
const table = document.querySelector('table') ||
|
||||||
|
document.querySelector('[role="table"]') ||
|
||||||
|
document.querySelector('.table');
|
||||||
|
const noData = document.querySelector('[class*="noData"]') ||
|
||||||
|
document.querySelector('[class*="error"]');
|
||||||
|
const hasContent = !!(table || noData);
|
||||||
|
return hasContent;
|
||||||
|
"#,
|
||||||
|
vec![],
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
.map_err(|e| anyhow!("Execute error: {}", e))?
|
||||||
|
.as_bool()
|
||||||
|
.unwrap_or(false);
|
||||||
|
|
||||||
|
if has_content {
|
||||||
|
return Ok(true);
|
||||||
|
}
|
||||||
|
|
||||||
|
sleep(TokioDuration::from_millis(500)).await;
|
||||||
|
}
|
||||||
|
Ok(false)
|
||||||
|
},
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
.map_err(|_| anyhow!("Timeout waiting for Yahoo Finance page to load"));
|
||||||
|
|
||||||
|
match wait_result {
|
||||||
|
Err(_) => {
|
||||||
|
return Err(anyhow!("Timeout waiting for Yahoo Finance page to load"));
|
||||||
|
},
|
||||||
|
Ok(Err(e)) => {
|
||||||
|
return Err(anyhow!("Error checking page content: {}", e));
|
||||||
|
},
|
||||||
|
Ok(Ok(false)) => {
|
||||||
|
logger::log_warn("Page content not found after waiting, attempting extraction anyway").await;
|
||||||
|
},
|
||||||
|
Ok(Ok(true)) => {
|
||||||
|
logger::log_info("Page content detected, proceeding with extraction").await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Execute the JavaScript extraction script
|
||||||
|
let result = client.execute(YAHOO_COMPANY_EXTRACTION_JS, vec![]).await?;
|
||||||
|
|
||||||
|
// Log the raw result for debugging
|
||||||
|
logger::log_info(&format!("JavaScript extraction raw result: {:?}", result)).await;
|
||||||
|
|
||||||
|
// Check if result is null
|
||||||
|
if result.is_null() {
|
||||||
|
return Err(anyhow!("JavaScript returned null - page may not be fully loaded or script failed"));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Parse the JSON result
|
||||||
|
let extraction: ExtractionResult = serde_json::from_value(result.clone())
|
||||||
|
.map_err(|e| {
|
||||||
|
let result_str = serde_json::to_string_pretty(&result).unwrap_or_else(|_| format!("{:?}", result));
|
||||||
|
anyhow!("Failed to parse extraction result: {}. Raw result: {}", e, result_str)
|
||||||
|
})?;
|
||||||
|
|
||||||
|
match extraction.status.as_str() {
|
||||||
|
"found" => {
|
||||||
|
if let Some(ticker) = extraction.ticker {
|
||||||
|
if let Some(ref metadata) = extraction.metadata {
|
||||||
|
logger::log_info(&format!(
|
||||||
|
"Selected row {} with {} valid fields out of {} total rows",
|
||||||
|
metadata.selected_row_index,
|
||||||
|
metadata.valid_field_count,
|
||||||
|
metadata.total_rows
|
||||||
|
)).await;
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(Some(YahooCompanyData {
|
||||||
|
ticker,
|
||||||
|
sector: extraction.sector,
|
||||||
|
exchange: extraction.exchange,
|
||||||
|
}))
|
||||||
|
} else {
|
||||||
|
Err(anyhow!("Status 'found' but no ticker present"))
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"no_results" => Ok(None),
|
||||||
|
"error" => {
|
||||||
|
let error_msg = extraction.error_message.unwrap_or_else(|| "Unknown error".to_string());
|
||||||
|
Err(anyhow!("JavaScript extraction error: {}", error_msg))
|
||||||
|
},
|
||||||
|
_ => Ok(None),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn get_all_tickers_from_companies_jsonl(paths: &DataPaths) -> anyhow::Result<Vec<String>> {
|
||||||
|
let corporate_path = paths.data_dir().join("corporate").join("by_name");
|
||||||
|
let companies_file = corporate_path.join("companies.jsonl");
|
||||||
|
let content = tokio::fs::read_to_string(companies_file).await?;
|
||||||
|
let mut tickers = Vec::new();
|
||||||
|
for line in content.lines() {
|
||||||
|
let company: CompanyData = serde_json::from_str(line)?;
|
||||||
|
if let Some(isin_tickers_map) = company.isin_tickers_map {
|
||||||
|
for (_isin, ticker_vec) in isin_tickers_map {
|
||||||
|
tickers.extend(ticker_vec);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(tickers)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn fetch_earnings_with_pool(
|
||||||
|
pool: &Arc<ChromeDriverPool>,
|
||||||
|
ticker: &str,
|
||||||
|
) -> anyhow::Result<Vec<CompanyEventData>> {
|
||||||
|
let ticker = ticker.to_string();
|
||||||
|
let url = format!("https://finance.yahoo.com/calendar/earnings?symbol={}&offset=0&size=100", ticker);
|
||||||
|
|
||||||
|
let ticker_cloned = ticker.clone();
|
||||||
|
|
||||||
|
pool.execute(url, move |client| {
|
||||||
|
let ticker = ticker_cloned.clone();
|
||||||
|
Box::pin(async move {
|
||||||
|
reject_yahoo_cookies(&client).await?;
|
||||||
|
extract_earnings_events(&client, &ticker).await
|
||||||
|
})
|
||||||
|
}).await
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn extract_earnings_events(client: &Client, ticker: &str) -> Result<Vec<CompanyEventData>> {
|
||||||
|
// Wait for the table to load
|
||||||
|
let table = client
|
||||||
|
.wait()
|
||||||
|
.for_element(Locator::Css(r#"table[data-test="cal-table"]"#))
|
||||||
|
.await
|
||||||
|
.map_err(|e| anyhow!("Failed to find earnings table: {}", e))?;
|
||||||
|
|
||||||
|
// Find all rows in tbody
|
||||||
|
let rows = table
|
||||||
|
.find_all(Locator::Css("tbody tr"))
|
||||||
|
.await
|
||||||
|
.map_err(|e| anyhow!("Failed to find table rows: {}", e))?;
|
||||||
|
|
||||||
|
let mut events = Vec::with_capacity(rows.len());
|
||||||
|
|
||||||
|
for row in rows {
|
||||||
|
let cells = row
|
||||||
|
.find_all(Locator::Css("td"))
|
||||||
|
.await
|
||||||
|
.map_err(|e| anyhow!("Failed to find cells in row: {}", e))?;
|
||||||
|
|
||||||
|
if cells.len() < 5 {
|
||||||
|
continue; // Skip incomplete rows
|
||||||
|
}
|
||||||
|
|
||||||
|
// Extract and parse date
|
||||||
|
let date_str = cells[0]
|
||||||
|
.text()
|
||||||
|
.await
|
||||||
|
.map_err(|e| anyhow!("Failed to get date text: {}", e))?;
|
||||||
|
let date = parse_yahoo_date(&date_str)
|
||||||
|
.map_err(|e| anyhow!("Failed to parse date '{}': {}", date_str, e))?
|
||||||
|
.format("%Y-%m-%d")
|
||||||
|
.to_string();
|
||||||
|
|
||||||
|
// Extract time, replace "Time Not Supplied" with empty
|
||||||
|
let time = cells[1]
|
||||||
|
.text()
|
||||||
|
.await
|
||||||
|
.map_err(|e| anyhow!("Failed to get time text: {}", e))?
|
||||||
|
.replace("Time Not Supplied", "");
|
||||||
|
|
||||||
|
// Extract period
|
||||||
|
let period = cells[2]
|
||||||
|
.text()
|
||||||
|
.await
|
||||||
|
.map_err(|e| anyhow!("Failed to get period text: {}", e))?;
|
||||||
|
|
||||||
|
// Parse EPS forecast
|
||||||
|
let eps_forecast_str = cells[3]
|
||||||
|
.text()
|
||||||
|
.await
|
||||||
|
.map_err(|e| anyhow!("Failed to get EPS forecast text: {}", e))?;
|
||||||
|
let eps_forecast = parse_float(&eps_forecast_str);
|
||||||
|
|
||||||
|
// Parse EPS actual
|
||||||
|
let eps_actual_str = cells[4]
|
||||||
|
.text()
|
||||||
|
.await
|
||||||
|
.map_err(|e| anyhow!("Failed to get EPS actual text: {}", e))?;
|
||||||
|
let eps_actual = parse_float(&eps_actual_str);
|
||||||
|
|
||||||
|
// Parse surprise % if available
|
||||||
|
let surprise_pct = if cells.len() > 5 {
|
||||||
|
let surprise_str = cells[5]
|
||||||
|
.text()
|
||||||
|
.await
|
||||||
|
.map_err(|e| anyhow!("Failed to get surprise text: {}", e))?;
|
||||||
|
parse_float(&surprise_str)
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
};
|
||||||
|
|
||||||
|
events.push(CompanyEventData {
|
||||||
|
ticker: ticker.to_string(),
|
||||||
|
date,
|
||||||
|
time,
|
||||||
|
period,
|
||||||
|
eps_forecast,
|
||||||
|
eps_actual,
|
||||||
|
revenue_forecast: None,
|
||||||
|
revenue_actual: None,
|
||||||
|
surprise_pct,
|
||||||
|
source: "Yahoo".to_string(),
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
if events.is_empty() {
|
||||||
|
logger::log_warn(&format!("Warning: No earnings events extracted for ticker {}", ticker)).await;
|
||||||
|
} else {
|
||||||
|
logger::log_info(&format!("Extracted {} earnings events for {}", events.len(), ticker)).await;
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(events)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Rejecting Yahoo Cookies
|
||||||
|
async fn reject_yahoo_cookies(client: &Client) -> anyhow::Result<()> {
|
||||||
|
for _ in 0..10 {
|
||||||
|
let clicked: bool = client
|
||||||
|
.execute(
|
||||||
|
r#"(() => {
|
||||||
|
const btn = document.querySelector('#consent-page .reject-all');
|
||||||
|
if (btn) {
|
||||||
|
btn.click();
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
})()"#,
|
||||||
|
vec![],
|
||||||
|
)
|
||||||
|
.await?
|
||||||
|
.as_bool()
|
||||||
|
.unwrap_or(false);
|
||||||
|
|
||||||
|
if clicked { break; }
|
||||||
|
sleep(TokioDuration::from_millis(500)).await;
|
||||||
|
}
|
||||||
|
|
||||||
|
logger::log_info("Rejected Yahoo cookies if button existed").await;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
@@ -2,7 +2,9 @@
 pub mod types;
 pub mod scraper;
 pub mod storage;
-pub mod update;
 pub mod helpers;
+
+pub mod update;
+pub mod yahoo_update_forex;
 
 pub use update::run_full_update;
@@ -1,5 +1,6 @@
 // src/economic/scraper.rs
 use super::types::{EconomicEvent};
+use crate::logger;
 use fantoccini::Client;
 use tokio::time::{sleep, Duration};
 
@@ -7,17 +8,11 @@ const EXTRACTION_JS: &str = include_str!("extraction_script.js");
 
 pub async fn goto_and_prepare(client: &Client) -> anyhow::Result<()> {
     client.goto("https://www.finanzen.net/termine/wirtschaftsdaten/").await?;
-    //dismiss_overlays(client).await?;
-
-    /*if let Ok(tab) = client.find(fantoccini::Locator::Css(r#"div[data-sg-tab-item="teletrader-dates-three-stars"]"#)).await {
-        tab.click().await?;
-        println!("High importance tab selected");
-        sleep(Duration::from_secs(2)).await;
-    }*/
+    dismiss_overlays(client).await?;
     Ok(())
 }
 
-/*pub async fn dismiss_overlays(client: &Client) -> anyhow::Result<()> {
+pub async fn dismiss_overlays(client: &Client) -> anyhow::Result<()> {
     for _ in 0..10 {
         let removed: bool = client
             .execute(
@@ -38,7 +33,7 @@ pub async fn goto_and_prepare(client: &Client) -> anyhow::Result<()> {
|
|||||||
sleep(Duration::from_millis(500)).await;
|
sleep(Duration::from_millis(500)).await;
|
||||||
}
|
}
|
||||||
Ok(())
|
Ok(())
|
||||||
}*/
|
}
|
||||||
|
|
||||||
pub async fn set_date_range(client: &Client, start: &str, end: &str) -> anyhow::Result<()> {
|
pub async fn set_date_range(client: &Client, start: &str, end: &str) -> anyhow::Result<()> {
|
||||||
let script = format!(
|
let script = format!(
|
||||||
@@ -78,6 +73,6 @@ pub async fn extract_events(client: &Client) -> anyhow::Result<Vec<EconomicEvent
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
println!("Extracted {} high-impact events", events.len());
|
logger::log_info(&format!("Extracted {} high-impact events", events.len())).await;
|
||||||
Ok(events)
|
Ok(events)
|
||||||
}
|
}
|
||||||
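Both reject_yahoo_cookies above and dismiss_overlays follow the same pattern: poll a small boolean-returning JavaScript snippet through fantoccini's execute until it reports success or the attempt budget runs out. A generalized sketch of that pattern, with an illustrative helper name that is not part of the repository:

use fantoccini::Client;
use tokio::time::{sleep, Duration};

// Illustrative helper, not repository code: retries a boolean-returning
// JS snippet until it succeeds or `attempts` is exhausted.
async fn poll_js(client: &Client, js: &str, attempts: usize) -> anyhow::Result<bool> {
    for _ in 0..attempts {
        let done = client
            .execute(js, vec![])
            .await?
            .as_bool()
            .unwrap_or(false);
        if done {
            return Ok(true);
        }
        sleep(Duration::from_millis(500)).await;
    }
    Ok(false)
}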
@@ -1,12 +1,17 @@
 // src/economic/storage.rs
 use super::types::*;
 use super::helpers::*;
+use crate::util::directories::DataPaths;
+use crate::util::logger;
 use tokio::fs;
 use chrono::{NaiveDate, Datelike};
 use std::collections::HashMap;
+use serde_json;
 
-pub async fn scan_existing_chunks() -> anyhow::Result<Vec<ChunkInfo>> {
-    let dir = std::path::Path::new("data/economic/events");
+const MAX_EVENTS_PER_FILE: usize = 3000;
+
+pub async fn scan_existing_chunks(paths: &DataPaths) -> anyhow::Result<Vec<ChunkInfo>> {
+    let dir = paths.economic_events_dir();
     let mut chunks = Vec::new();
 
     if dir.exists() {
@@ -16,83 +21,184 @@ pub async fn scan_existing_chunks() -> anyhow::Result<Vec<ChunkInfo>> {
             if path.extension().map(|e| e == "json").unwrap_or(false) {
                 if let Some(name) = path.file_name().and_then(|n| n.to_str()) {
                     if name.starts_with("chunk_") {
-                        if let Some(content) = fs::read_to_string(&path).await.ok() {
-                            if let Ok(events) = serde_json::from_str::<Vec<EconomicEvent>>(&content) {
-                                let start = name[6..16].to_string();
-                                let end = name[17..27].to_string();
-                                chunks.push(ChunkInfo { start_date: start, end_date: end, path, event_count: events.len() });
-                            }
-                        }
+                        // Don't load the events here, just record the chunk info
+                        let start = name[6..16].to_string();
+                        let end = name[17..27].to_string();
+                        chunks.push(ChunkInfo {
+                            start_date: start,
+                            end_date: end,
+                            path,
+                            event_count: 0 // We'll count later if needed
+                        });
                     }
                 }
             }
         }
     }
     chunks.sort_by_key(|c| c.start_date.clone());
+    logger::log_info(&format!("Economic Storage: Found {} event chunks", chunks.len())).await;
     Ok(chunks)
 }
 
-pub async fn load_existing_events(chunks: &[ChunkInfo]) -> anyhow::Result<HashMap<String, EconomicEvent>> {
-    let mut map = HashMap::new();
-    for chunk in chunks {
-        let content = fs::read_to_string(&chunk.path).await?;
-        let events: Vec<EconomicEvent> = serde_json::from_str(&content)?;
-        for e in events {
-            map.insert(event_key(&e), e);
-        }
-    }
-    Ok(map)
-}
+/// Stream events from a single chunk file
+pub async fn stream_chunk_events(
+    chunk: &ChunkInfo,
+    callback: impl Fn(EconomicEvent) -> anyhow::Result<()>
+) -> anyhow::Result<usize> {
+    let content = fs::read_to_string(&chunk.path).await?;
+    let events: Vec<EconomicEvent> = serde_json::from_str(&content)?;
+    let count = events.len();
+
+    for event in events {
+        callback(event)?;
+    }
+
+    Ok(count)
+}
 
-pub async fn save_optimized_chunks(events: HashMap<String, EconomicEvent>) -> anyhow::Result<()> {
-    let dir = std::path::Path::new("data/economic/events");
+/// Load events in batches to avoid memory explosion
+pub async fn load_events_in_batches(
+    chunks: &[ChunkInfo],
+    batch_size: usize,
+) -> anyhow::Result<impl Iterator<Item = (String, EconomicEvent)>> {
+    let mut all_events = Vec::new();
+
+    for chunk in chunks {
+        logger::log_info(&format!("Loading chunk: {:?}", chunk.path.file_name())).await;
+
+        let content = fs::read_to_string(&chunk.path).await?;
+        let events: Vec<EconomicEvent> = serde_json::from_str(&content)?;
+
+        for e in events {
+            all_events.push((event_key(&e), e));
+        }
+
+        // If we've accumulated enough, yield them
+        if all_events.len() >= batch_size {
+            break;
+        }
+    }
+
+    logger::log_info(&format!("Loaded {} events in batch", all_events.len())).await;
+    Ok(all_events.into_iter())
+}
+
+/// Build a lightweight index instead of loading all events
+#[derive(Debug, Clone)]
+pub struct EventIndex {
+    pub key: String,
+    pub identity_key: String,
+    pub date: String,
+    pub chunk_file: std::path::PathBuf,
+}
+
+pub async fn build_event_index(chunks: &[ChunkInfo]) -> anyhow::Result<Vec<EventIndex>> {
+    let mut index = Vec::new();
+
+    for chunk in chunks {
+        logger::log_info(&format!("Indexing chunk: {:?}", chunk.path.file_name())).await;
+
+        let content = fs::read_to_string(&chunk.path).await?;
+        let events: Vec<EconomicEvent> = serde_json::from_str(&content)?;
+
+        for e in events {
+            index.push(EventIndex {
+                key: event_key(&e),
+                identity_key: identity_key(&e),
+                date: e.date.clone(),
+                chunk_file: chunk.path.clone(),
+            });
+        }
+    }
+
+    logger::log_info(&format!("Built index with {} entries", index.len())).await;
+    Ok(index)
+}
+
+/// Look up a specific event by loading only its chunk
+pub async fn lookup_event_by_key(key: &str, index: &[EventIndex]) -> anyhow::Result<Option<EconomicEvent>> {
+    // Find which chunk contains this event
+    let entry = index.iter().find(|e| e.key == key);
+
+    if let Some(entry) = entry {
+        // Load only that chunk
+        let content = fs::read_to_string(&entry.chunk_file).await?;
+        let events: Vec<EconomicEvent> = serde_json::from_str(&content)?;
+
+        // Find the specific event
+        Ok(events.into_iter().find(|e| event_key(e) == key))
+    } else {
+        Ok(None)
+    }
+}
+
+/// Save events in smaller, more manageable chunks
+pub async fn save_optimized_chunks(
+    paths: &DataPaths,
+    events: Vec<EconomicEvent> // Changed from HashMap to Vec
+) -> anyhow::Result<()> {
+    let dir = paths.economic_events_dir();
     fs::create_dir_all(dir).await?;
 
-    // Delete all old chunk files to prevent duplicates and overlaps
-    println!("Removing old chunks...");
+    logger::log_info("Economic Storage: Removing old chunk files...").await;
+
     let mut entries = fs::read_dir(dir).await?;
+    let mut removed_count = 0;
     while let Some(entry) = entries.next_entry().await? {
         let path = entry.path();
         if let Some(name) = path.file_name().and_then(|n| n.to_str()) {
             if name.starts_with("chunk_") && path.extension().map(|e| e == "json").unwrap_or(false) {
                 fs::remove_file(&path).await?;
+                removed_count += 1;
             }
         }
     }
+    logger::log_info(&format!("Economic Storage: Removed {} old chunk files", removed_count)).await;
+
+    let mut sorted = events;
+    sorted.sort_by(|a, b| a.date.cmp(&b.date));
+
+    // Save in smaller chunks
+    let mut chunk_num = 0;
+    for chunk in sorted.chunks(MAX_EVENTS_PER_FILE) {
+        save_chunk_vec(chunk, dir, chunk_num).await?;
+        chunk_num += 1;
+
+        // Allow other tasks to run
+        tokio::task::yield_now().await;
+    }
+
+    logger::log_info(&format!("Economic Storage: Saved {} chunks to {:?}", chunk_num, dir)).await;
 
-    let mut sorted: Vec<_> = events.into_values().collect();
-    sorted.sort_by_key(|e| e.date.clone());
-
-    let mut chunk: Vec<EconomicEvent> = Vec::new();
-    const MAX_EVENTS_PER_CHUNK: usize = ( 30000 / 2 ) / 11; // (30000 - 2) / 11 = 2727
-
-    for e in sorted {
-        if !chunk.is_empty() && chunk.len() >= MAX_EVENTS_PER_CHUNK {
-            save_chunk(&chunk, dir).await?;
-            chunk.clear();
-        }
-        chunk.push(e);
-    }
-    if !chunk.is_empty() {
-        save_chunk(&chunk, dir).await?;
-    }
     Ok(())
 }
 
-async fn save_chunk(events: &[EconomicEvent], dir: &std::path::Path) -> anyhow::Result<()> {
-    let start = events.iter().map(|e| &e.date).min().unwrap().clone();
-    let end = events.iter().map(|e| &e.date).max().unwrap().clone();
-    let path = dir.join(format!("chunk_{}_{}.json", start, end));
-    fs::write(&path, serde_json::to_string_pretty(events)?).await?;
+async fn save_chunk_vec(events: &[EconomicEvent], dir: &std::path::Path, chunk_num: usize) -> anyhow::Result<()> {
+    if events.is_empty() {
+        return Ok(());
+    }
+
+    let start = &events[0].date;
+    let end = &events[events.len() - 1].date;
+    let path = dir.join(format!("chunk_{:04}_{}_{}.json", chunk_num, start, end));
+
+    // Write incrementally to avoid large memory allocation
+    let json = serde_json::to_string_pretty(events)?;
+    fs::write(&path, json).await?;
+
+    logger::log_info(&format!("Economic Storage: Saved chunk {} - {} ({} events)", start, end, events.len())).await;
     Ok(())
 }
 
-pub async fn save_changes(changes: &[EventChange]) -> anyhow::Result<()> {
-    if changes.is_empty() { return Ok(()); }
-    let dir = std::path::Path::new("economic_event_changes");
+pub async fn save_changes(paths: &DataPaths, changes: &[EventChange]) -> anyhow::Result<()> {
+    if changes.is_empty() {
+        logger::log_info("Economic Storage: No changes to save").await;
+        return Ok(());
+    }
+    let dir = paths.economic_changes_dir();
     fs::create_dir_all(dir).await?;
+
+    logger::log_info(&format!("Economic Storage: Saving {} changes to {:?}", changes.len(), dir)).await;
+
     let mut by_month: HashMap<String, Vec<EventChange>> = HashMap::new();
     for c in changes {
         if let Ok(d) = NaiveDate::parse_from_str(&c.date, "%Y-%m-%d") {
@@ -107,8 +213,10 @@ pub async fn save_changes(changes: &[EventChange]) -> anyhow::Result<()> {
             let s = fs::read_to_string(&path).await?;
             serde_json::from_str(&s).unwrap_or_default()
         } else { vec![] };
-        all.extend(list);
+        all.extend(list.clone());
         fs::write(&path, serde_json::to_string_pretty(&all)?).await?;
+        logger::log_info(&format!("Economic Storage: Saved {} changes for month {}", list.len(), month)).await;
     }
+    logger::log_info("Economic Storage: All changes saved successfully").await;
     Ok(())
 }
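The new storage API splits scanning, indexing, and point lookups so that only one chunk file is resident in memory at a time. A small usage sketch, assuming the functions are reachable under crate::economic::storage and that DataPaths::new(".") resolves the data directory as it does elsewhere in this diff (the module path and the key string are illustrative assumptions):

use crate::util::directories::DataPaths;
use crate::economic::storage::{scan_existing_chunks, build_event_index, lookup_event_by_key};

// Sketch of the index-based lookup flow; error handling and the concrete
// event-key format are simplified here.
async fn find_one_event(key: &str) -> anyhow::Result<()> {
    let paths = DataPaths::new(".")?;
    let chunks = scan_existing_chunks(&paths).await?; // cheap: reads file names only
    let index = build_event_index(&chunks).await?;    // one pass over every chunk
    if let Some(event) = lookup_event_by_key(key, &index).await? {
        println!("{} -> found event dated {}", key, event.date); // only that chunk was reloaded
    }
    Ok(())
}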
@@ -1,70 +1,146 @@
 // src/economic/update.rs
 use super::{scraper::*, storage::*, helpers::*, types::*};
-use crate::{config::Config, scraper::webdriver::ScrapeTask};
-use crate::scraper::webdriver::ChromeDriverPool;
+use crate::check_shutdown;
+use crate::{config::Config, scraper::webdriver::{ScrapeTask, ChromeDriverPool}, util::directories::DataPaths, util::logger};
 use chrono::{Local};
-use std::sync::Arc;
+use std::sync::{Arc, atomic::{AtomicBool}};
+use std::collections::HashMap;
 
-/// Runs the full update for economic data, using the provided ChromeDriver pool.
-///
-/// # Arguments
-/// * `config` - The application configuration.
-/// * `pool` - Shared pool of ChromeDriver instances for scraping.
-///
-/// # Errors
-/// Returns an error if scraping, loading, or saving fails.
-pub async fn run_full_update(config: &Config, pool: &Arc<ChromeDriverPool>) -> anyhow::Result<()> {
+/// Runs the full update for economic data using streaming to minimize memory usage
+pub async fn run_full_update(config: &Config, pool: &Arc<ChromeDriverPool>, shutdown_flag: &Arc<AtomicBool>) -> anyhow::Result<()> {
+    let paths = DataPaths::new(".")?;
+
+    logger::log_info("Economic Update: Initializing...").await;
+
     let today_str = chrono::Local::now().date_naive().format("%Y-%m-%d").to_string();
     let end_date = config.target_end_date();
 
-    let chunks = scan_existing_chunks().await?;
-    let mut events = load_existing_events(&chunks).await?;
-    println!("Loaded {} events from {} chunks", events.len(), chunks.len());
-
-    let start_date = if events.is_empty() {
+    logger::log_info("=== Economic Update ===").await;
+
+    check_shutdown!(shutdown_flag);
+
+    // Step 1: Build lightweight index instead of loading all events
+    logger::log_info("Step 1: Building event index...").await;
+    let chunks = scan_existing_chunks(&paths).await?;
+    let event_index = build_event_index(&chunks).await?;
+    logger::log_info(&format!(" Economic Update: Indexed {} events from {} chunks",
+        event_index.len(), chunks.len())).await;
+
+    check_shutdown!(shutdown_flag);
+
+    // Step 2: Determine start date
+    let start_date = if event_index.is_empty() {
+        logger::log_warn("Step 2: No existing events found, starting from config date").await;
         config.economic_start_date.clone()
-    } else if events.values().any(|e| e.date >= today_str) {
-        today_str.clone()
     } else {
-        events.values()
-            .filter_map(|e| chrono::NaiveDate::parse_from_str(&e.date, "%Y-%m-%d").ok())
+        // Find the latest date in the index
+        let max_date = event_index.iter()
+            .map(|e| &e.date)
             .max()
-            .and_then(|d| d.succ_opt())
-            .map(|d| d.format("%Y-%m-%d").to_string())
-            .unwrap_or(today_str.clone())
+            .cloned()
+            .unwrap_or(today_str.clone());
+
+        if max_date >= today_str {
+            logger::log_info(" Events exist for today, starting from today").await;
+            today_str.clone()
+        } else {
+            let next = chrono::NaiveDate::parse_from_str(&max_date, "%Y-%m-%d")
+                .ok()
+                .and_then(|d| d.succ_opt())
+                .map(|d| d.format("%Y-%m-%d").to_string())
+                .unwrap_or(today_str.clone());
+            logger::log_info(&format!(" Resuming from: {}", next)).await;
+            next
+        }
     };
 
-    println!("Scraping economic events: {} → {}", start_date, end_date);
-
-    // Pass the pool to the scraping function
-    let new_events_all = scrape_all_economic_events(&start_date, &end_date, pool).await?;
-
-    // Process all at once or in batches
-    let result = process_batch(&new_events_all, &mut events, &today_str);
-    let total_changes = result.changes.len();
-    save_changes(&result.changes).await?;
-
-    save_optimized_chunks(events).await?;
-    println!("Economic update complete — {} changes detected", total_changes);
+    check_shutdown!(shutdown_flag);
+
+    // Step 3: Scrape new events in batches
+    logger::log_info(&format!("Step 3: Scraping events from {} → {}", start_date, end_date)).await;
+    let new_events = scrape_all_economic_events(&start_date, &end_date, pool).await?;
+    logger::log_info(&format!(" Scraped {} new events", new_events.len())).await;
+
+    check_shutdown!(shutdown_flag);
+
+    // Step 4: Process events in streaming fashion
+    logger::log_info(&format!("Step 4: Detecting changes")).await;
+    let (changes, updated_events) = process_events_streaming(&chunks, &new_events, &today_str).await?;
+    logger::log_info(&format!(" Detected {} changes", changes.len())).await;
+    if !changes.is_empty() {
+        logger::log_info(&format!(" Saving {} changes to log", changes.len())).await;
+        save_changes(&paths, &changes).await?;
+        logger::log_info(" Changes saved successfully").await;
+    }
+
+    check_shutdown!(shutdown_flag);
+
+    // Step 5: Save consolidated events
+    logger::log_info(&format!("Step 5: Saving {} total events to chunks", updated_events.len())).await;
+    save_optimized_chunks(&paths, updated_events).await?;
+    logger::log_info(&format!(" ✓ Economic update complete — {} changes detected", changes.len())).await;
+
     Ok(())
 }
 
-/// Scrapes all economic events from start to end date using a dedicated ScrapeTask with the provided pool.
-///
-/// This function creates a ScrapeTask to navigate to the Finanzen.net page, prepare it,
-/// and then loop through date ranges to extract events.
-///
-/// # Arguments
-/// * `start` - Start date in YYYY-MM-DD.
-/// * `end` - End date in YYYY-MM-DD.
-/// * `pool` - Shared pool of ChromeDriver instances.
-///
-/// # Returns
-/// A vector of all extracted EconomicEvent structs.
-///
-/// # Errors
-/// Returns an error if task execution fails or extraction issues occur.
-pub async fn scrape_all_economic_events(start: &str, end: &str, pool: &Arc<ChromeDriverPool>) -> anyhow::Result<Vec<EconomicEvent>> {
+/// Process events using streaming to minimize memory usage
+async fn process_events_streaming(
+    chunks: &[ChunkInfo],
+    new_events: &[EconomicEvent],
+    today: &str,
+) -> anyhow::Result<(Vec<EventChange>, Vec<EconomicEvent>)> {
+    let mut all_changes = Vec::new();
+    let mut final_events: HashMap<String, EconomicEvent> = HashMap::new();
+
+    // Step 1: Load existing events in batches
+    logger::log_info("Processing existing events in batches...").await;
+
+    for chunk in chunks {
+        logger::log_info(&format!("Loading chunk: {:?}", chunk.path.file_name())).await;
+
+        let content = tokio::fs::read_to_string(&chunk.path).await?;
+        let events: Vec<EconomicEvent> = serde_json::from_str(&content)?;
+
+        // Add to final events map
+        for e in events {
+            final_events.insert(event_key(&e), e);
+        }
+
+        // Clear memory periodically
+        if final_events.len() > 10000 {
+            logger::log_info(&format!("Loaded {} events so far...", final_events.len())).await;
+        }
+    }
+
+    logger::log_info(&format!("Loaded {} existing events total", final_events.len())).await;
+
+    // Step 2: Process new events in batches
+    logger::log_info("Processing new events...").await;
+
+    for (idx, batch) in new_events.chunks(500).enumerate() {
+        logger::log_info(&format!("Processing batch {} ({} events)", idx + 1, batch.len())).await;
+
+        let batch_result = process_batch(batch, &mut final_events, today);
+        all_changes.extend(batch_result.changes);
+
+        // Yield to prevent blocking
+        tokio::task::yield_now().await;
+    }
+
+    logger::log_info(&format!("Processing complete. Total events: {}", final_events.len())).await;
+
+    // Convert HashMap to Vec for saving
+    let events_vec: Vec<EconomicEvent> = final_events.into_values().collect();
+
+    Ok((all_changes, events_vec))
+}
+
+/// Scrapes all economic events from start to end date
+pub async fn scrape_all_economic_events(
+    start: &str,
+    end: &str,
+    pool: &Arc<ChromeDriverPool>
+) -> anyhow::Result<Vec<EconomicEvent>> {
     let url = "https://www.finanzen.net/termine/wirtschaftsdaten/".to_string();
     let start_clone = start.to_string();
     let end_clone = end.to_string();
@@ -78,9 +154,18 @@ pub async fn scrape_all_economic_events(start: &str, end: &str, pool: &Arc<Chrom
             set_date_range(&client, &current, &end_clone).await?;
             tokio::time::sleep(tokio::time::Duration::from_secs(3)).await;
             let new_events = extract_events(&client).await?;
-            if new_events.is_empty() { break; }
+
+            if new_events.is_empty() {
+                break;
+            }
+
             all_events.extend(new_events.clone());
+
+            // Prevent memory buildup - process in chunks if too large
+            if all_events.len() > 5000 {
+                logger::log_info(&format!("Scraped {} events so far, continuing...", all_events.len())).await;
+            }
+
             let next = new_events.iter()
                 .filter_map(|e| chrono::NaiveDate::parse_from_str(&e.date, "%Y-%m-%d").ok())
                 .max()
@@ -91,22 +176,23 @@ pub async fn scrape_all_economic_events(start: &str, end: &str, pool: &Arc<Chrom
             if next > end_clone { break; }
             current = next;
         }
 
         Ok(all_events)
     });
 
-    // Use the pool for execution
     task.execute_with_pool(pool).await
 }
 
+/// Process a batch of events and detect changes
 pub fn process_batch(
     new_events: &[EconomicEvent],
-    existing: &mut std::collections::HashMap<String, EconomicEvent>,
+    existing: &mut HashMap<String, EconomicEvent>,
     today: &str,
 ) -> ScrapeResult {
     let mut changes = Vec::new();
     let mut removed = std::collections::HashSet::new();
 
-    let identity_map = build_identity_lookup(existing);
+    //let identity_map = build_identity_lookup(existing);
     let date_map = build_date_event_lookup(existing);
 
     for new in new_events {
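Step 2 of run_full_update resumes scraping from the day after the newest indexed date, unless events already exist for today. The same decision can be expressed as a small pure function; the sketch below only illustrates that logic and is not code from the repository:

use chrono::NaiveDate;

// Illustration of the Step 2 resume logic: start from the configured date when
// nothing is indexed, from today when today is already covered, otherwise from
// the day after the newest indexed date.
fn resume_start_date(max_indexed: Option<&str>, today: &str, config_start: &str) -> String {
    match max_indexed {
        None => config_start.to_string(),
        Some(max_date) if max_date >= today => today.to_string(),
        Some(max_date) => NaiveDate::parse_from_str(max_date, "%Y-%m-%d")
            .ok()
            .and_then(|d| d.succ_opt())
            .map(|d| d.format("%Y-%m-%d").to_string())
            .unwrap_or_else(|| today.to_string()),
    }
}

// resume_start_date(Some("2024-05-10"), "2024-05-12", "2007-02-13") == "2024-05-11"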
477
src/economic/yahoo_update_forex.rs
Normal file
@@ -0,0 +1,477 @@
// src/forex/update_forex.rs
use crate::config::Config;
use crate::util::directories::DataPaths;
use crate::util::integrity::{DataStage, StateManager, directory_reference};
use crate::util::logger;
use crate::scraper::yahoo::{YahooClientPool};
use crate::corporate::types::*;

use std::result::Result::Ok;
use chrono::{TimeZone, Utc};
use std::collections::HashSet;
use std::sync::Arc;
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
use tokio::fs::{OpenOptions};
use tokio::io::{AsyncWriteExt};
use futures::stream::{FuturesUnordered, StreamExt};
use serde_json::json;
use tokio::sync::mpsc;

/// Currency information
#[derive(Debug, Clone)]
struct CurrencyPair {
    code: String,         // e.g., "EUR", "JPY"
    name: String,         // e.g., "Euro", "Japanese Yen"
    yahoo_symbol: String, // e.g., "USDEUR=X", "USDJPY=X"
}

impl CurrencyPair {
    fn new(code: &str, name: &str) -> Self {
        Self {
            code: code.to_string(),
            name: name.to_string(),
            yahoo_symbol: format!("USD{}=X", code),
        }
    }
}

/// Get list of currency pairs to fetch (USD as base currency)
fn get_currency_pairs() -> Vec<CurrencyPair> {
    vec![
        CurrencyPair::new("EUR", "Euro"),
        CurrencyPair::new("TRY", "Turkish Lira"),
        CurrencyPair::new("CHF", "Swiss Franc"),
        CurrencyPair::new("SEK", "Swedish Krona"),
        CurrencyPair::new("TWD", "New Taiwan Dollar"),
        CurrencyPair::new("AUD", "Australian Dollar"),
        CurrencyPair::new("GBP", "British Pound"),      // Fixed: GBp -> GBP
        CurrencyPair::new("NOK", "Norwegian Krone"),
        CurrencyPair::new("CAD", "Canadian Dollar"),
        CurrencyPair::new("CZK", "Czech Koruna"),
        CurrencyPair::new("SGD", "Singapore Dollar"),
        CurrencyPair::new("ISK", "Icelandic Króna"),
        CurrencyPair::new("ZAR", "South African Rand"), // Fixed: ZAc -> ZAR
        CurrencyPair::new("JPY", "Japanese Yen"),
        CurrencyPair::new("PLN", "Polish Złoty"),
        CurrencyPair::new("DKK", "Danish Krone"),
        CurrencyPair::new("HKD", "Hong Kong Dollar"),
        CurrencyPair::new("ILS", "Israeli Shekel"),     // Fixed: ILA -> ILS
        CurrencyPair::new("RON", "Romanian Leu"),
        CurrencyPair::new("KWD", "Kuwaiti Dinar"),      // Fixed: KWF -> KWD
    ]
}

/// Yahoo Collect Foreign Exchange Charts WITH ABORT-SAFE INCREMENTAL PERSISTENCE
///
/// # Features
/// - Graceful shutdown (abort-safe)
/// - Task panic isolation (tasks fail independently)
/// - Crash-safe persistence (checkpoint + log with fsync)
/// - Smart skip logic (only process incomplete data)
/// - Uses pending queue instead of retry mechanism
///
/// # Persistence Strategy
/// - Checkpoint: fx_rates_collected.jsonl (atomic state)
/// - Log: fx_rates_updates.log (append-only updates)
/// - On restart: Load checkpoint + replay log
/// - Periodic checkpoints (every 10 currencies)
/// - Batched fsync (every 5 writes or 10 seconds)
pub async fn collect_fx_rates(
    paths: &DataPaths,
    _config: &Config,
    yahoo_pool: Arc<YahooClientPool>,
    shutdown_flag: &Arc<AtomicBool>,
) -> anyhow::Result<usize> {
    // Configuration constants
    const CHECKPOINT_INTERVAL: usize = 10;
    const FSYNC_BATCH_SIZE: usize = 5;
    const FSYNC_INTERVAL_SECS: u64 = 10;
    const CONCURRENCY_LIMIT: usize = 10; // Limit parallel fetch tasks

    let data_path = paths.data_dir();

    // File paths
    let output_path = data_path.join("economic").join("currency");
    let log_path = data_path.join("fx_rates_updates.log");

    let manager = StateManager::new(paths.integrity_dir()).await?;
    let step_name = "yahoo_fx_rate_collection_completed";
    let content_reference = directory_reference(&output_path,
        Some(vec![
            "*/chart/*.jsonl".to_string(),    // Main pattern for events data
            "*/chart/data.jsonl".to_string(), // Specific pattern (more precise)
        ]),
        Some(vec![
            "*.log".to_string(), // Exclude log files
            "*.tmp".to_string(), // Exclude temp files
            "*.bak".to_string(), // Exclude backup files
        ]),
    );

    if manager.is_step_valid(step_name).await? {
        logger::log_info(" FX rates collection already completed").await;
        let count = count_collected_currencies(paths).await?;
        logger::log_info(&format!(" ✓ Found {} currencies with chart data", count)).await;
        return Ok(count);
    }
    let entry = manager.create_entry(
        step_name.to_string(),
        content_reference.clone(),
        DataStage::Data,
    ).await?;

    logger::log_info(" Updating missing forex data...").await;

    // === RECOVERY PHASE: Track collected currencies ===
    let mut collected_currencies: HashSet<String> = HashSet::new();

    if log_path.exists() {
        logger::log_info("Loading FX rates collection progress from log...").await;
        let log_content = tokio::fs::read_to_string(&log_path).await?;

        for line in log_content.lines() {
            if line.trim().is_empty() || !line.ends_with('}') {
                continue; // Skip incomplete lines
            }

            match serde_json::from_str::<serde_json::Value>(line) {
                Ok(entry) => {
                    if let Some(code) = entry.get("currency_code").and_then(|v| v.as_str()) {
                        if entry.get("status").and_then(|v| v.as_str()) == Some("collected") {
                            collected_currencies.insert(code.to_string());
                        }
                    }
                }
                Err(e) => {
                    logger::log_warn(&format!("Skipping invalid log line: {}", e)).await;
                }
            }
        }
        logger::log_info(&format!("Loaded {} collected currencies from log", collected_currencies.len())).await;
    }

    // Get all currency pairs
    let currency_pairs = get_currency_pairs();
    let total_currencies = currency_pairs.len();
    logger::log_info(&format!("Found {} currency pairs to collect", total_currencies)).await;

    // Filter currencies that need collection
    let pending_pairs: Vec<CurrencyPair> = currency_pairs
        .into_iter()
        .filter(|pair| !collected_currencies.contains(&pair.code))
        .collect();

    let pending_count = pending_pairs.len();
    logger::log_info(&format!(
        " {} already collected, {} pending",
        collected_currencies.len(),
        pending_count
    )).await;

    if pending_count == 0 {
        logger::log_info(" ✓ All currencies already collected").await;
        manager.mark_valid(entry).await?;
        return Ok(collected_currencies.len());
    }

    // === PROCESSING PHASE: Collect FX rates ===

    // Shared counters
    let processed_count = Arc::new(AtomicUsize::new(collected_currencies.len()));
    let success_count = Arc::new(AtomicUsize::new(collected_currencies.len()));
    let failed_count = Arc::new(AtomicUsize::new(0));

    // Log writer channel with batching and fsync
    let (log_tx, mut log_rx) = mpsc::channel::<LogCommand>(1000);

    // Spawn log writer task
    let log_writer_handle = {
        let log_path = log_path.clone();
        let processed_count = Arc::clone(&processed_count);
        let total_currencies = total_currencies;

        tokio::spawn(async move {
            let mut log_file = OpenOptions::new()
                .create(true)
                .append(true)
                .open(&log_path)
                .await
                .expect("Failed to open log file");

            let mut write_count = 0;
            let mut last_fsync = tokio::time::Instant::now();

            while let Some(cmd) = log_rx.recv().await {
                match cmd {
                    LogCommand::Write(entry) => {
                        let json_line = serde_json::to_string(&entry).expect("Serialization failed");
                        log_file.write_all(json_line.as_bytes()).await.expect("Write failed");
                        log_file.write_all(b"\n").await.expect("Write failed");

                        write_count += 1;

                        // Batched fsync
                        if write_count >= FSYNC_BATCH_SIZE
                            || last_fsync.elapsed().as_secs() >= FSYNC_INTERVAL_SECS
                        {
                            log_file.flush().await.expect("Flush failed");
                            log_file.sync_all().await.expect("Fsync failed");
                            write_count = 0;
                            last_fsync = tokio::time::Instant::now();
                        }
                    }
                    LogCommand::Checkpoint => {
                        // Force fsync on checkpoint
                        log_file.flush().await.expect("Flush failed");
                        log_file.sync_all().await.expect("Fsync failed");
                        write_count = 0;
                        last_fsync = tokio::time::Instant::now();

                        let current = processed_count.load(Ordering::SeqCst);
                        logger::log_info(&format!(
                            " Checkpoint: {}/{} currencies processed",
                            current, total_currencies
                        )).await;
                    }
                    LogCommand::Shutdown => {
                        // Final fsync before shutdown
                        log_file.flush().await.expect("Flush failed");
                        log_file.sync_all().await.expect("Fsync failed");
                        break;
                    }
                }
            }
        })
    };

    // Process currencies concurrently with task panic isolation
    let mut tasks = FuturesUnordered::new();
    let mut pending_iter = pending_pairs.into_iter();
    let semaphore = Arc::new(tokio::sync::Semaphore::new(CONCURRENCY_LIMIT));

    // Initial batch of tasks
    for _ in 0..CONCURRENCY_LIMIT.min(pending_count) {
        if let Some(pair) = pending_iter.next() {
            let task = spawn_collection_task(
                pair,
                Arc::clone(&yahoo_pool),
                paths.clone(),
                Arc::clone(&processed_count),
                Arc::clone(&success_count),
                Arc::clone(&failed_count),
                log_tx.clone(),
                Arc::clone(&semaphore),
                Arc::clone(shutdown_flag),
            );
            tasks.push(task);
        }
    }

    // Process tasks as they complete and spawn new ones
    let mut checkpoint_counter = 0;
    while let Some(_result) = tasks.next().await {
        // Check for shutdown
        if shutdown_flag.load(Ordering::SeqCst) {
            logger::log_warn("Shutdown signal received, stopping FX collection").await;
            break;
        }

        // Spawn new task if more pending
        if let Some(pair) = pending_iter.next() {
            let task = spawn_collection_task(
                pair,
                Arc::clone(&yahoo_pool),
                paths.clone(),
                Arc::clone(&processed_count),
                Arc::clone(&success_count),
                Arc::clone(&failed_count),
                log_tx.clone(),
                Arc::clone(&semaphore),
                Arc::clone(shutdown_flag),
            );
            tasks.push(task);
        }

        // Periodic checkpoint
        checkpoint_counter += 1;
        if checkpoint_counter % CHECKPOINT_INTERVAL == 0 {
            let _ = log_tx.send(LogCommand::Checkpoint).await;
        }
    }

    // Signal shutdown to log writer
    let _ = log_tx.send(LogCommand::Shutdown).await;

    // Wait for log writer to finish
    let _ = log_writer_handle.await;

    // Final statistics
    let final_success = success_count.load(Ordering::SeqCst);
    let final_failed = failed_count.load(Ordering::SeqCst);

    logger::log_info(&format!(
        " FX collection complete: {} succeeded, {} failed",
        final_success, final_failed
    )).await;

    // Mark as complete if not shutdown
    if !shutdown_flag.load(Ordering::SeqCst) {
        manager.mark_valid(entry).await?;
    }
    Ok(final_success)
}

/// Spawn a collection task with panic isolation
fn spawn_collection_task(
    pair: CurrencyPair,
    yahoo_pool: Arc<YahooClientPool>,
    paths: DataPaths,
    processed_count: Arc<AtomicUsize>,
    success_count: Arc<AtomicUsize>,
    failed_count: Arc<AtomicUsize>,
    log_tx: mpsc::Sender<LogCommand>,
    semaphore: Arc<tokio::sync::Semaphore>,
    shutdown_flag: Arc<AtomicBool>,
) -> tokio::task::JoinHandle<()> {
    tokio::spawn(async move {
        // Acquire semaphore permit
        let _permit = semaphore.acquire().await.expect("Semaphore closed");

        // Check shutdown before processing
        if shutdown_flag.load(Ordering::SeqCst) {
            return;
        }

        // Perform collection (panic-isolated)
        let result = collect_currency_chart(&pair, &yahoo_pool, &paths).await;

        // Update counters
        processed_count.fetch_add(1, Ordering::SeqCst);

        let status = match result {
            Ok(_) => {
                success_count.fetch_add(1, Ordering::SeqCst);
                logger::log_info(&format!(
                    " ✓ Collected {} ({})",
                    pair.code, pair.name
                )).await;
                "collected"
            }
            Err(e) => {
                failed_count.fetch_add(1, Ordering::SeqCst);
                logger::log_warn(&format!(
                    " ✗ Failed to collect {} ({}): {}",
                    pair.code, pair.name, e
                )).await;
                "failed"
            }
        };

        // Log result
        let log_entry = json!({
            "currency_code": pair.code,
            "currency_name": pair.name,
            "yahoo_symbol": pair.yahoo_symbol,
            "status": status,
            "timestamp": Utc::now().to_rfc3339(),
        });

        let _ = log_tx.send(LogCommand::Write(log_entry)).await;
    })
}

/// Collect chart data for a single currency pair
async fn collect_currency_chart(
    pair: &CurrencyPair,
    yahoo_pool: &Arc<YahooClientPool>,
    paths: &DataPaths,
) -> anyhow::Result<()> {
    // Get historical data from year 2000 to now
    let now = Utc::now().timestamp();
    let start_2000 = Utc
        .with_ymd_and_hms(2000, 1, 1, 0, 0, 0)
        .unwrap()
        .timestamp();

    // Fetch chart data from Yahoo
    let chart_data = yahoo_pool.get_chart_data(
        &pair.yahoo_symbol,
        "1d", // Daily interval
        start_2000,
        now,
    ).await?;

    // Validate we got data
    if chart_data.quotes.is_empty() {
        return Err(anyhow::anyhow!(
            "No chart data available for {} ({})",
            pair.code,
            pair.yahoo_symbol
        ));
    }

    // Save chart data to currency directory
    save_currency_chart(paths, &pair.code, &chart_data).await?;

    Ok(())
}

/// Save currency chart data to filesystem
async fn save_currency_chart(
    paths: &DataPaths,
    currency_code: &str,
    chart_data: &ChartData,
) -> anyhow::Result<()> {
    use tokio::fs;

    // Create directory structure: data/economic/currency/{code}/chart/
    let economic_dir = paths.data_dir().join("economic");
    let currency_dir = economic_dir.join("currency").join(currency_code);
    let chart_dir = currency_dir.join("chart");

    fs::create_dir_all(&chart_dir).await?;

    // Write chart data to data.jsonl
    let data_path = chart_dir.join("data.jsonl");
    let json_line = serde_json::to_string(chart_data)?;

    let mut file = fs::File::create(&data_path).await?;
    file.write_all(json_line.as_bytes()).await?;
    file.write_all(b"\n").await?;
    file.flush().await?;
    file.sync_all().await?; // Ensure data is persisted

    Ok(())
}

/// Count collected currencies (currencies with chart data)
async fn count_collected_currencies(paths: &DataPaths) -> anyhow::Result<usize> {
    let currency_dir = paths.data_dir().join("economic").join("currency");

    if !currency_dir.exists() {
        return Ok(0);
    }

    let mut count = 0;
    let mut entries = tokio::fs::read_dir(&currency_dir).await?;

    while let Some(entry) = entries.next_entry().await? {
        let path = entry.path();
        if path.is_dir() {
            let chart_file = path.join("chart").join("data.jsonl");

            if chart_file.exists() {
                count += 1;
            }
        }
    }

    Ok(count)
}

/// Log command enum
enum LogCommand {
    Write(serde_json::Value),
    Checkpoint,
    Shutdown,
}
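The Yahoo symbol for each pair is derived purely from the ISO code with USD as the base currency, so CurrencyPair::new("EUR", "Euro") yields the symbol USDEUR=X. A tiny check of that convention (test module name and placement are illustrative, not part of the repository):

#[cfg(test)]
mod yahoo_symbol_tests {
    use super::CurrencyPair;

    #[test]
    fn usd_base_symbols() {
        // The constructor always prefixes the quote currency with USD and appends "=X".
        assert_eq!(CurrencyPair::new("EUR", "Euro").yahoo_symbol, "USDEUR=X");
        assert_eq!(CurrencyPair::new("JPY", "Japanese Yen").yahoo_symbol, "USDJPY=X");
    }
}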
21
src/lib.rs
Normal file
@@ -0,0 +1,21 @@
// src/lib.rs
//! Event Backtest Engine - Core Library
//!
//! Exposes all public modules for use in examples and tests

pub mod config;
pub mod scraper;
pub mod util;
pub mod monitoring;
pub mod economic;
pub mod corporate;

// Re-export commonly used types for convenience
pub use monitoring::{init_monitoring, ConfigSnapshot, MonitoringEvent};
pub use config::Config;
pub use scraper::webdriver::{ChromeDriverPool, ChromeInstance, ScrapeTask};
pub use util::logger;
pub use util::macros;
pub use scraper::yahoo::{
    YahooClient, YahooClientPool, QuoteSummaryModule, QuoteSummary, SearchResult
};
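With these re-exports, examples and integration tests can pull the main types from the crate root instead of spelling out module paths. A minimal consumer might look like the sketch below, assuming the crate is named web_scraper as the rewritten main.rs that follows imports it (this snippet is illustrative, not part of the repository):

// Hypothetical consumer of the new library facade.
use web_scraper::{Config, logger};

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    let config = Config::load()?;
    logger::log_info(&format!(
        "parallel instances configured: {}",
        config.max_parallel_instances
    )).await;
    Ok(())
}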
384
src/main.rs
@@ -1,43 +1,359 @@
-// src/main.rs
-mod economic;
-mod corporate;
-mod config;
-mod util;
-mod scraper;
-
-use anyhow::Result;
-use config::Config;
-use scraper::webdriver::ChromeDriverPool;
-use std::sync::Arc;
-
-/// The entry point of the application.
-///
-/// This function loads the configuration, initializes a shared ChromeDriver pool,
-/// and sequentially runs the full updates for corporate and economic data.
-/// Sequential execution helps prevent resource exhaustion from concurrent
-/// chromedriver instances and avoids spamming the target websites with too many requests.
-///
-/// # Errors
-///
-/// Returns an error if configuration loading fails, pool initialization fails,
-/// or if either update function encounters an issue (e.g., network errors,
-/// scraping failures, or chromedriver spawn failures like "program not found").
-#[tokio::main]
-async fn main() -> Result<()> {
-    let config = Config::load().map_err(|err| {
-        println!("Failed to load Config .env: {}", err);
-        err
-    })?;
-
-    // Initialize the shared ChromeDriver pool once
-    let pool_size = config.max_parallel_tasks;
-    let pool = Arc::new(ChromeDriverPool::new(pool_size).await?);
-
-    // Run economic update first, passing the shared pool
-    economic::run_full_update(&config, &pool).await?;
-
-    // Then run corporate update, passing the shared pool
-    corporate::run_full_update(&config, &pool).await?;
+use web_scraper::util::integrity::StateManager;
+// src/main.rs - Cleaned up version with extracted helpers
+use web_scraper::{*, scraper, corporate};
+use crate::check_shutdown;
+use anyhow::{Result};
+use web_scraper::config::Config;
+use scraper::docker_vpn_proxy::{DockerVpnProxyPool, cleanup_all_proxy_containers};
+use scraper::webdriver::ChromeDriverPool;
+use util::directories::DataPaths;
+use util::{logger, opnv};
+use std::fs::{OpenOptions};
+use std::sync::Arc;
+use std::sync::atomic::{AtomicBool, Ordering};
+use std::process::Command;
+use std::time::{Duration, Instant};
+
+// ============================================================================
+// HELPER FUNCTIONS - Extracted to reduce duplication
+// ============================================================================
+
+/// Start Docker Desktop on Windows
+async fn start_docker_desktop() {
+    if cfg!(target_os = "windows") {
+        let _ = Command::new("cmd")
+            .args(["/C", "docker desktop start"])
+            .output();
+    }
+}
+
+/// Shutdown ChromeDriver pool with error handling
+async fn shutdown_chrome_pool(pool: &ChromeDriverPool) {
+    logger::log_info("Shutting down ChromeDriver pool...").await;
+    match pool.shutdown().await {
+        Ok(()) => logger::log_info("✓ ChromeDriver pool shut down successfully").await,
+        Err(e) => logger::log_error(&format!("✗ Pool shutdown error: {}", e)).await,
+    }
+}
+
+/// Shutdown Docker VPN proxy pool with error handling
+async fn shutdown_proxy_pool(proxy_pool: &DockerVpnProxyPool) {
+    logger::log_info("Stopping Docker VPN proxy containers...").await;
+    match proxy_pool.shutdown().await {
+        Ok(()) => logger::log_info("✓ All Docker VPN containers stopped").await,
+        Err(e) => logger::log_error(&format!("✗ Proxy shutdown error: {}", e)).await,
+    }
+}
+
+/// Force-kill Chrome and ChromeDriver processes (Windows only)
+#[cfg(target_os = "windows")]
+async fn force_kill_chrome_processes() {
+    logger::log_info("Force-killing any remaining Chrome processes...").await;
+    let _ = tokio::process::Command::new("taskkill")
+        .args(["/F", "/IM", "chrome.exe"])
+        .output()
+        .await;
+    let _ = tokio::process::Command::new("taskkill")
+        .args(["/F", "/IM", "chromedriver.exe"])
+        .output()
+        .await;
+}
+
+#[cfg(not(target_os = "windows"))]
+async fn force_kill_chrome_processes() {
+    // No-op on non-Windows platforms
+}
+
+/// Verify Chrome processes are cleaned up (Windows only)
+#[cfg(target_os = "windows")]
+async fn verify_chrome_cleanup() {
+    if let Ok(output) = tokio::process::Command::new("tasklist")
+        .args(["/FI", "IMAGENAME eq chrome.exe"])
+        .output()
+        .await
+    {
+        let stdout = String::from_utf8_lossy(&output.stdout);
+        let chrome_count = stdout.lines().filter(|line| line.contains("chrome.exe")).count();
+
+        if chrome_count > 0 {
+            logger::log_warn(&format!("⚠️ {} Chrome processes still running after cleanup!", chrome_count)).await;
+        } else {
+            logger::log_info("✓ All Chrome processes cleaned up").await;
+        }
+    }
+}
+
+#[cfg(not(target_os = "windows"))]
+async fn verify_chrome_cleanup() {
+    // No-op on non-Windows platforms
+}
+
+/// Complete cleanup sequence: shutdown pools, cleanup containers, kill processes
+async fn perform_full_cleanup(
+    pool: &ChromeDriverPool,
+    proxy_pool: Option<&DockerVpnProxyPool>,
+) {
+    shutdown_chrome_pool(pool).await;
+
+    if let Some(pp) = proxy_pool {
+        shutdown_proxy_pool(pp).await;
+        cleanup_all_proxy_containers().await.ok();
+    }
+
+    force_kill_chrome_processes().await;
+}
+
+/// Create temporary ChromeDriver pool, fetch VPN credentials, and cleanup
+async fn fetch_vpn_credentials_with_temp_pool(
+    config: &Config,
+    paths: &DataPaths,
+    monitoring_handle: &monitoring::MonitoringHandle,
+) -> Result<Option<Arc<DockerVpnProxyPool>>> {
+    logger::log_info("VPN Rotation Enabled – Fetching latest VPNBook configs").await;
+
+    // Create temp pool
+    logger::log_info("Creating temporary ChromeDriver pool for VPN credential fetch...").await;
+    let temp_pool = Arc::new(ChromeDriverPool::new_with_proxy_and_task_limit(
+        None,
+        config,
+        Some(monitoring_handle.clone())
+    ).await?);
+
+    // Fetch credentials
+    logger::log_info("Fetching VPNBook credentials...").await;
+    let (username, password, _files) = opnv::fetch_vpnbook_configs(&temp_pool, paths.cache_dir()).await?;
+    logger::log_info(&format!("VPNBook credentials → User: {}", username)).await;
+
+    // Cleanup temp pool
+    logger::log_info("Shutting down temporary pool...").await;
+    match temp_pool.shutdown().await {
+        Ok(()) => logger::log_info("✓ Temporary pool shut down successfully").await,
+        Err(e) => {
+            logger::log_error(&format!("✗ Temp pool shutdown error: {}", e)).await;
+            force_kill_chrome_processes().await;
+        }
+    }
+
+    tokio::time::sleep(tokio::time::Duration::from_secs(2)).await;
+
+    // Count VPN servers and create proxy pool
+    let server_count = std::fs::read_dir(paths.cache_openvpn_dir())?
+        .filter(|e| e.as_ref().unwrap().path().is_dir())
+        .count();
+
+    if server_count == 0 {
+        logger::log_warn("No VPN servers found – continuing without VPN").await;
+        return Ok(None);
+    }
+
+    logger::log_info(&format!("Found {} VPN servers – starting Docker proxy containers", server_count)).await;
+
+    let number_proxy_instances = config.proxy_instances_per_certificate.unwrap_or(1);
+    let proxy_pool = Arc::new(DockerVpnProxyPool::new(
+        paths.cache_openvpn_dir(),
+        username,
+        password,
+        number_proxy_instances
+    ).await?);
+
+    logger::log_info(&format!("All {} Docker proxy containers started and ready", proxy_pool.num_proxies())).await;
+
+    // Emit proxy connection events
+    for i in 0..proxy_pool.num_proxies() {
+        if let Some(proxy_info) = proxy_pool.get_proxy_info(i) {
+            monitoring_handle.emit(monitoring::MonitoringEvent::ProxyConnected {
+                container_name: proxy_info.container_name.clone(),
+                ip_address: proxy_info.ip_address.clone(),
+                port: proxy_info.port,
+            });
+        }
+    }
+
+    Ok(Some(proxy_pool))
+}
+
+/// Initialize monitoring system
+async fn initialize_monitoring(
+    config: &Config,
+    paths: &DataPaths,
+) -> Result<(monitoring::MonitoringHandle, tokio::task::JoinHandle<()>)> {
+    let config_snapshot = ConfigSnapshot {
+        max_parallel_instances: config.max_parallel_instances,
+        max_tasks_per_instance: config.max_tasks_per_instance,
+        enable_vpn_rotation: config.enable_vpn_rotation,
+        max_requests_per_session: config.max_requests_per_session,
+        min_request_interval_ms: config.min_request_interval_ms,
+        max_retry_attempts: config.max_retry_attempts,
+    };
+
+    let (monitoring_handle, monitoring_task) = init_monitoring(
+        config_snapshot,
+        paths.logs_dir().to_path_buf(),
+        3030,
+    ).await?;
+
+    monitoring_handle.emit(monitoring::MonitoringEvent::PoolInitialized {
+        pool_size: config.max_parallel_instances,
+        with_proxy: config.enable_vpn_rotation,
+        with_rotation: config.max_tasks_per_instance > 0,
+    });
+
+    logger::log_info("Monitoring dashboard available at http://localhost:3030").await;
+
+    Ok((monitoring_handle, monitoring_task))
+}
+
+/// Setup Ctrl+C handler for graceful shutdown
+fn setup_shutdown_handler(
+    shutdown_flag: Arc<AtomicBool>,
+    pool: Arc<ChromeDriverPool>,
+    proxy_pool: Option<Arc<DockerVpnProxyPool>>,
+) {
+    tokio::spawn(async move {
+        tokio::signal::ctrl_c().await.ok();
+        logger::log_info("Ctrl+C received – shutting down gracefully...").await;
+
+        shutdown_flag.store(true, Ordering::SeqCst);
+        tokio::time::sleep(tokio::time::Duration::from_secs(2)).await;
+
+        perform_full_cleanup(&pool, proxy_pool.as_deref()).await;
+
+        logger::log_info("Shutdown complete").await;
+        std::process::exit(0);
+    });
+}
+
+fn format_duration(duration: Duration) -> String {
+    let total_seconds = duration.as_secs();
+
+    let days = total_seconds / 86400;
+    let hours = (total_seconds % 86400) / 3600;
+    let minutes = (total_seconds % 3600) / 60;
+    let seconds = total_seconds % 60;
+
+    format!("{:02}::{:02}::{:02}::{:02}", days, hours, minutes, seconds)
+}
+
+async fn create_state_file(paths: &DataPaths) -> Result<()> {
+    let integrity_path = paths.integrity_dir().join("state.jsonl");
+
+    // Use OpenOptions to create the file only if it doesn't exist
+    OpenOptions::new()
+        .create(true) // Create if it doesn't exist
+        .write(true)  // Ensure we can write to the file
+        .open(&integrity_path)?;
+    logger::log_info(&format!("Checked or created file: {}", integrity_path.display())).await;
+
+    Ok(())
+}
+
+async fn visualize_checkpoint_dependencies(paths: &DataPaths) -> Result<()> {
+    // Add more detailed error handling
+    match StateManager::new(
+        paths.integrity_dir(),
+    ).await {
+        Ok(manager) => {
+            logger::log_info("✓ Dependency configuration loaded successfully").await;
+            manager.print_dependency_graph();
+
+            let dot = manager.get_dependency_config().to_dot();
+            let dot_path = paths.integrity_dir().join("checkpoint_dependencies.dot");
+            std::fs::write(&dot_path, dot)?;
+
+            logger::log_info(&format!("✓ DOT file written to: {}", dot_path.display())).await;
+            Ok(())
+        }
+        Err(e) => {
+            logger::log_error(&format!("✗ Failed to load dependency config: {}", e)).await;
+            Err(e)
+        }
+    }
+}
+
+// ============================================================================
+// MAIN FUNCTION - Simplified with extracted helpers
+// ============================================================================
+
+#[tokio::main]
+async fn main() -> Result<()> {
+    // Initial setup
+    let start = Instant::now();
+    let paths = DataPaths::new(".")?;
+
+    start_docker_desktop().await;
+    cleanup_all_proxy_containers().await.ok();
+    create_state_file(&paths).await.ok();
+    visualize_checkpoint_dependencies(&paths).await.ok();
+
+    let config = Config::load().unwrap_or_else(|_| {
+        eprintln!("Using default configuration");
+        Config::default()
+    });
+
+    // Initialize monitoring
+    let (monitoring_handle, _monitoring_task) = initialize_monitoring(&config, &paths).await?;
+
+    // Initialize debug logger
+    logger::init_debug_logger(paths.logs_dir()).await.ok();
+    logger::log_info("=== Economic Webscraper Started ===").await;
+    logger::log_info(&format!(
+        "Config → parallel_instances: {}, task_limit: {}, vpn_rotation: {}, proxy_instances_per_certificate: {:?}",
+        config.max_parallel_instances,
+        config.max_tasks_per_instance,
+        config.enable_vpn_rotation,
+        config.proxy_instances_per_certificate
+    )).await;
+
+    let shutdown_flag = Arc::new(AtomicBool::new(false));
+
+    // Fetch VPN credentials and setup proxy pool if enabled
+    let proxy_pool = if config.enable_vpn_rotation {
+        fetch_vpn_credentials_with_temp_pool(&config, &paths, &monitoring_handle).await?
+    } else {
+        logger::log_info("VPN rotation disabled – using direct connection").await;
+        None
+    };
+
+    // Create main ChromeDriver pool
+    logger::log_info(&format!("Creating ChromeDriver pool with {} instances...", config.max_parallel_instances)).await;
+
+    let pool = Arc::new(ChromeDriverPool::new_with_proxy_and_task_limit(
+        proxy_pool.clone(),
+        &config,
+        Some(monitoring_handle.clone())
Some(monitoring_handle.clone())
|
||||||
|
).await?);
|
||||||
|
|
||||||
|
logger::log_info(&format!("ChromeDriver pool ready with {} instances", config.max_parallel_instances)).await;
|
||||||
|
|
||||||
|
// Setup Ctrl+C handler
|
||||||
|
setup_shutdown_handler(
|
||||||
|
Arc::clone(&shutdown_flag),
|
||||||
|
Arc::clone(&pool),
|
||||||
|
proxy_pool.clone(),
|
||||||
|
);
|
||||||
|
|
||||||
|
// Run scraping jobs
|
||||||
|
check_shutdown!(&shutdown_flag);
|
||||||
|
|
||||||
|
logger::log_info("--- Starting ECONOMIC data update ---").await;
|
||||||
|
economic::run_full_update(&config, &pool, &shutdown_flag).await?;
|
||||||
|
logger::log_info("Economic update completed").await;
|
||||||
|
|
||||||
|
check_shutdown!(&shutdown_flag);
|
||||||
|
|
||||||
|
logger::log_info("--- Starting CORPORATE data update ---").await;
|
||||||
|
corporate::run_full_update(&config, &pool, &shutdown_flag).await?;
|
||||||
|
logger::log_info("Corporate update completed").await;
|
||||||
|
|
||||||
|
check_shutdown!(&shutdown_flag);
|
||||||
|
|
||||||
|
// Final cleanup if not already shutting down
|
||||||
|
perform_full_cleanup(&pool, proxy_pool.as_deref()).await;
|
||||||
|
verify_chrome_cleanup().await;
|
||||||
|
|
||||||
|
logger::log_info(&format!("=== Application finished after {} ===", format_duration(start.elapsed()))).await;
|
||||||
|
|
||||||
|
logger::log_info("=== Application finished successfully ===").await;
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
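
// The check_shutdown! macro used in main() is defined elsewhere in the crate and is
// not part of this diff. A plausible minimal shape (hypothetical sketch only; the
// real definition may differ):
//
// macro_rules! check_shutdown {
//     ($flag:expr) => {
//         if $flag.load(std::sync::atomic::Ordering::SeqCst) {
//             logger::log_info("Shutdown flag set – skipping remaining work").await;
//             return Ok(());
//         }
//     };
// }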
830
src/monitoring/dashboard.html
Normal file
@@ -0,0 +1,830 @@
<!DOCTYPE html>
|
||||||
|
<html lang="en">
|
||||||
|
<head>
|
||||||
|
<meta charset="UTF-8">
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||||
|
<title>Scraper Monitoring Dashboard</title>
|
||||||
|
<style>
|
||||||
|
* {
|
||||||
|
margin: 0;
|
||||||
|
padding: 0;
|
||||||
|
box-sizing: border-box;
|
||||||
|
}
|
||||||
|
|
||||||
|
body {
|
||||||
|
font-family: 'Courier New', monospace;
|
||||||
|
background: #1a1a1a;
|
||||||
|
color: #f0f0f0;
|
||||||
|
padding: 20px;
|
||||||
|
font-size: 13px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.header {
|
||||||
|
text-align: center;
|
||||||
|
padding: 20px;
|
||||||
|
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
|
||||||
|
border-radius: 8px;
|
||||||
|
margin-bottom: 20px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.header h1 {
|
||||||
|
font-size: 28px;
|
||||||
|
margin-bottom: 5px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.header .uptime {
|
||||||
|
font-size: 14px;
|
||||||
|
opacity: 0.9;
|
||||||
|
}
|
||||||
|
|
||||||
|
.section {
|
||||||
|
background: #2a2a2a;
|
||||||
|
border: 2px solid #444;
|
||||||
|
padding: 15px;
|
||||||
|
margin-bottom: 20px;
|
||||||
|
border-radius: 5px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.section-title {
|
||||||
|
font-size: 16px;
|
||||||
|
font-weight: bold;
|
||||||
|
margin-bottom: 12px;
|
||||||
|
padding-bottom: 8px;
|
||||||
|
border-bottom: 2px solid #667eea;
|
||||||
|
display: flex;
|
||||||
|
align-items: center;
|
||||||
|
gap: 8px;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Config Section */
|
||||||
|
.config-grid {
|
||||||
|
display: grid;
|
||||||
|
grid-template-columns: repeat(3, 1fr);
|
||||||
|
gap: 15px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.config-item {
|
||||||
|
background: #333;
|
||||||
|
padding: 12px;
|
||||||
|
border-radius: 4px;
|
||||||
|
border-left: 3px solid #667eea;
|
||||||
|
}
|
||||||
|
|
||||||
|
.config-label {
|
||||||
|
color: #888;
|
||||||
|
font-size: 11px;
|
||||||
|
text-transform: uppercase;
|
||||||
|
margin-bottom: 5px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.config-value {
|
||||||
|
color: #4CAF50;
|
||||||
|
font-size: 18px;
|
||||||
|
font-weight: bold;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Instance Grid */
|
||||||
|
.instance-grid {
|
||||||
|
display: grid;
|
||||||
|
grid-template-columns: repeat(auto-fit, minmax(450px, 1fr));
|
||||||
|
gap: 15px;
|
||||||
|
margin-top: 10px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.instance-box {
|
||||||
|
background: #333;
|
||||||
|
border: 2px solid #555;
|
||||||
|
border-radius: 5px;
|
||||||
|
padding: 0;
|
||||||
|
display: flex;
|
||||||
|
gap: 0;
|
||||||
|
overflow: hidden;
|
||||||
|
transition: border-color 0.3s;
|
||||||
|
}
|
||||||
|
|
||||||
|
.instance-box.status-idle {
|
||||||
|
border-color: #666;
|
||||||
|
}
|
||||||
|
|
||||||
|
.instance-box.status-active {
|
||||||
|
border-color: #4CAF50;
|
||||||
|
box-shadow: 0 0 10px rgba(76, 175, 80, 0.3);
|
||||||
|
}
|
||||||
|
|
||||||
|
.instance-box.status-renewing {
|
||||||
|
border-color: #FF9800;
|
||||||
|
box-shadow: 0 0 10px rgba(255, 152, 0, 0.3);
|
||||||
|
}
|
||||||
|
|
||||||
|
.instance-box.status-error {
|
||||||
|
border-color: #f44336;
|
||||||
|
box-shadow: 0 0 10px rgba(244, 67, 54, 0.3);
|
||||||
|
}
|
||||||
|
|
||||||
|
.instance-side,
|
||||||
|
.proxy-side {
|
||||||
|
flex: 1;
|
||||||
|
padding: 12px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.instance-side {
|
||||||
|
background: #3a3a3a;
|
||||||
|
border-right: 1px solid #555;
|
||||||
|
}
|
||||||
|
|
||||||
|
.proxy-side {
|
||||||
|
background: #2a3a4a;
|
||||||
|
}
|
||||||
|
|
||||||
|
.side-header {
|
||||||
|
font-weight: bold;
|
||||||
|
font-size: 14px;
|
||||||
|
margin-bottom: 10px;
|
||||||
|
padding-bottom: 5px;
|
||||||
|
border-bottom: 1px solid #555;
|
||||||
|
display: flex;
|
||||||
|
align-items: center;
|
||||||
|
gap: 5px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.status-badge {
|
||||||
|
display: inline-block;
|
||||||
|
padding: 2px 8px;
|
||||||
|
border-radius: 3px;
|
||||||
|
font-size: 11px;
|
||||||
|
font-weight: bold;
|
||||||
|
text-transform: uppercase;
|
||||||
|
}
|
||||||
|
|
||||||
|
.status-badge.idle {
|
||||||
|
background: #666;
|
||||||
|
color: #fff;
|
||||||
|
}
|
||||||
|
|
||||||
|
.status-badge.active {
|
||||||
|
background: #4CAF50;
|
||||||
|
color: #fff;
|
||||||
|
}
|
||||||
|
|
||||||
|
.status-badge.renewing {
|
||||||
|
background: #FF9800;
|
||||||
|
color: #fff;
|
||||||
|
}
|
||||||
|
|
||||||
|
.status-badge.error {
|
||||||
|
background: #f44336;
|
||||||
|
color: #fff;
|
||||||
|
}
|
||||||
|
|
||||||
|
.metric-row {
|
||||||
|
display: flex;
|
||||||
|
justify-content: space-between;
|
||||||
|
padding: 4px 0;
|
||||||
|
font-size: 12px;
|
||||||
|
border-bottom: 1px solid #444;
|
||||||
|
}
|
||||||
|
|
||||||
|
.metric-row:last-child {
|
||||||
|
border-bottom: none;
|
||||||
|
}
|
||||||
|
|
||||||
|
.metric-label {
|
||||||
|
color: #888;
|
||||||
|
}
|
||||||
|
|
||||||
|
.metric-value {
|
||||||
|
color: #4CAF50;
|
||||||
|
font-weight: bold;
|
||||||
|
}
|
||||||
|
|
||||||
|
.metric-value.warning {
|
||||||
|
color: #FF9800;
|
||||||
|
}
|
||||||
|
|
||||||
|
.metric-value.danger {
|
||||||
|
color: #f44336;
|
||||||
|
}
|
||||||
|
|
||||||
|
.current-url {
|
||||||
|
margin-top: 8px;
|
||||||
|
padding-top: 8px;
|
||||||
|
border-top: 1px solid #555;
|
||||||
|
font-size: 11px;
|
||||||
|
color: #aaa;
|
||||||
|
word-wrap: break-word;
|
||||||
|
}
|
||||||
|
|
||||||
|
.no-proxy {
|
||||||
|
text-align: center;
|
||||||
|
color: #666;
|
||||||
|
padding: 30px 10px;
|
||||||
|
font-style: italic;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Global Stats */
|
||||||
|
.stats-grid {
|
||||||
|
display: grid;
|
||||||
|
grid-template-columns: repeat(auto-fit, minmax(150px, 1fr));
|
||||||
|
gap: 12px;
|
||||||
|
margin-top: 10px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.stat-box {
|
||||||
|
background: #333;
|
||||||
|
padding: 15px;
|
||||||
|
border-radius: 5px;
|
||||||
|
text-align: center;
|
||||||
|
border-left: 4px solid #667eea;
|
||||||
|
}
|
||||||
|
|
||||||
|
.stat-value {
|
||||||
|
font-size: 28px;
|
||||||
|
font-weight: bold;
|
||||||
|
color: #4CAF50;
|
||||||
|
margin-bottom: 5px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.stat-label {
|
||||||
|
font-size: 11px;
|
||||||
|
color: #888;
|
||||||
|
text-transform: uppercase;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Yahoo Stats */
|
||||||
|
.yahoo-stats-grid {
|
||||||
|
display: grid;
|
||||||
|
grid-template-columns: repeat(auto-fit, minmax(180px, 1fr));
|
||||||
|
gap: 12px;
|
||||||
|
margin-top: 10px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.yahoo-stat-box {
|
||||||
|
background: #2a3a4a;
|
||||||
|
padding: 15px;
|
||||||
|
border-radius: 5px;
|
||||||
|
text-align: center;
|
||||||
|
border-left: 4px solid #FF9800;
|
||||||
|
}
|
||||||
|
|
||||||
|
.yahoo-stat-value {
|
||||||
|
font-size: 28px;
|
||||||
|
font-weight: bold;
|
||||||
|
color: #FF9800;
|
||||||
|
margin-bottom: 5px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.yahoo-stat-label {
|
||||||
|
font-size: 11px;
|
||||||
|
color: #aaa;
|
||||||
|
text-transform: uppercase;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Logs */
|
||||||
|
.log-container {
|
||||||
|
max-height: 300px;
|
||||||
|
overflow-y: auto;
|
||||||
|
background: #1a1a1a;
|
||||||
|
padding: 10px;
|
||||||
|
border-radius: 4px;
|
||||||
|
font-size: 12px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.log-container::-webkit-scrollbar {
|
||||||
|
width: 8px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.log-container::-webkit-scrollbar-track {
|
||||||
|
background: #2a2a2a;
|
||||||
|
}
|
||||||
|
|
||||||
|
.log-container::-webkit-scrollbar-thumb {
|
||||||
|
background: #667eea;
|
||||||
|
border-radius: 4px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.log-entry {
|
||||||
|
padding: 4px 0;
|
||||||
|
border-bottom: 1px solid #333;
|
||||||
|
display: flex;
|
||||||
|
gap: 10px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.log-entry:last-child {
|
||||||
|
border-bottom: none;
|
||||||
|
}
|
||||||
|
|
||||||
|
.log-time {
|
||||||
|
color: #666;
|
||||||
|
font-weight: bold;
|
||||||
|
min-width: 70px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.log-message {
|
||||||
|
flex: 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
.log-message.info {
|
||||||
|
color: #4CAF50;
|
||||||
|
}
|
||||||
|
|
||||||
|
.log-message.warn {
|
||||||
|
color: #FF9800;
|
||||||
|
}
|
||||||
|
|
||||||
|
.log-message.error {
|
||||||
|
color: #f44336;
|
||||||
|
}
|
||||||
|
|
||||||
|
.connection-status {
|
||||||
|
position: fixed;
|
||||||
|
top: 20px;
|
||||||
|
right: 20px;
|
||||||
|
padding: 8px 15px;
|
||||||
|
border-radius: 20px;
|
||||||
|
font-size: 12px;
|
||||||
|
font-weight: bold;
|
||||||
|
z-index: 1000;
|
||||||
|
}
|
||||||
|
|
||||||
|
.connection-status.connected {
|
||||||
|
background: #4CAF50;
|
||||||
|
color: white;
|
||||||
|
}
|
||||||
|
|
||||||
|
.connection-status.disconnected {
|
||||||
|
background: #f44336;
|
||||||
|
color: white;
|
||||||
|
}
|
||||||
|
|
||||||
|
@keyframes pulse {
|
||||||
|
0%, 100% {
|
||||||
|
opacity: 1;
|
||||||
|
}
|
||||||
|
50% {
|
||||||
|
opacity: 0.5;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
.pulse {
|
||||||
|
animation: pulse 2s infinite;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Yahoo Client Box */
|
||||||
|
.yahoo-client-box {
|
||||||
|
background: #2a3a4a;
|
||||||
|
border: 2px solid #FF9800;
|
||||||
|
border-radius: 5px;
|
||||||
|
padding: 12px;
|
||||||
|
display: flex;
|
||||||
|
gap: 0;
|
||||||
|
overflow: hidden;
|
||||||
|
}
|
||||||
|
|
||||||
|
.yahoo-client-side {
|
||||||
|
flex: 1;
|
||||||
|
padding: 12px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.yahoo-client-side.left {
|
||||||
|
background: #3a4a5a;
|
||||||
|
border-right: 1px solid #555;
|
||||||
|
}
|
||||||
|
|
||||||
|
.yahoo-client-side.right {
|
||||||
|
background: #2a3a4a;
|
||||||
|
}
|
||||||
|
</style>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<div class="connection-status" id="connection-status">
|
||||||
|
Connecting...
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="header">
|
||||||
|
<h1>🚀 Scraper Monitoring Dashboard</h1>
|
||||||
|
<div class="uptime" id="uptime">Uptime: Loading...</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Config Section -->
|
||||||
|
<div class="section">
|
||||||
|
<div class="section-title">⚙️ CONFIGURATION</div>
|
||||||
|
<div class="config-grid" id="config"></div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Pool Status Section -->
|
||||||
|
<div class="section">
|
||||||
|
<div class="section-title">🔧 POOL STATUS</div>
|
||||||
|
<div class="instance-grid" id="instances"></div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Yahoo API Section -->
|
||||||
|
<div class="section">
|
||||||
|
<div class="section-title">📈 YAHOO API METRICS</div>
|
||||||
|
<div class="yahoo-stats-grid" id="yahoo-stats"></div>
|
||||||
|
<div class="instance-grid" id="yahoo-clients"></div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Global Metrics Section -->
|
||||||
|
<div class="section">
|
||||||
|
<div class="section-title">📊 GLOBAL METRICS</div>
|
||||||
|
<div class="stats-grid" id="global-stats"></div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Logs Section -->
|
||||||
|
<div class="section">
|
||||||
|
<div class="section-title">📝 RECENT LOGS</div>
|
||||||
|
<div class="log-container" id="logs"></div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<script>
|
||||||
|
let ws = null;
|
||||||
|
let reconnectInterval = null;
|
||||||
|
|
||||||
|
function connect() {
|
||||||
|
ws = new WebSocket('ws://' + window.location.host + '/ws');
|
||||||
|
|
||||||
|
ws.onopen = () => {
|
||||||
|
console.log('WebSocket connected');
|
||||||
|
updateConnectionStatus(true);
|
||||||
|
if (reconnectInterval) {
|
||||||
|
clearInterval(reconnectInterval);
|
||||||
|
reconnectInterval = null;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
ws.onmessage = (event) => {
|
||||||
|
try {
|
||||||
|
const state = JSON.parse(event.data);
|
||||||
|
updateDashboard(state);
|
||||||
|
} catch (error) {
|
||||||
|
console.error('Failed to parse message:', error);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
ws.onclose = () => {
|
||||||
|
console.log('WebSocket disconnected');
|
||||||
|
updateConnectionStatus(false);
|
||||||
|
// Attempt to reconnect every 3 seconds
|
||||||
|
if (!reconnectInterval) {
|
||||||
|
reconnectInterval = setInterval(() => {
|
||||||
|
console.log('Attempting to reconnect...');
|
||||||
|
connect();
|
||||||
|
}, 3000);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
ws.onerror = (error) => {
|
||||||
|
console.error('WebSocket error:', error);
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
function updateConnectionStatus(connected) {
|
||||||
|
const status = document.getElementById('connection-status');
|
||||||
|
if (connected) {
|
||||||
|
status.textContent = '● Connected';
|
||||||
|
status.className = 'connection-status connected';
|
||||||
|
} else {
|
||||||
|
status.textContent = '● Disconnected';
|
||||||
|
status.className = 'connection-status disconnected pulse';
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
function updateDashboard(state) {
|
||||||
|
updateConfig(state.config);
|
||||||
|
updateInstances(state.instances);
|
||||||
|
updateGlobalStats(state.global);
|
||||||
|
updateYahooStats(state.global);
|
||||||
|
updateYahooClients(state.yahoo_clients);
|
||||||
|
updateLogs(state.logs);
|
||||||
|
}
|
||||||
|
|
||||||
|
function updateConfig(config) {
|
||||||
|
const container = document.getElementById('config');
|
||||||
|
container.innerHTML = `
|
||||||
|
<div class="config-item">
|
||||||
|
<div class="config-label">Parallel Instances</div>
|
||||||
|
<div class="config-value">${config.max_parallel_instances}</div>
|
||||||
|
</div>
|
||||||
|
<div class="config-item">
|
||||||
|
<div class="config-label">Tasks per Instance</div>
|
||||||
|
<div class="config-value">${config.max_tasks_per_instance || 'Unlimited'}</div>
|
||||||
|
</div>
|
||||||
|
<div class="config-item">
|
||||||
|
<div class="config-label">VPN Rotation</div>
|
||||||
|
<div class="config-value">${config.enable_vpn_rotation ? '✓ Enabled' : '✗ Disabled'}</div>
|
||||||
|
</div>
|
||||||
|
<div class="config-item">
|
||||||
|
<div class="config-label">Requests per Session</div>
|
||||||
|
<div class="config-value">${config.max_requests_per_session}</div>
|
||||||
|
</div>
|
||||||
|
<div class="config-item">
|
||||||
|
<div class="config-label">Min Request Interval</div>
|
||||||
|
<div class="config-value">${config.min_request_interval_ms}ms</div>
|
||||||
|
</div>
|
||||||
|
<div class="config-item">
|
||||||
|
<div class="config-label">Max Retry Attempts</div>
|
||||||
|
<div class="config-value">${config.max_retry_attempts}</div>
|
||||||
|
</div>
|
||||||
|
`;
|
||||||
|
}
|
||||||
|
|
||||||
|
function updateInstances(instances) {
|
||||||
|
const container = document.getElementById('instances');
|
||||||
|
if (!instances || instances.length === 0) {
|
||||||
|
container.innerHTML = '<div style="text-align: center; padding: 40px; color: #666;">No instances available</div>';
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
container.innerHTML = instances.map(inst => {
|
||||||
|
const statusClass = `status-${inst.status}`;
|
||||||
|
const proxy = inst.connected_proxy;
|
||||||
|
|
||||||
|
const successRate = inst.total_requests > 0
|
||||||
|
? ((inst.success_count / inst.total_requests) * 100).toFixed(1)
|
||||||
|
: '0.0';
|
||||||
|
|
||||||
|
const yahooSuccessRate = inst.yahoo_requests > 0
|
||||||
|
? ((inst.yahoo_success / inst.yahoo_requests) * 100).toFixed(1)
|
||||||
|
: '0.0';
|
||||||
|
|
||||||
|
return `
|
||||||
|
<div class="instance-box ${statusClass}">
|
||||||
|
<div class="instance-side">
|
||||||
|
<div class="side-header">
|
||||||
|
🖥️ Instance #${inst.id}
|
||||||
|
<span class="status-badge ${inst.status}">${inst.status}</span>
|
||||||
|
</div>
|
||||||
|
<div class="metric-row">
|
||||||
|
<span class="metric-label">Current Tasks</span>
|
||||||
|
<span class="metric-value ${inst.tasks_current_session >= inst.tasks_max ? 'warning' : ''}">
|
||||||
|
${inst.tasks_current_session}/${inst.tasks_max}
|
||||||
|
</span>
|
||||||
|
</div>
|
||||||
|
<div class="metric-row">
|
||||||
|
<span class="metric-label">Session Requests</span>
|
||||||
|
<span class="metric-value">${inst.session_requests}</span>
|
||||||
|
</div>
|
||||||
|
<div class="metric-row">
|
||||||
|
<span class="metric-label">Total Requests</span>
|
||||||
|
<span class="metric-value">${inst.total_requests}</span>
|
||||||
|
</div>
|
||||||
|
<div class="metric-row">
|
||||||
|
<span class="metric-label">Success / Fail</span>
|
||||||
|
<span class="metric-value">${inst.success_count} / ${inst.failure_count}</span>
|
||||||
|
</div>
|
||||||
|
<div class="metric-row">
|
||||||
|
<span class="metric-label">Success Rate</span>
|
||||||
|
<span class="metric-value ${successRate < 50 ? 'danger' : successRate < 80 ? 'warning' : ''}">
|
||||||
|
${successRate}%
|
||||||
|
</span>
|
||||||
|
</div>
|
||||||
|
<div class="metric-row">
|
||||||
|
<span class="metric-label">Yahoo Requests</span>
|
||||||
|
<span class="metric-value">${inst.yahoo_requests}</span>
|
||||||
|
</div>
|
||||||
|
<div class="metric-row">
|
||||||
|
<span class="metric-label">Yahoo Rate</span>
|
||||||
|
<span class="metric-value ${yahooSuccessRate < 50 ? 'danger' : yahooSuccessRate < 80 ? 'warning' : ''}">
|
||||||
|
${yahooSuccessRate}%
|
||||||
|
</span>
|
||||||
|
</div>
|
||||||
|
<div class="metric-row">
|
||||||
|
<span class="metric-label">Last Activity</span>
|
||||||
|
<span class="metric-value">${inst.last_activity}</span>
|
||||||
|
</div>
|
||||||
|
${inst.current_task ? `
|
||||||
|
<div class="current-url">
|
||||||
|
<strong>Current URL:</strong><br>
|
||||||
|
${escapeHtml(inst.current_task)}
|
||||||
|
</div>
|
||||||
|
` : ''}
|
||||||
|
</div>
|
||||||
|
|
||||||
|
${proxy ? `
|
||||||
|
<div class="proxy-side">
|
||||||
|
<div class="side-header">
|
||||||
|
📡 ${proxy.container_name}
|
||||||
|
<span class="status-badge ${proxy.status}">${proxy.status}</span>
|
||||||
|
</div>
|
||||||
|
<div class="metric-row">
|
||||||
|
<span class="metric-label">IP Address</span>
|
||||||
|
<span class="metric-value">${proxy.ip_address}</span>
|
||||||
|
</div>
|
||||||
|
<div class="metric-row">
|
||||||
|
<span class="metric-label">Port</span>
|
||||||
|
<span class="metric-value">${proxy.port}</span>
|
||||||
|
</div>
|
||||||
|
<div class="metric-row">
|
||||||
|
<span class="metric-label">Status</span>
|
||||||
|
<span class="metric-value">${proxy.status}</span>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
` : `
|
||||||
|
<div class="proxy-side">
|
||||||
|
<div class="no-proxy">
|
||||||
|
🌐<br>
|
||||||
|
Direct Connection<br>
|
||||||
|
(No Proxy)
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
`}
|
||||||
|
</div>
|
||||||
|
`;
|
||||||
|
}).join('');
|
||||||
|
}
|
||||||
|
|
||||||
|
function updateYahooStats(global) {
|
||||||
|
const container = document.getElementById('yahoo-stats');
|
||||||
|
const yahooSuccessRate = global.total_yahoo_requests > 0
|
||||||
|
? ((global.successful_yahoo_requests / global.total_yahoo_requests) * 100).toFixed(1)
|
||||||
|
: '0.0';
|
||||||
|
|
||||||
|
container.innerHTML = `
|
||||||
|
<div class="yahoo-stat-box">
|
||||||
|
<div class="yahoo-stat-value">${global.total_yahoo_requests || 0}</div>
|
||||||
|
<div class="yahoo-stat-label">Total Requests</div>
|
||||||
|
</div>
|
||||||
|
<div class="yahoo-stat-box">
|
||||||
|
<div class="yahoo-stat-value">${yahooSuccessRate}%</div>
|
||||||
|
<div class="yahoo-stat-label">Success Rate</div>
|
||||||
|
</div>
|
||||||
|
<div class="yahoo-stat-box">
|
||||||
|
<div class="yahoo-stat-value">${global.successful_yahoo_requests || 0}</div>
|
||||||
|
<div class="yahoo-stat-label">Successful</div>
|
||||||
|
</div>
|
||||||
|
<div class="yahoo-stat-box">
|
||||||
|
<div class="yahoo-stat-value">${global.failed_yahoo_requests || 0}</div>
|
||||||
|
<div class="yahoo-stat-label">Failed</div>
|
||||||
|
</div>
|
||||||
|
<div class="yahoo-stat-box">
|
||||||
|
<div class="yahoo-stat-value">${global.yahoo_client_count || 0}</div>
|
||||||
|
<div class="yahoo-stat-label">Active Clients</div>
|
||||||
|
</div>
|
||||||
|
<div class="yahoo-stat-box">
|
||||||
|
<div class="yahoo-stat-value">${global.yahoo_batch_requests || 0}</div>
|
||||||
|
<div class="yahoo-stat-label">Batch Requests</div>
|
||||||
|
</div>
|
||||||
|
<div class="yahoo-stat-box">
|
||||||
|
<div class="yahoo-stat-value">${global.yahoo_session_renewals || 0}</div>
|
||||||
|
<div class="yahoo-stat-label">Session Renewals</div>
|
||||||
|
</div>
|
||||||
|
`;
|
||||||
|
}
|
||||||
|
|
||||||
|
function updateYahooClients(yahooClients) {
|
||||||
|
const container = document.getElementById('yahoo-clients');
|
||||||
|
if (!yahooClients || yahooClients.length === 0) {
|
||||||
|
container.innerHTML = '<div style="text-align: center; padding: 40px; color: #666;">No Yahoo clients available</div>';
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
container.innerHTML = yahooClients.map(client => {
|
||||||
|
const successRate = client.requests_total > 0
|
||||||
|
? ((client.requests_successful / client.requests_total) * 100).toFixed(1)
|
||||||
|
: '0.0';
|
||||||
|
|
||||||
|
return `
|
||||||
|
<div class="yahoo-client-box">
|
||||||
|
<div class="yahoo-client-side left">
|
||||||
|
<div class="side-header">
|
||||||
|
📊 Yahoo Client #${client.instance_id}
|
||||||
|
${client.has_proxy ? '🔗' : '🌐'}
|
||||||
|
</div>
|
||||||
|
<div class="metric-row">
|
||||||
|
<span class="metric-label">Total Requests</span>
|
||||||
|
<span class="metric-value">${client.requests_total}</span>
|
||||||
|
</div>
|
||||||
|
<div class="metric-row">
|
||||||
|
<span class="metric-label">Success / Fail</span>
|
||||||
|
<span class="metric-value">${client.requests_successful} / ${client.requests_failed}</span>
|
||||||
|
</div>
|
||||||
|
<div class="metric-row">
|
||||||
|
<span class="metric-label">Success Rate</span>
|
||||||
|
<span class="metric-value ${successRate < 50 ? 'danger' : successRate < 80 ? 'warning' : ''}">
|
||||||
|
${successRate}%
|
||||||
|
</span>
|
||||||
|
</div>
|
||||||
|
<div class="metric-row">
|
||||||
|
<span class="metric-label">Current / Max</span>
|
||||||
|
<span class="metric-value ${client.current_requests >= client.max_requests ? 'danger' : ''}">
|
||||||
|
${client.current_requests} / ${client.max_requests}
|
||||||
|
</span>
|
||||||
|
</div>
|
||||||
|
<div class="metric-row">
|
||||||
|
<span class="metric-label">Last Activity</span>
|
||||||
|
<span class="metric-value">${client.last_activity}</span>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div class="yahoo-client-side right">
|
||||||
|
${client.proxy_info ? `
|
||||||
|
<div class="side-header">🔗 ${client.proxy_info.container_name}</div>
|
||||||
|
<div class="metric-row">
|
||||||
|
<span class="metric-label">IP Address</span>
|
||||||
|
<span class="metric-value">${client.proxy_info.ip_address}</span>
|
||||||
|
</div>
|
||||||
|
<div class="metric-row">
|
||||||
|
<span class="metric-label">Port</span>
|
||||||
|
<span class="metric-value">${client.proxy_info.port}</span>
|
||||||
|
</div>
|
||||||
|
<div class="metric-row">
|
||||||
|
<span class="metric-label">Status</span>
|
||||||
|
<span class="metric-value">${client.proxy_info.status}</span>
|
||||||
|
</div>
|
||||||
|
` : `
|
||||||
|
<div class="no-proxy">
|
||||||
|
${client.has_proxy ? '⚠️' : '🌐'}<br>
|
||||||
|
${client.has_proxy ? 'Proxy Not Connected' : 'Direct Connection'}
|
||||||
|
</div>
|
||||||
|
`}
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
`;
|
||||||
|
}).join('');
|
||||||
|
}
|
||||||
|
|
||||||
|
function updateGlobalStats(global) {
|
||||||
|
const container = document.getElementById('global-stats');
|
||||||
|
|
||||||
|
const uptime = document.getElementById('uptime');
|
||||||
|
uptime.textContent = `Uptime: ${formatUptime(global.uptime_seconds)}`;
|
||||||
|
|
||||||
|
container.innerHTML = `
|
||||||
|
<div class="stat-box">
|
||||||
|
<div class="stat-value">${global.total_requests}</div>
|
||||||
|
<div class="stat-label">Total Requests</div>
|
||||||
|
</div>
|
||||||
|
<div class="stat-box">
|
||||||
|
<div class="stat-value">${global.success_rate.toFixed(1)}%</div>
|
||||||
|
<div class="stat-label">Success Rate</div>
|
||||||
|
</div>
|
||||||
|
<div class="stat-box">
|
||||||
|
<div class="stat-value">${global.successful_requests}</div>
|
||||||
|
<div class="stat-label">Successful</div>
|
||||||
|
</div>
|
||||||
|
<div class="stat-box">
|
||||||
|
<div class="stat-value">${global.failed_requests}</div>
|
||||||
|
<div class="stat-label">Failed</div>
|
||||||
|
</div>
|
||||||
|
<div class="stat-box">
|
||||||
|
<div class="stat-value">${global.session_renewals}</div>
|
||||||
|
<div class="stat-label">Session Renewals</div>
|
||||||
|
</div>
|
||||||
|
<div class="stat-box">
|
||||||
|
<div class="stat-value">${global.rotation_events}</div>
|
||||||
|
<div class="stat-label">Rotation Events</div>
|
||||||
|
</div>
|
||||||
|
<div class="stat-box">
|
||||||
|
<div class="stat-value">${global.navigation_timeouts}</div>
|
||||||
|
<div class="stat-label">Timeouts</div>
|
||||||
|
</div>
|
||||||
|
<div class="stat-box">
|
||||||
|
<div class="stat-value">${global.bot_detection_hits}</div>
|
||||||
|
<div class="stat-label">Bot Detection</div>
|
||||||
|
</div>
|
||||||
|
<div class="stat-box">
|
||||||
|
<div class="stat-value">${global.proxy_failures}</div>
|
||||||
|
<div class="stat-label">Proxy Failures</div>
|
||||||
|
</div>
|
||||||
|
`;
|
||||||
|
}
|
||||||
|
|
||||||
|
function updateLogs(logs) {
|
||||||
|
const container = document.getElementById('logs');
|
||||||
|
const wasScrolledToBottom = container.scrollHeight - container.scrollTop === container.clientHeight;
|
||||||
|
|
||||||
|
container.innerHTML = logs.map(log => `
|
||||||
|
<div class="log-entry">
|
||||||
|
<span class="log-time">${log.timestamp}</span>
|
||||||
|
<span class="log-message ${log.level}">${escapeHtml(log.message)}</span>
|
||||||
|
</div>
|
||||||
|
`).join('');
|
||||||
|
|
||||||
|
// Auto-scroll to bottom if user was already at bottom
|
||||||
|
if (wasScrolledToBottom) {
|
||||||
|
container.scrollTop = container.scrollHeight;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
function formatUptime(seconds) {
|
||||||
|
const hours = Math.floor(seconds / 3600);
|
||||||
|
const minutes = Math.floor((seconds % 3600) / 60);
|
||||||
|
const secs = seconds % 60;
|
||||||
|
return `${hours}h ${minutes}m ${secs}s`;
|
||||||
|
}
|
||||||
|
|
||||||
|
function escapeHtml(text) {
    const map = {
        '&': '&amp;',
        '<': '&lt;',
        '>': '&gt;',
        '"': '&quot;',
        "'": '&#039;'
    };
    return text.replace(/[&<>"']/g, m => map[m]);
}
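
// Example (illustrative): escapeHtml('<b>"AT&T"</b>') yields
// '&lt;b&gt;&quot;AT&amp;T&quot;&lt;/b&gt;', which is safe to place into innerHTML.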
|
||||||
|
|
||||||
|
// Initialize connection
|
||||||
|
connect();
|
||||||
|
</script>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
173
src/monitoring/events.rs
Normal file
@@ -0,0 +1,173 @@
// src/monitoring/events.rs
|
||||||
|
use super::metrics::ProxyInfo;
|
||||||
|
|
||||||
|
/// Events emitted by the scraper system
|
||||||
|
#[derive(Debug, Clone)]
|
||||||
|
pub enum MonitoringEvent {
|
||||||
|
// Pool initialization
|
||||||
|
PoolInitialized {
|
||||||
|
pool_size: usize,
|
||||||
|
with_proxy: bool,
|
||||||
|
with_rotation: bool,
|
||||||
|
},
|
||||||
|
|
||||||
|
// Instance lifecycle
|
||||||
|
InstanceCreated {
|
||||||
|
instance_id: usize,
|
||||||
|
max_tasks: usize,
|
||||||
|
proxy: Option<ProxyInfo>,
|
||||||
|
},
|
||||||
|
|
||||||
|
InstanceStatusChanged {
|
||||||
|
instance_id: usize,
|
||||||
|
status: InstanceStatusChange,
|
||||||
|
},
|
||||||
|
|
||||||
|
InstanceSelected {
|
||||||
|
instance_id: usize,
|
||||||
|
half: usize,
|
||||||
|
},
|
||||||
|
|
||||||
|
// Task execution
|
||||||
|
TaskStarted {
|
||||||
|
instance_id: usize,
|
||||||
|
url: String,
|
||||||
|
},
|
||||||
|
|
||||||
|
TaskCompleted {
|
||||||
|
instance_id: usize,
|
||||||
|
success: bool,
|
||||||
|
duration_ms: u64,
|
||||||
|
error: Option<String>,
|
||||||
|
},
|
||||||
|
|
||||||
|
NavigationTimeout {
|
||||||
|
instance_id: usize,
|
||||||
|
url: String,
|
||||||
|
},
|
||||||
|
|
||||||
|
BotDetectionTriggered {
|
||||||
|
instance_id: usize,
|
||||||
|
url: String,
|
||||||
|
},
|
||||||
|
|
||||||
|
// Session management
|
||||||
|
SessionStarted {
|
||||||
|
instance_id: usize,
|
||||||
|
proxy: Option<ProxyInfo>,
|
||||||
|
},
|
||||||
|
|
||||||
|
SessionRenewed {
|
||||||
|
instance_id: usize,
|
||||||
|
old_request_count: usize,
|
||||||
|
reason: RenewalReason,
|
||||||
|
new_proxy: Option<ProxyInfo>,
|
||||||
|
},
|
||||||
|
|
||||||
|
SessionRequestIncremented {
|
||||||
|
instance_id: usize,
|
||||||
|
new_count: usize,
|
||||||
|
},
|
||||||
|
|
||||||
|
// Proxy events
|
||||||
|
ProxyConnected {
|
||||||
|
container_name: String,
|
||||||
|
ip_address: String,
|
||||||
|
port: u16,
|
||||||
|
},
|
||||||
|
|
||||||
|
ProxyFailed {
|
||||||
|
container_name: String,
|
||||||
|
error: String,
|
||||||
|
},
|
||||||
|
|
||||||
|
ProxyRotated {
|
||||||
|
instance_id: usize,
|
||||||
|
old_proxy: Option<String>,
|
||||||
|
new_proxy: String,
|
||||||
|
},
|
||||||
|
|
||||||
|
// Pool rotation events
|
||||||
|
RotationTriggered {
|
||||||
|
reason: String,
|
||||||
|
},
|
||||||
|
|
||||||
|
// Yahoo API events
|
||||||
|
YahooRequestStarted {
|
||||||
|
instance_id: usize,
|
||||||
|
endpoint: String,
|
||||||
|
symbol: Option<String>,
|
||||||
|
},
|
||||||
|
|
||||||
|
YahooRequestCompleted {
|
||||||
|
instance_id: usize,
|
||||||
|
success: bool,
|
||||||
|
duration_ms: u64,
|
||||||
|
error: Option<String>,
|
||||||
|
},
|
||||||
|
|
||||||
|
YahooBatchRequestStarted {
|
||||||
|
count: usize,
|
||||||
|
symbols: Vec<String>,
|
||||||
|
endpoint: String,
|
||||||
|
},
|
||||||
|
|
||||||
|
YahooBatchRequestCompleted {
|
||||||
|
successful: usize,
|
||||||
|
failed: usize,
|
||||||
|
total: usize,
|
||||||
|
duration_ms: u64,
|
||||||
|
},
|
||||||
|
|
||||||
|
YahooClientCreated {
|
||||||
|
instance_id: usize,
|
||||||
|
has_proxy: bool,
|
||||||
|
max_requests: u32,
|
||||||
|
},
|
||||||
|
|
||||||
|
YahooClientReset {
|
||||||
|
instance_id: usize,
|
||||||
|
previous_requests: u32,
|
||||||
|
reason: String,
|
||||||
|
},
|
||||||
|
|
||||||
|
// Logging
|
||||||
|
LogMessage {
|
||||||
|
level: LogLevel,
|
||||||
|
message: String,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone)]
|
||||||
|
pub enum InstanceStatusChange {
|
||||||
|
Idle,
|
||||||
|
Active,
|
||||||
|
Renewing,
|
||||||
|
Error(String),
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone)]
|
||||||
|
pub enum RenewalReason {
|
||||||
|
TaskLimit,
|
||||||
|
RequestLimit,
|
||||||
|
Error,
|
||||||
|
Manual,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone)]
|
||||||
|
pub enum LogLevel {
|
||||||
|
Info,
|
||||||
|
Warn,
|
||||||
|
Error,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl std::fmt::Display for RenewalReason {
|
||||||
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||||
|
match self {
|
||||||
|
RenewalReason::TaskLimit => write!(f, "task_limit"),
|
||||||
|
RenewalReason::RequestLimit => write!(f, "request_limit"),
|
||||||
|
RenewalReason::Error => write!(f, "error"),
|
||||||
|
RenewalReason::Manual => write!(f, "manual"),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
103
src/monitoring/logger.rs
Normal file
@@ -0,0 +1,103 @@
// src/monitoring/logger.rs
|
||||||
|
use super::metrics::SessionSummary;
|
||||||
|
use chrono::Local;
|
||||||
|
use std::path::PathBuf;
|
||||||
|
use tokio::fs::OpenOptions;
|
||||||
|
use tokio::io::AsyncWriteExt;
|
||||||
|
use tokio::sync::Mutex;
|
||||||
|
|
||||||
|
/// Logs session summaries to JSONL files
|
||||||
|
pub struct SessionLogger {
|
||||||
|
log_dir: PathBuf,
|
||||||
|
file: Mutex<Option<tokio::fs::File>>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl SessionLogger {
|
||||||
|
pub fn new(log_dir: PathBuf) -> Self {
|
||||||
|
Self {
|
||||||
|
log_dir,
|
||||||
|
file: Mutex::new(None),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Log a completed session summary
|
||||||
|
pub async fn log_session(&self, summary: &SessionSummary) {
|
||||||
|
if let Err(e) = self.write_session(summary).await {
|
||||||
|
eprintln!("Failed to log session: {}", e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn write_session(&self, summary: &SessionSummary) -> anyhow::Result<()> {
|
||||||
|
let mut file_guard = self.file.lock().await;
|
||||||
|
|
||||||
|
// Open file if not already open
|
||||||
|
if file_guard.is_none() {
|
||||||
|
let filename = format!(
|
||||||
|
"sessions_{}.jsonl",
|
||||||
|
Local::now().format("%Y%m%d")
|
||||||
|
);
|
||||||
|
let filepath = self.log_dir.join(filename);
|
||||||
|
|
||||||
|
tokio::fs::create_dir_all(&self.log_dir).await?;
|
||||||
|
|
||||||
|
let file = OpenOptions::new()
|
||||||
|
.create(true)
|
||||||
|
.append(true)
|
||||||
|
.open(&filepath)
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
*file_guard = Some(file);
|
||||||
|
}
|
||||||
|
|
||||||
|
if let Some(file) = file_guard.as_mut() {
|
||||||
|
let json_line = serde_json::to_string(summary)?;
|
||||||
|
file.write_all(json_line.as_bytes()).await?;
|
||||||
|
file.write_all(b"\n").await?;
|
||||||
|
file.flush().await?;
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Logs metrics snapshots periodically
|
||||||
|
pub struct MetricsLogger {
|
||||||
|
log_dir: PathBuf,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl MetricsLogger {
|
||||||
|
pub fn new(log_dir: PathBuf) -> Self {
|
||||||
|
Self { log_dir }
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Log a metrics snapshot
|
||||||
|
pub async fn log_metrics(&self, state: &super::metrics::DashboardState) -> anyhow::Result<()> {
|
||||||
|
let filename = format!(
|
||||||
|
"metrics_{}.jsonl",
|
||||||
|
Local::now().format("%Y%m%d")
|
||||||
|
);
|
||||||
|
let filepath = self.log_dir.join(filename);
|
||||||
|
|
||||||
|
tokio::fs::create_dir_all(&self.log_dir).await?;
|
||||||
|
|
||||||
|
let mut file = OpenOptions::new()
|
||||||
|
.create(true)
|
||||||
|
.append(true)
|
||||||
|
.open(&filepath)
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
let snapshot = serde_json::json!({
|
||||||
|
"timestamp": Local::now().format("%Y-%m-%d %H:%M:%S").to_string(),
|
||||||
|
"global": state.global,
|
||||||
|
"instance_count": state.instances.len(),
|
||||||
|
"proxy_count": state.proxies.len(),
|
||||||
|
});
|
||||||
|
|
||||||
|
let json_line = serde_json::to_string(&snapshot)?;
|
||||||
|
file.write_all(json_line.as_bytes()).await?;
|
||||||
|
file.write_all(b"\n").await?;
|
||||||
|
file.flush().await?;
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
||||||
361
src/monitoring/metrics.rs
Normal file
@@ -0,0 +1,361 @@
// src/monitoring/metrics.rs
|
||||||
|
use serde::{Deserialize, Serialize};
|
||||||
|
use std::collections::HashMap;
|
||||||
|
use std::time::Instant;
|
||||||
|
|
||||||
|
/// Complete dashboard state sent to web clients
|
||||||
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
|
pub struct DashboardState {
|
||||||
|
pub config: ConfigSnapshot,
|
||||||
|
pub instances: Vec<InstanceMetrics>,
|
||||||
|
pub proxies: Vec<ProxyMetrics>,
|
||||||
|
pub yahoo_clients: Vec<YahooClientMetrics>,
|
||||||
|
pub global: GlobalMetrics,
|
||||||
|
pub logs: Vec<LogEntry>,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Snapshot of configuration settings
|
||||||
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
|
pub struct ConfigSnapshot {
|
||||||
|
pub max_parallel_instances: usize,
|
||||||
|
pub max_tasks_per_instance: usize,
|
||||||
|
pub enable_vpn_rotation: bool,
|
||||||
|
pub max_requests_per_session: usize,
|
||||||
|
pub min_request_interval_ms: u64,
|
||||||
|
pub max_retry_attempts: u32,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Metrics for a single ChromeDriver instance
|
||||||
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
|
pub struct InstanceMetrics {
|
||||||
|
pub id: usize,
|
||||||
|
pub status: InstanceStatus,
|
||||||
|
pub current_task: Option<String>,
|
||||||
|
pub tasks_current_session: usize,
|
||||||
|
pub tasks_max: usize,
|
||||||
|
pub session_requests: usize,
|
||||||
|
pub total_requests: usize,
|
||||||
|
pub success_count: usize,
|
||||||
|
pub failure_count: usize,
|
||||||
|
pub connected_proxy: Option<ProxyInfo>,
|
||||||
|
pub last_activity: String, // Timestamp
|
||||||
|
pub yahoo_requests: usize,
|
||||||
|
pub yahoo_success: usize,
|
||||||
|
pub yahoo_failures: usize,
|
||||||
|
pub yahoo_success_rate: f64,
|
||||||
|
pub yahoo_current_requests: u32,
|
||||||
|
pub yahoo_max_requests: u32,
|
||||||
|
pub yahoo_last_endpoint: Option<String>,
|
||||||
|
pub yahoo_last_symbol: Option<String>,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
|
||||||
|
#[serde(rename_all = "lowercase")]
|
||||||
|
pub enum InstanceStatus {
|
||||||
|
Idle,
|
||||||
|
Active,
|
||||||
|
Renewing,
|
||||||
|
Error,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Information about a proxy connection
|
||||||
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
|
pub struct ProxyInfo {
|
||||||
|
pub container_name: String,
|
||||||
|
pub ip_address: String,
|
||||||
|
pub port: u16,
|
||||||
|
pub status: ProxyStatus,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
|
||||||
|
#[serde(rename_all = "lowercase")]
|
||||||
|
pub enum ProxyStatus {
|
||||||
|
Connected,
|
||||||
|
Disconnected,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Metrics for a proxy
|
||||||
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
|
pub struct ProxyMetrics {
|
||||||
|
pub container_name: String,
|
||||||
|
pub ip_address: String,
|
||||||
|
pub port: u16,
|
||||||
|
pub status: ProxyStatus,
|
||||||
|
pub instances_using: Vec<usize>,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Metrics for a Yahoo client
|
||||||
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
|
pub struct YahooClientMetrics {
|
||||||
|
pub instance_id: usize,
|
||||||
|
pub requests_total: usize,
|
||||||
|
pub requests_successful: usize,
|
||||||
|
pub requests_failed: usize,
|
||||||
|
pub current_requests: u32,
|
||||||
|
pub max_requests: u32,
|
||||||
|
pub has_proxy: bool,
|
||||||
|
pub last_activity: String,
|
||||||
|
pub proxy_info: Option<ProxyInfo>,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Global pool metrics
|
||||||
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
|
pub struct GlobalMetrics {
|
||||||
|
pub total_requests: usize,
|
||||||
|
pub successful_requests: usize,
|
||||||
|
pub failed_requests: usize,
|
||||||
|
pub success_rate: f64,
|
||||||
|
pub session_renewals: usize,
|
||||||
|
pub rotation_events: usize,
|
||||||
|
pub navigation_timeouts: usize,
|
||||||
|
pub bot_detection_hits: usize,
|
||||||
|
pub proxy_failures: usize,
|
||||||
|
pub uptime_seconds: u64,
|
||||||
|
pub total_yahoo_requests: usize,
|
||||||
|
pub successful_yahoo_requests: usize,
|
||||||
|
pub failed_yahoo_requests: usize,
|
||||||
|
pub yahoo_success_rate: f64,
|
||||||
|
pub yahoo_batch_requests: usize,
|
||||||
|
pub yahoo_session_renewals: usize,
|
||||||
|
pub yahoo_client_count: usize,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Log entry for display in dashboard
|
||||||
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
|
pub struct LogEntry {
|
||||||
|
pub timestamp: String,
|
||||||
|
pub level: LogLevel,
|
||||||
|
pub message: String,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
|
#[serde(rename_all = "lowercase")]
|
||||||
|
pub enum LogLevel {
|
||||||
|
Info,
|
||||||
|
Warn,
|
||||||
|
Error,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Internal state tracked by monitoring service
|
||||||
|
#[derive(Debug, Clone)]
|
||||||
|
pub struct MonitoringState {
|
||||||
|
pub instances: HashMap<usize, InstanceState>,
|
||||||
|
pub proxies: HashMap<String, ProxyState>,
|
||||||
|
pub yahoo_clients: HashMap<usize, YahooClientState>,
|
||||||
|
pub global: GlobalState,
|
||||||
|
pub start_time: Instant,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone)]
|
||||||
|
pub struct InstanceState {
|
||||||
|
pub id: usize,
|
||||||
|
pub status: InstanceStatus,
|
||||||
|
pub current_task: Option<String>,
|
||||||
|
pub tasks_current_session: usize,
|
||||||
|
pub tasks_max: usize,
|
||||||
|
pub session_requests: usize,
|
||||||
|
pub total_requests: usize,
|
||||||
|
pub success_count: usize,
|
||||||
|
pub failure_count: usize,
|
||||||
|
pub connected_proxy: Option<ProxyInfo>,
|
||||||
|
pub last_activity: Instant,
|
||||||
|
pub yahoo_requests: usize,
|
||||||
|
pub yahoo_success: usize,
|
||||||
|
pub yahoo_failures: usize,
|
||||||
|
pub yahoo_current_requests: u32,
|
||||||
|
pub yahoo_max_requests: u32,
|
||||||
|
pub yahoo_last_endpoint: Option<String>,
|
||||||
|
pub yahoo_last_symbol: Option<String>,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone)]
|
||||||
|
pub struct ProxyState {
|
||||||
|
pub container_name: String,
|
||||||
|
pub ip_address: String,
|
||||||
|
pub port: u16,
|
||||||
|
pub status: ProxyStatus,
|
||||||
|
pub instances_using: Vec<usize>,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone)]
|
||||||
|
pub struct YahooClientState {
|
||||||
|
pub instance_id: usize,
|
||||||
|
pub requests_total: usize,
|
||||||
|
pub requests_successful: usize,
|
||||||
|
pub requests_failed: usize,
|
||||||
|
pub current_requests: u32,
|
||||||
|
pub max_requests: u32,
|
||||||
|
pub has_proxy: bool,
|
||||||
|
pub last_activity: Instant,
|
||||||
|
pub proxy_info: Option<ProxyInfo>,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone)]
|
||||||
|
pub struct GlobalState {
|
||||||
|
pub total_requests: usize,
|
||||||
|
pub successful_requests: usize,
|
||||||
|
pub failed_requests: usize,
|
||||||
|
pub session_renewals: usize,
|
||||||
|
pub rotation_events: usize,
|
||||||
|
pub navigation_timeouts: usize,
|
||||||
|
pub bot_detection_hits: usize,
|
||||||
|
pub proxy_failures: usize,
|
||||||
|
pub total_yahoo_requests: usize,
|
||||||
|
pub successful_yahoo_requests: usize,
|
||||||
|
pub failed_yahoo_requests: usize,
|
||||||
|
pub yahoo_batch_requests: usize,
|
||||||
|
pub yahoo_session_renewals: usize,
|
||||||
|
pub yahoo_client_count: usize,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl MonitoringState {
|
||||||
|
pub fn new() -> Self {
|
||||||
|
Self {
|
||||||
|
instances: HashMap::new(),
|
||||||
|
proxies: HashMap::new(),
|
||||||
|
yahoo_clients: HashMap::new(),
|
||||||
|
global: GlobalState {
|
||||||
|
total_requests: 0,
|
||||||
|
successful_requests: 0,
|
||||||
|
failed_requests: 0,
|
||||||
|
session_renewals: 0,
|
||||||
|
rotation_events: 0,
|
||||||
|
navigation_timeouts: 0,
|
||||||
|
bot_detection_hits: 0,
|
||||||
|
proxy_failures: 0,
|
||||||
|
total_yahoo_requests: 0,
|
||||||
|
successful_yahoo_requests: 0,
|
||||||
|
failed_yahoo_requests: 0,
|
||||||
|
yahoo_batch_requests: 0,
|
||||||
|
yahoo_session_renewals: 0,
|
||||||
|
yahoo_client_count: 0,
|
||||||
|
},
|
||||||
|
start_time: Instant::now(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Convert internal state to dashboard state for web clients
|
||||||
|
pub fn to_dashboard_state(&self, config: ConfigSnapshot, logs: Vec<LogEntry>) -> DashboardState {
|
||||||
|
let instances: Vec<InstanceMetrics> = self
|
||||||
|
.instances
|
||||||
|
.values()
|
||||||
|
.map(|inst| {
|
||||||
|
let yahoo_success_rate = if inst.yahoo_success + inst.yahoo_failures > 0 {
|
||||||
|
(inst.yahoo_success as f64 / (inst.yahoo_success + inst.yahoo_failures) as f64) * 100.0
|
||||||
|
} else {
|
||||||
|
0.0
|
||||||
|
};
|
||||||
|
|
||||||
|
InstanceMetrics {
|
||||||
|
id: inst.id,
|
||||||
|
status: inst.status.clone(),
|
||||||
|
current_task: inst.current_task.clone(),
|
||||||
|
tasks_current_session: inst.tasks_current_session,
|
||||||
|
tasks_max: inst.tasks_max,
|
||||||
|
session_requests: inst.session_requests,
|
||||||
|
total_requests: inst.total_requests,
|
||||||
|
success_count: inst.success_count,
|
||||||
|
failure_count: inst.failure_count,
|
||||||
|
connected_proxy: inst.connected_proxy.clone(),
|
||||||
|
last_activity: format_timestamp(inst.last_activity),
|
||||||
|
yahoo_requests: inst.yahoo_requests,
|
||||||
|
yahoo_success: inst.yahoo_success,
|
||||||
|
yahoo_failures: inst.yahoo_failures,
|
||||||
|
yahoo_success_rate,
|
||||||
|
yahoo_current_requests: inst.yahoo_current_requests,
|
||||||
|
yahoo_max_requests: inst.yahoo_max_requests,
|
||||||
|
yahoo_last_endpoint: inst.yahoo_last_endpoint.clone(),
|
||||||
|
yahoo_last_symbol: inst.yahoo_last_symbol.clone(),
|
||||||
|
}
|
||||||
|
})
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
let proxies: Vec<ProxyMetrics> = self
|
||||||
|
.proxies
|
||||||
|
.values()
|
||||||
|
.map(|proxy| ProxyMetrics {
|
||||||
|
container_name: proxy.container_name.clone(),
|
||||||
|
ip_address: proxy.ip_address.clone(),
|
||||||
|
port: proxy.port,
|
||||||
|
status: proxy.status.clone(),
|
||||||
|
instances_using: proxy.instances_using.clone(),
|
||||||
|
})
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
let yahoo_clients: Vec<YahooClientMetrics> = self
|
||||||
|
.yahoo_clients
|
||||||
|
.values()
|
||||||
|
.map(|client| YahooClientMetrics {
|
||||||
|
instance_id: client.instance_id,
|
||||||
|
requests_total: client.requests_total,
|
||||||
|
requests_successful: client.requests_successful,
|
||||||
|
requests_failed: client.requests_failed,
|
||||||
|
current_requests: client.current_requests,
|
||||||
|
max_requests: client.max_requests,
|
||||||
|
has_proxy: client.has_proxy,
|
||||||
|
last_activity: format_timestamp(client.last_activity),
|
||||||
|
proxy_info: client.proxy_info.clone(),
|
||||||
|
})
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
let success_rate = if self.global.total_requests > 0 {
|
||||||
|
(self.global.successful_requests as f64 / self.global.total_requests as f64) * 100.0
|
||||||
|
} else {
|
||||||
|
0.0
|
||||||
|
};
|
||||||
|
|
||||||
|
let yahoo_success_rate = if self.global.total_yahoo_requests > 0 {
|
||||||
|
(self.global.successful_yahoo_requests as f64 / self.global.total_yahoo_requests as f64) * 100.0
|
||||||
|
} else {
|
||||||
|
0.0
|
||||||
|
};
|
||||||
|
|
||||||
|
let global = GlobalMetrics {
|
||||||
|
total_requests: self.global.total_requests,
|
||||||
|
successful_requests: self.global.successful_requests,
|
||||||
|
failed_requests: self.global.failed_requests,
|
||||||
|
success_rate,
|
||||||
|
session_renewals: self.global.session_renewals,
|
||||||
|
rotation_events: self.global.rotation_events,
|
||||||
|
navigation_timeouts: self.global.navigation_timeouts,
|
||||||
|
bot_detection_hits: self.global.bot_detection_hits,
|
||||||
|
proxy_failures: self.global.proxy_failures,
|
||||||
|
uptime_seconds: self.start_time.elapsed().as_secs(),
|
||||||
|
total_yahoo_requests: self.global.total_yahoo_requests,
|
||||||
|
successful_yahoo_requests: self.global.successful_yahoo_requests,
|
||||||
|
failed_yahoo_requests: self.global.failed_yahoo_requests,
|
||||||
|
yahoo_success_rate,
|
||||||
|
yahoo_batch_requests: self.global.yahoo_batch_requests,
|
||||||
|
yahoo_session_renewals: self.global.yahoo_session_renewals,
|
||||||
|
yahoo_client_count: self.global.yahoo_client_count,
|
||||||
|
};
|
||||||
|
|
||||||
|
DashboardState {
|
||||||
|
config,
|
||||||
|
instances,
|
||||||
|
proxies,
|
||||||
|
yahoo_clients,
|
||||||
|
global,
|
||||||
|
logs,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn format_timestamp(_instant: Instant) -> String {
    // The Instant argument is currently unused: an Instant cannot be converted to a
    // calendar time on its own, so the current wall-clock time is reported instead.
    use chrono::Local;
    Local::now().format("%H:%M:%S").to_string()
}
|
||||||
|
|
||||||
|
/// Session completion summary for logging
|
||||||
|
#[derive(Debug, Clone, Serialize)]
|
||||||
|
pub struct SessionSummary {
|
||||||
|
pub instance_id: usize,
|
||||||
|
pub session_start: String,
|
||||||
|
pub session_end: String,
|
||||||
|
pub duration_seconds: u64,
|
||||||
|
pub total_requests: usize,
|
||||||
|
pub successful_requests: usize,
|
||||||
|
pub failed_requests: usize,
|
||||||
|
pub proxy_info: Option<ProxyInfo>,
|
||||||
|
pub renewal_reason: String, // "task_limit", "request_limit", "error"
|
||||||
|
}
|
||||||
78
src/monitoring/mod.rs
Normal file
@@ -0,0 +1,78 @@
// src/monitoring/mod.rs
//! Monitoring system for tracking scraper performance and health
//!
//! This module provides:
//! - Real-time metrics collection
//! - Web-based dashboard
//! - Session logging
//! - Minimal performance overhead

pub mod metrics;
pub mod events;
pub mod service;
pub mod webserver;
pub mod logger;

pub use events::{MonitoringEvent, RenewalReason, InstanceStatusChange};
pub use metrics::{ConfigSnapshot, ProxyInfo, ProxyStatus};
pub use service::{MonitoringService, MonitoringHandle};
pub use webserver::WebServer;

use std::path::PathBuf;
use std::sync::Arc;
use tokio::sync::{mpsc, RwLock};

/// Initialize the complete monitoring system
pub async fn init_monitoring(
    config_snapshot: ConfigSnapshot,
    log_dir: PathBuf,
    dashboard_port: u16,
) -> anyhow::Result<(MonitoringHandle, tokio::task::JoinHandle<()>)> {
    // Create channel for events
    let (tx, rx) = mpsc::unbounded_channel();

    // Create monitoring service
    let service = MonitoringService::new(config_snapshot, rx, log_dir);
    let service_arc = Arc::new(RwLock::new(service));

    // Start monitoring service task
    let service_clone = Arc::clone(&service_arc);
    let monitoring_task = tokio::spawn(async move {
        println!("🚀 MONITORING TASK STARTED!");
        // Take ownership of the service
        let mut service = {
            let mut guard = service_clone.write().await;
            std::mem::replace(
                &mut *guard,
                MonitoringService::new(
                    ConfigSnapshot {
                        max_parallel_instances: 0,
                        max_tasks_per_instance: 0,
                        enable_vpn_rotation: false,
                        max_requests_per_session: 0,
                        min_request_interval_ms: 0,
                        max_retry_attempts: 0,
                    },
                    mpsc::unbounded_channel().1,
                    PathBuf::new(),
                ),
            )
        };

        println!("✅ ABOUT TO RUN SERVICE!");
        service.run().await;
    });

    // Start web server
    let webserver = WebServer::new(Arc::clone(&service_arc), dashboard_port);
    tokio::spawn(async move {
        if let Err(e) = webserver.run().await {
            eprintln!("Web server error: {}", e);
        }
    });

    // Create handle for emitting events
    let handle = MonitoringHandle::new(tx);

    Ok((handle, monitoring_task))
}
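
// Usage sketch (not part of this diff): how a caller can wire the monitoring system
// together using the API above. The ConfigSnapshot values, log directory, and port
// are placeholders.
//
// async fn example() -> anyhow::Result<()> {
//     let snapshot = ConfigSnapshot {
//         max_parallel_instances: 4,
//         max_tasks_per_instance: 0,
//         enable_vpn_rotation: false,
//         max_requests_per_session: 25,
//         min_request_interval_ms: 300,
//         max_retry_attempts: 3,
//     };
//     let (handle, task) = init_monitoring(snapshot, "logs".into(), 3030).await?;
//     handle.emit(MonitoringEvent::LogMessage {
//         level: events::LogLevel::Info,
//         message: "monitoring online".to_string(),
//     });
//     task.await?;
//     Ok(())
// }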
511
src/monitoring/service.rs
Normal file
@@ -0,0 +1,511 @@
// src/monitoring/service.rs
|
||||||
|
use super::events::*;
|
||||||
|
use super::metrics::*;
|
||||||
|
use super::logger::SessionLogger;
|
||||||
|
use std::collections::VecDeque;
|
||||||
|
use std::sync::Arc;
|
||||||
|
use std::time::Instant;
|
||||||
|
use tokio::sync::{mpsc, RwLock};
|
||||||
|
use chrono::Local;
|
||||||
|
|
||||||
|
const MAX_LOGS: usize = 100;
|
||||||
|
|
||||||
|
/// Monitoring service that collects events and maintains state
|
||||||
|
pub struct MonitoringService {
|
||||||
|
state: Arc<RwLock<MonitoringState>>,
|
||||||
|
config: ConfigSnapshot,
|
||||||
|
logs: Arc<RwLock<VecDeque<LogEntry>>>,
|
||||||
|
session_logger: Arc<SessionLogger>,
|
||||||
|
event_rx: mpsc::UnboundedReceiver<MonitoringEvent>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl MonitoringService {
|
||||||
|
pub fn new(
|
||||||
|
config: ConfigSnapshot,
|
||||||
|
event_rx: mpsc::UnboundedReceiver<MonitoringEvent>,
|
||||||
|
log_dir: std::path::PathBuf,
|
||||||
|
) -> Self {
|
||||||
|
Self {
|
||||||
|
state: Arc::new(RwLock::new(MonitoringState::new())),
|
||||||
|
config,
|
||||||
|
logs: Arc::new(RwLock::new(VecDeque::with_capacity(MAX_LOGS))),
|
||||||
|
session_logger: Arc::new(SessionLogger::new(log_dir)),
|
||||||
|
event_rx,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get current dashboard state for web clients
|
||||||
|
pub async fn get_dashboard_state(&self) -> DashboardState {
|
||||||
|
let state = self.state.read().await;
|
||||||
|
let logs = self.logs.read().await;
|
||||||
|
state.to_dashboard_state(
|
||||||
|
self.config.clone(),
|
||||||
|
logs.iter().cloned().collect(),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Main event processing loop
|
||||||
|
pub async fn run(mut self) {
|
||||||
|
while let Some(event) = self.event_rx.recv().await {
|
||||||
|
self.process_event(event).await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn process_event(&self, event: MonitoringEvent) {
|
||||||
|
match event {
|
||||||
|
MonitoringEvent::PoolInitialized { pool_size, with_proxy, with_rotation } => {
|
||||||
|
self.log_info(format!(
|
||||||
|
"Pool initialized: {} instances, proxy={}, rotation={}",
|
||||||
|
pool_size, with_proxy, with_rotation
|
||||||
|
)).await;
|
||||||
|
}
|
||||||
|
|
||||||
|
MonitoringEvent::InstanceCreated { instance_id, max_tasks, proxy } => {
|
||||||
|
let mut state = self.state.write().await;
|
||||||
|
state.instances.insert(
|
||||||
|
instance_id,
|
||||||
|
InstanceState {
|
||||||
|
id: instance_id,
|
||||||
|
status: InstanceStatus::Idle,
|
||||||
|
current_task: None,
|
||||||
|
tasks_current_session: 0,
|
||||||
|
tasks_max: max_tasks,
|
||||||
|
session_requests: 0,
|
||||||
|
total_requests: 0,
|
||||||
|
success_count: 0,
|
||||||
|
failure_count: 0,
|
||||||
|
connected_proxy: proxy.clone(),
|
||||||
|
last_activity: Instant::now(),
|
||||||
|
yahoo_requests: 0,
|
||||||
|
yahoo_success: 0,
|
||||||
|
yahoo_failures: 0,
|
||||||
|
yahoo_current_requests: 0,
|
||||||
|
yahoo_max_requests: 0,
|
||||||
|
yahoo_last_endpoint: None,
|
||||||
|
yahoo_last_symbol: None,
|
||||||
|
},
|
||||||
|
);
|
||||||
|
|
||||||
|
if let Some(proxy_info) = proxy {
|
||||||
|
state.proxies.entry(proxy_info.container_name.clone()).or_insert_with(|| {
|
||||||
|
ProxyState {
|
||||||
|
container_name: proxy_info.container_name.clone(),
|
||||||
|
ip_address: proxy_info.ip_address.clone(),
|
||||||
|
port: proxy_info.port,
|
||||||
|
status: ProxyStatus::Connected,
|
||||||
|
instances_using: vec![instance_id],
|
||||||
|
}
|
||||||
|
}).instances_using.push(instance_id);
|
||||||
|
}
|
||||||
|
|
||||||
|
self.log_info(format!("Instance #{} created", instance_id)).await;
|
||||||
|
}
|
||||||
|
|
||||||
|
MonitoringEvent::InstanceStatusChanged { instance_id, status } => {
|
||||||
|
let mut state = self.state.write().await;
|
||||||
|
if let Some(inst) = state.instances.get_mut(&instance_id) {
|
||||||
|
inst.status = match status {
|
||||||
|
InstanceStatusChange::Idle => InstanceStatus::Idle,
|
||||||
|
InstanceStatusChange::Active => InstanceStatus::Active,
|
||||||
|
InstanceStatusChange::Renewing => InstanceStatus::Renewing,
|
||||||
|
InstanceStatusChange::Error(_) => InstanceStatus::Error,
|
||||||
|
};
|
||||||
|
inst.last_activity = Instant::now();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
MonitoringEvent::InstanceSelected { instance_id, half } => {
|
||||||
|
self.log_info(format!("Instance #{} selected (half {})", instance_id, half)).await;
|
||||||
|
}
|
||||||
|
|
||||||
|
MonitoringEvent::TaskStarted { instance_id, url } => {
|
||||||
|
let mut state = self.state.write().await;
|
||||||
|
if let Some(inst) = state.instances.get_mut(&instance_id) {
|
||||||
|
inst.status = InstanceStatus::Active;
|
||||||
|
inst.current_task = Some(url.clone());
|
||||||
|
inst.last_activity = Instant::now();
|
||||||
|
}
|
||||||
|
state.global.total_requests += 1;
|
||||||
|
|
||||||
|
self.log_info(format!("Instance #{} started task: {}", instance_id, url)).await;
|
||||||
|
}
|
||||||
|
|
||||||
|
MonitoringEvent::TaskCompleted { instance_id, success, duration_ms, error } => {
|
||||||
|
let mut state = self.state.write().await;
|
||||||
|
if let Some(inst) = state.instances.get_mut(&instance_id) {
|
||||||
|
inst.current_task = None;
|
||||||
|
inst.status = InstanceStatus::Idle;
|
||||||
|
inst.total_requests += 1;
|
||||||
|
inst.last_activity = Instant::now();
|
||||||
|
|
||||||
|
if success {
|
||||||
|
inst.success_count += 1;
|
||||||
|
state.global.successful_requests += 1;
|
||||||
|
} else {
|
||||||
|
inst.failure_count += 1;
|
||||||
|
state.global.failed_requests += 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if success {
|
||||||
|
self.log_info(format!(
|
||||||
|
"Instance #{} completed task in {}ms",
|
||||||
|
instance_id, duration_ms
|
||||||
|
)).await;
|
||||||
|
} else {
|
||||||
|
self.log_error(format!(
|
||||||
|
"Instance #{} failed task: {}",
|
||||||
|
instance_id,
|
||||||
|
error.unwrap_or_else(|| "unknown error".to_string())
|
||||||
|
)).await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
MonitoringEvent::NavigationTimeout { instance_id, url } => {
|
||||||
|
let mut state = self.state.write().await;
|
||||||
|
state.global.navigation_timeouts += 1;
|
||||||
|
|
||||||
|
self.log_warn(format!(
|
||||||
|
"Instance #{} navigation timeout: {}",
|
||||||
|
instance_id, url
|
||||||
|
)).await;
|
||||||
|
}
|
||||||
|
|
||||||
|
MonitoringEvent::BotDetectionTriggered { instance_id, url } => {
|
||||||
|
let mut state = self.state.write().await;
|
||||||
|
state.global.bot_detection_hits += 1;
|
||||||
|
|
||||||
|
self.log_warn(format!(
|
||||||
|
"Instance #{} bot detection triggered: {}",
|
||||||
|
instance_id, url
|
||||||
|
)).await;
|
||||||
|
}
|
||||||
|
|
||||||
|
MonitoringEvent::SessionStarted { instance_id, proxy } => {
|
||||||
|
let mut state = self.state.write().await;
|
||||||
|
if let Some(inst) = state.instances.get_mut(&instance_id) {
|
||||||
|
inst.session_requests = 0;
|
||||||
|
inst.tasks_current_session = 0;
|
||||||
|
inst.connected_proxy = proxy;
|
||||||
|
inst.last_activity = Instant::now();
|
||||||
|
}
|
||||||
|
|
||||||
|
self.log_info(format!("Instance #{} started new session", instance_id)).await;
|
||||||
|
}
|
||||||
|
|
||||||
|
MonitoringEvent::SessionRenewed { instance_id, old_request_count, reason, new_proxy } => {
|
||||||
|
// Log the completed session
|
||||||
|
let session_summary = {
|
||||||
|
let state = self.state.read().await;
|
||||||
|
if let Some(inst) = state.instances.get(&instance_id) {
|
||||||
|
Some(SessionSummary {
|
||||||
|
instance_id,
|
||||||
|
session_start: "N/A".to_string(),
|
||||||
|
session_end: Local::now().format("%Y-%m-%d %H:%M:%S").to_string(),
|
||||||
|
duration_seconds: 0,
|
||||||
|
total_requests: old_request_count,
|
||||||
|
successful_requests: inst.success_count,
|
||||||
|
failed_requests: inst.failure_count,
|
||||||
|
proxy_info: inst.connected_proxy.clone(),
|
||||||
|
renewal_reason: reason.to_string(),
|
||||||
|
})
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
if let Some(summary) = session_summary {
|
||||||
|
self.session_logger.log_session(&summary).await;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Update state for new session
|
||||||
|
let mut state = self.state.write().await;
|
||||||
|
if let Some(inst) = state.instances.get_mut(&instance_id) {
|
||||||
|
inst.session_requests = 0;
|
||||||
|
inst.tasks_current_session = 0;
|
||||||
|
inst.connected_proxy = new_proxy;
|
||||||
|
inst.last_activity = Instant::now();
|
||||||
|
}
|
||||||
|
state.global.session_renewals += 1;
|
||||||
|
|
||||||
|
self.log_info(format!(
|
||||||
|
"Instance #{} renewed session (reason: {}, {} requests)",
|
||||||
|
instance_id, reason, old_request_count
|
||||||
|
)).await;
|
||||||
|
}
|
||||||
|
|
||||||
|
MonitoringEvent::SessionRequestIncremented { instance_id, new_count } => {
|
||||||
|
let mut state = self.state.write().await;
|
||||||
|
if let Some(inst) = state.instances.get_mut(&instance_id) {
|
||||||
|
inst.session_requests = new_count;
|
||||||
|
inst.last_activity = Instant::now();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
MonitoringEvent::ProxyConnected { container_name, ip_address, port } => {
|
||||||
|
let mut state = self.state.write().await;
|
||||||
|
state.proxies.insert(
|
||||||
|
container_name.clone(),
|
||||||
|
ProxyState {
|
||||||
|
container_name: container_name.clone(),
|
||||||
|
ip_address: ip_address.clone(),
|
||||||
|
port,
|
||||||
|
status: ProxyStatus::Connected,
|
||||||
|
instances_using: vec![],
|
||||||
|
},
|
||||||
|
);
|
||||||
|
|
||||||
|
self.log_info(format!(
|
||||||
|
"Proxy {} connected: {}:{}",
|
||||||
|
container_name, ip_address, port
|
||||||
|
)).await;
|
||||||
|
}
|
||||||
|
|
||||||
|
MonitoringEvent::ProxyFailed { container_name, error } => {
|
||||||
|
let mut state = self.state.write().await;
|
||||||
|
if let Some(proxy) = state.proxies.get_mut(&container_name) {
|
||||||
|
proxy.status = ProxyStatus::Disconnected;
|
||||||
|
}
|
||||||
|
state.global.proxy_failures += 1;
|
||||||
|
|
||||||
|
self.log_error(format!(
|
||||||
|
"Proxy {} failed: {}",
|
||||||
|
container_name, error
|
||||||
|
)).await;
|
||||||
|
}
|
||||||
|
|
||||||
|
MonitoringEvent::ProxyRotated { instance_id, old_proxy, new_proxy } => {
|
||||||
|
self.log_info(format!(
|
||||||
|
"Instance #{} rotated proxy: {} -> {}",
|
||||||
|
instance_id,
|
||||||
|
old_proxy.unwrap_or_else(|| "none".to_string()),
|
||||||
|
new_proxy
|
||||||
|
)).await;
|
||||||
|
}
|
||||||
|
|
||||||
|
MonitoringEvent::RotationTriggered { reason } => {
|
||||||
|
let mut state = self.state.write().await;
|
||||||
|
state.global.rotation_events += 1;
|
||||||
|
|
||||||
|
self.log_info(format!("Pool rotation triggered: {}", reason)).await;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Yahoo API Events
|
||||||
|
MonitoringEvent::YahooRequestStarted { instance_id, endpoint, symbol } => {
|
||||||
|
let mut state = self.state.write().await;
|
||||||
|
|
||||||
|
// Update global Yahoo stats
|
||||||
|
state.global.total_yahoo_requests += 1;
|
||||||
|
|
||||||
|
// Update instance stats
|
||||||
|
if let Some(inst) = state.instances.get_mut(&instance_id) {
|
||||||
|
inst.yahoo_requests += 1;
|
||||||
|
inst.yahoo_current_requests += 1;
|
||||||
|
inst.yahoo_last_endpoint = Some(endpoint.clone());
|
||||||
|
inst.yahoo_last_symbol = symbol.clone();
|
||||||
|
inst.last_activity = Instant::now();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Update Yahoo client stats
|
||||||
|
if let Some(client) = state.yahoo_clients.get_mut(&instance_id) {
|
||||||
|
client.requests_total += 1;
|
||||||
|
client.current_requests += 1;
|
||||||
|
client.last_activity = Instant::now();
|
||||||
|
}
|
||||||
|
|
||||||
|
self.log_info(format!(
|
||||||
|
"YahooClient[{}] started request: {} {}",
|
||||||
|
instance_id,
|
||||||
|
endpoint,
|
||||||
|
symbol.unwrap_or_else(|| "search".to_string())
|
||||||
|
)).await;
|
||||||
|
}
|
||||||
|
|
||||||
|
MonitoringEvent::YahooRequestCompleted { instance_id, success, duration_ms, error } => {
|
||||||
|
let mut state = self.state.write().await;
|
||||||
|
|
||||||
|
// Update global Yahoo stats
|
||||||
|
if success {
|
||||||
|
state.global.successful_yahoo_requests += 1;
|
||||||
|
} else {
|
||||||
|
state.global.failed_yahoo_requests += 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Update instance stats
|
||||||
|
if let Some(inst) = state.instances.get_mut(&instance_id) {
|
||||||
|
inst.yahoo_current_requests = inst.yahoo_current_requests.saturating_sub(1);
|
||||||
|
if success {
|
||||||
|
inst.yahoo_success += 1;
|
||||||
|
} else {
|
||||||
|
inst.yahoo_failures += 1;
|
||||||
|
}
|
||||||
|
inst.last_activity = Instant::now();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Update Yahoo client stats
|
||||||
|
if let Some(client) = state.yahoo_clients.get_mut(&instance_id) {
|
||||||
|
client.current_requests = client.current_requests.saturating_sub(1);
|
||||||
|
if success {
|
||||||
|
client.requests_successful += 1;
|
||||||
|
} else {
|
||||||
|
client.requests_failed += 1;
|
||||||
|
}
|
||||||
|
client.last_activity = Instant::now();
|
||||||
|
}
|
||||||
|
|
||||||
|
if success {
|
||||||
|
self.log_info(format!(
|
||||||
|
"YahooClient[{}] completed request in {}ms",
|
||||||
|
instance_id, duration_ms
|
||||||
|
)).await;
|
||||||
|
} else {
|
||||||
|
self.log_error(format!(
|
||||||
|
"YahooClient[{}] failed request in {}ms: {}",
|
||||||
|
instance_id,
|
||||||
|
duration_ms,
|
||||||
|
error.unwrap_or_else(|| "unknown error".to_string())
|
||||||
|
)).await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
MonitoringEvent::YahooBatchRequestStarted { count, symbols, endpoint } => {
|
||||||
|
let mut state = self.state.write().await;
|
||||||
|
state.global.yahoo_batch_requests += 1;
|
||||||
|
|
||||||
|
self.log_info(format!(
|
||||||
|
"Yahoo batch request started: {} symbols, endpoint: {}",
|
||||||
|
count, endpoint
|
||||||
|
)).await;
|
||||||
|
|
||||||
|
if !symbols.is_empty() {
|
||||||
|
self.log_debug(format!(
|
||||||
|
"Batch symbols: {}",
|
||||||
|
symbols.join(", ")
|
||||||
|
)).await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
MonitoringEvent::YahooBatchRequestCompleted { successful, failed, total, duration_ms } => {
|
||||||
|
let success_rate = if total > 0 {
|
||||||
|
(successful as f64 / total as f64) * 100.0
|
||||||
|
} else {
|
||||||
|
0.0
|
||||||
|
};
|
||||||
|
|
||||||
|
self.log_info(format!(
|
||||||
|
"Yahoo batch completed: {}/{} successful ({:.1}%) in {}ms",
|
||||||
|
successful, total, success_rate, duration_ms
|
||||||
|
)).await;
|
||||||
|
}
|
||||||
|
|
||||||
|
MonitoringEvent::YahooClientCreated { instance_id, has_proxy, max_requests } => {
|
||||||
|
let mut state = self.state.write().await;
|
||||||
|
state.global.yahoo_client_count += 1;
|
||||||
|
|
||||||
|
state.yahoo_clients.insert(
|
||||||
|
instance_id,
|
||||||
|
YahooClientState {
|
||||||
|
instance_id,
|
||||||
|
requests_total: 0,
|
||||||
|
requests_successful: 0,
|
||||||
|
requests_failed: 0,
|
||||||
|
current_requests: 0,
|
||||||
|
max_requests,
|
||||||
|
has_proxy,
|
||||||
|
last_activity: Instant::now(),
|
||||||
|
proxy_info: None,
|
||||||
|
},
|
||||||
|
);
|
||||||
|
|
||||||
|
self.log_info(format!(
|
||||||
|
"YahooClient[{}] created (proxy: {}, max requests: {})",
|
||||||
|
instance_id, has_proxy, max_requests
|
||||||
|
)).await;
|
||||||
|
}
|
||||||
|
|
||||||
|
MonitoringEvent::YahooClientReset { instance_id, previous_requests, reason } => {
|
||||||
|
let mut state = self.state.write().await;
|
||||||
|
state.global.yahoo_session_renewals += 1;
|
||||||
|
|
||||||
|
if let Some(client) = state.yahoo_clients.get_mut(&instance_id) {
|
||||||
|
client.current_requests = 0;
|
||||||
|
client.last_activity = Instant::now();
|
||||||
|
}
|
||||||
|
|
||||||
|
self.log_info(format!(
|
||||||
|
"YahooClient[{}] reset (had {} requests, reason: {})",
|
||||||
|
instance_id, previous_requests, reason
|
||||||
|
)).await;
|
||||||
|
}
|
||||||
|
|
||||||
|
MonitoringEvent::LogMessage { level, message } => {
|
||||||
|
match level {
|
||||||
|
crate::monitoring::events::LogLevel::Info => self.log_info(message).await,
|
||||||
|
crate::monitoring::events::LogLevel::Warn => self.log_warn(message).await,
|
||||||
|
crate::monitoring::events::LogLevel::Error => self.log_error(message).await,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn log_info(&self, message: String) {
|
||||||
|
self.add_log(LogEntry {
|
||||||
|
timestamp: Local::now().format("%H:%M:%S").to_string(),
|
||||||
|
level: super::metrics::LogLevel::Info,
|
||||||
|
message,
|
||||||
|
}).await;
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn log_warn(&self, message: String) {
|
||||||
|
self.add_log(LogEntry {
|
||||||
|
timestamp: Local::now().format("%H:%M:%S").to_string(),
|
||||||
|
level: super::metrics::LogLevel::Warn,
|
||||||
|
message,
|
||||||
|
}).await;
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn log_error(&self, message: String) {
|
||||||
|
self.add_log(LogEntry {
|
||||||
|
timestamp: Local::now().format("%H:%M:%S").to_string(),
|
||||||
|
level: super::metrics::LogLevel::Error,
|
||||||
|
message,
|
||||||
|
}).await;
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn log_debug(&self, message: String) {
|
||||||
|
// Only log debug if DEBUG_LOGGING is enabled
|
||||||
|
if std::env::var("DEBUG_LOGGING").is_ok() {
|
||||||
|
self.add_log(LogEntry {
|
||||||
|
timestamp: Local::now().format("%H:%M:%S").to_string(),
|
||||||
|
level: super::metrics::LogLevel::Info,
|
||||||
|
message: format!("[DEBUG] {}", message),
|
||||||
|
}).await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn add_log(&self, entry: LogEntry) {
|
||||||
|
let mut logs = self.logs.write().await;
|
||||||
|
if logs.len() >= MAX_LOGS {
|
||||||
|
logs.pop_front();
|
||||||
|
}
|
||||||
|
logs.push_back(entry);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Handle for emitting monitoring events
|
||||||
|
#[derive(Clone)]
|
||||||
|
pub struct MonitoringHandle {
|
||||||
|
tx: mpsc::UnboundedSender<MonitoringEvent>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl MonitoringHandle {
|
||||||
|
pub fn new(tx: mpsc::UnboundedSender<MonitoringEvent>) -> Self {
|
||||||
|
Self { tx }
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Emit a monitoring event (non-blocking)
|
||||||
|
pub fn emit(&self, event: MonitoringEvent) {
|
||||||
|
// Ignore send errors (monitoring should never block application)
|
||||||
|
let _ = self.tx.send(event);
|
||||||
|
}
|
||||||
|
}
|
||||||
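A minimal sketch of how a scraping task could report its lifecycle through MonitoringHandle. The instance id, URL and the u64 type of duration_ms are assumptions based on how the fields are used and formatted above.

async fn scrape_one(handle: &MonitoringHandle, instance_id: usize, url: &str) {
    handle.emit(MonitoringEvent::TaskStarted {
        instance_id,
        url: url.to_string(),
    });

    let started = std::time::Instant::now();
    let result: Result<(), String> = Ok(()); // placeholder for the real scrape

    handle.emit(MonitoringEvent::TaskCompleted {
        instance_id,
        success: result.is_ok(),
        duration_ms: started.elapsed().as_millis() as u64, // u64 assumed
        error: result.err(),
    });
}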
77
src/monitoring/webserver.rs
Normal file
@@ -0,0 +1,77 @@
// src/monitoring/webserver.rs
use super::service::MonitoringService;
use axum::{
    extract::{
        ws::{Message, WebSocket, WebSocketUpgrade},
        State,
    },
    response::{Html, IntoResponse, Response},
    routing::get,
    Router,
};
use std::sync::Arc;
use tokio::sync::RwLock;
use tokio::time::{interval, Duration};

const UPDATE_INTERVAL_MS: u64 = 1000; // 1 second updates

pub struct WebServer {
    service: Arc<RwLock<MonitoringService>>,
    port: u16,
}

impl WebServer {
    pub fn new(service: Arc<RwLock<MonitoringService>>, port: u16) -> Self {
        Self { service, port }
    }

    pub async fn run(self) -> anyhow::Result<()> {
        let app = Router::new()
            .route("/", get(dashboard_handler))
            .route("/ws", get(websocket_handler))
            .with_state(self.service);

        let addr = format!("0.0.0.0:{}", self.port);
        println!("📊 Dashboard available at: http://localhost:{}", self.port);

        let listener = tokio::net::TcpListener::bind(&addr).await?;
        axum::serve(listener, app).await?;

        Ok(())
    }
}

async fn dashboard_handler() -> impl IntoResponse {
    Html(include_str!("dashboard.html"))
}

async fn websocket_handler(
    ws: WebSocketUpgrade,
    State(service): State<Arc<RwLock<MonitoringService>>>,
) -> Response {
    ws.on_upgrade(|socket| handle_socket(socket, service))
}

async fn handle_socket(mut socket: WebSocket, service: Arc<RwLock<MonitoringService>>) {
    let mut ticker = interval(Duration::from_millis(UPDATE_INTERVAL_MS));

    loop {
        ticker.tick().await;

        let service_guard = service.read().await;
        let state = service_guard.get_dashboard_state().await;
        drop(service_guard);

        match serde_json::to_string(&state) {
            Ok(json) => {
                if socket.send(Message::Text(json)).await.is_err() {
                    break; // Client disconnected
                }
            }
            Err(e) => {
                eprintln!("Failed to serialize dashboard state: {}", e);
                break;
            }
        }
    }
}
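For manual testing, a throwaway client along these lines can print the JSON snapshots pushed on /ws once per second. It assumes the tokio-tungstenite and futures crates, which are not necessarily dependencies of this repository, and an illustrative dashboard port of 8080.

use futures::StreamExt;

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    // Port 8080 is illustrative; use whatever init_monitoring was given.
    let (mut ws, _response) =
        tokio_tungstenite::connect_async("ws://localhost:8080/ws").await?;

    // Print the first few 1-second dashboard snapshots, then stop.
    for _ in 0..3 {
        if let Some(Ok(msg)) = ws.next().await {
            println!("{}", msg.to_text()?);
        }
    }
    Ok(())
}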
516
src/scraper/docker_vpn_proxy.rs
Normal file
@@ -0,0 +1,516 @@
use anyhow::{anyhow, Context, Result};
use futures::future::join_all;
use std::{collections::HashSet, path::{Path, PathBuf}, sync::{Arc, RwLock}, time::Duration};
use tokio::{process::Command, time::{sleep}};
use walkdir::WalkDir;

pub struct DockerVpnProxyPool {
    container_names: Vec<String>,
    proxy_ports: Vec<u16>, // e.g., [10801, 10802, ...]
    dead_proxies: Arc<RwLock<HashSet<usize>>>,
}

impl DockerVpnProxyPool {
    pub async fn new(
        ovpn_dir: &Path,
        username: String,
        password: String,
        instances_per_ovpn: usize,
    ) -> Result<Self> {
        if instances_per_ovpn == 0 {
            return Err(anyhow!("instances_per_ovpn must be at least 1"));
        }

        // Count hostnames (subdirs in ovpn_dir)
        let hostnames: Vec<_> = std::fs::read_dir(ovpn_dir)?
            .filter_map(Result::ok)
            .filter(|e| e.path().is_dir())
            .map(|e| e.file_name().into_string().unwrap())
            .collect();

        let num_servers = hostnames.len();
        if num_servers == 0 {
            return Err(anyhow!("No VPN hostnames found in {:?}", ovpn_dir));
        }

        // Calculate total containers: hostnames × instances_per_ovpn
        let total_containers = num_servers * instances_per_ovpn;

        crate::util::logger::log_info(&format!(
            "Found {} VPN hostnames × {} instances = {} total containers",
            num_servers, instances_per_ovpn, total_containers
        )).await;

        let mut container_names = Vec::with_capacity(total_containers);
        let mut proxy_ports = Vec::with_capacity(total_containers);
        let base_port: u16 = 10800;
        let mut port_counter = 0u16;

        // === STEP 1: Start ALL containers first ===
        for hostname in hostnames.iter() {
            // Pick tcp443.ovpn if exists, else first .ovpn
            let hostname_dir = ovpn_dir.join(hostname);
            let mut ovpn_path: Option<PathBuf> = None;
            for entry in WalkDir::new(&hostname_dir).max_depth(1) {
                let entry = entry?;
                if entry.path().extension().map_or(false, |ext| ext == "ovpn") {
                    if entry.file_name().to_str().unwrap_or("").contains("tcp443") {
                        ovpn_path = Some(entry.path().to_path_buf());
                        break;
                    } else if ovpn_path.is_none() {
                        ovpn_path = Some(entry.path().to_path_buf());
                    }
                }
            }

            let ovpn_path = ovpn_path.ok_or_else(|| anyhow!("No .ovpn found for {}", hostname))?;

            // Spawn multiple instances for this .ovpn file
            for instance_num in 0..instances_per_ovpn {
                let name = format!("vpn-proxy-{}-{}", hostname, instance_num);
                let port = base_port + port_counter + 1;
                port_counter += 1;

                // Clean up any existing container with the same name
                let _ = Command::new("docker")
                    .args(["rm", "-f", &name])
                    .status()
                    .await;

                // Run Docker container
                let status = Command::new("docker")
                    .args([
                        "run", "-d",
                        "--name", &name,
                        "--cap-add=NET_ADMIN",
                        "--device", "/dev/net/tun",
                        "--sysctl", "net.ipv4.ip_forward=1",
                        "-v", &format!("{}:/vpn/config.ovpn", ovpn_path.display()),
                        "-e", &format!("VPN_USERNAME={}", username),
                        "-e", &format!("VPN_PASSWORD={}", password),
                        "-p", &format!("{}:1080", port),
                        "rust-vpn-proxy",
                    ])
                    .status()
                    .await
                    .context("Failed to run Docker")?;

                if !status.success() {
                    return Err(anyhow!("Docker run failed for {}", name));
                }

                crate::util::logger::log_info(&format!(
                    "Started container {} on port {} (using {})",
                    name, port, ovpn_path.file_name().unwrap().to_string_lossy()
                )).await;

                container_names.push(name);
                proxy_ports.push(port);
            }
        }

        // Brief pause to let containers start
        sleep(Duration::from_secs(8)).await;
        crate::util::logger::log_info(&format!(
            "All {} containers started, beginning health checks...",
            container_names.len()
        )).await;

        // === STEP 2: Test ALL proxies in parallel ===
        let results = Self::test_all_proxies_parallel(&container_names, &proxy_ports).await;

        // Filter out failed containers
        let mut working_containers = Vec::new();
        let mut working_ports = Vec::new();
        let mut failed_count = 0;

        for (i, (container_name, port)) in container_names.into_iter().zip(proxy_ports.into_iter()).enumerate() {
            match &results[i] {
                Ok(Some(ip)) => {
                    crate::util::logger::log_info(&format!(
                        "✓ Container {} on port {} ready with IP: {}",
                        container_name, port, ip
                    )).await;
                    working_containers.push(container_name);
                    working_ports.push(port);
                }
                Ok(None) => {
                    let logs = Command::new("docker")
                        .args(["logs", "--tail", "20", &container_name])
                        .output()
                        .await
                        .ok()
                        .and_then(|output| String::from_utf8_lossy(&output.stdout).to_string().into());

                    crate::util::logger::log_error(&format!(
                        "✗ Container {} on port {} ready but IP detection failed. Logs: {:?}",
                        container_name, port, logs
                    )).await;
                    failed_count += 1;
                    // Clean up failed container
                    let _ = Self::cleanup_container(&container_name).await;
                }
                Err(e) => {
                    let logs = Command::new("docker")
                        .args(["logs", "--tail", "20", &container_name])
                        .output()
                        .await
                        .ok()
                        .and_then(|output| String::from_utf8_lossy(&output.stdout).to_string().into());

                    crate::util::logger::log_error(&format!(
                        "✗ Container {} on port {} failed: {}. Logs: {:?}",
                        container_name, port, e, logs
                    )).await;
                    failed_count += 1;
                    // Clean up failed container
                    let _ = Self::cleanup_container(&container_name).await;
                }
            }
        }

        if working_containers.is_empty() {
            return Err(anyhow!("All {} VPN proxy containers failed to start", total_containers));
        }

        crate::util::logger::log_info(&format!(
            "Started {}/{} VPN proxy containers successfully ({} hostnames × {} instances)",
            working_containers.len(), total_containers, num_servers, instances_per_ovpn
        )).await;

        if failed_count > 0 {
            crate::util::logger::log_warn(&format!(
                "{} containers failed and were cleaned up",
                failed_count
            )).await;
        }

        Ok(Self {
            container_names: working_containers,
            proxy_ports: working_ports,
            dead_proxies: Arc::new(RwLock::new(HashSet::new())),
        })
    }

    /// Test all proxies in parallel with 10-second intervals between tests
    async fn test_all_proxies_parallel(container_names: &[String], proxy_ports: &[u16]) -> Vec<Result<Option<String>>> {
        let mut tasks = Vec::new();

        for (_i, (container_name, port)) in container_names.iter().zip(proxy_ports.iter()).enumerate() {
            let name = container_name.clone();
            let port = *port;

            tasks.push(tokio::spawn(async move {
                // Try up to 6 times with 10-second intervals (total 60 seconds)
                for attempt in 1..=6 {
                    crate::util::logger::log_info(&format!("Testing proxy {} (port {}) - Attempt {}/6",
                        name, port, attempt)).await;

                    match Self::test_single_proxy(port).await {
                        Ok(Some(ip)) => {
                            return Ok(Some(ip));
                        }
                        Ok(None) => {
                            // Connection works but IP detection failed
                            return Ok(None);
                        }
                        Err(e) if attempt < 6 => {
                            crate::util::logger::log_info(&format!("Attempt {}/6 for {}: {} - retrying in 10s",
                                attempt, name, e)).await;
                            sleep(Duration::from_secs(10)).await;
                        }
                        Err(e) => {
                            return Err(anyhow!("Failed after 6 attempts: {}", e));
                        }
                    }
                }
                Err(anyhow!("Unexpected exit from retry loop"))
            }));
        }

        // Wait for all tasks to complete
        join_all(tasks)
            .await
            .into_iter()
            .map(|result| match result {
                Ok(inner) => inner,
                Err(e) => Err(anyhow!("Task panicked: {}", e)),
            })
            .collect()
    }

    /// Test a single proxy connection
    async fn test_single_proxy(port: u16) -> Result<Option<String>> {
        use std::io::{Read, Write};
        use std::net::TcpStream;
        use std::time::Duration as StdDuration;

        // First, test SOCKS5 handshake directly
        crate::util::logger::log_info(&format!("Testing SOCKS5 handshake on port {}...", port)).await;

        // Use spawn_blocking for synchronous I/O
        let test_result = tokio::task::spawn_blocking(move || {
            // Connect to SOCKS5 proxy
            let mut stream = match TcpStream::connect_timeout(
                &format!("127.0.0.1:{}", port).parse().unwrap(),
                StdDuration::from_secs(5)
            ) {
                Ok(stream) => stream,
                Err(e) => return Err(anyhow!("Failed to connect: {}", e)),
            };

            // Send SOCKS5 greeting: version 5, 1 method (no auth)
            let greeting: [u8; 3] = [0x05, 0x01, 0x00]; // SOCKS5, 1 method, no auth
            if let Err(e) = stream.write_all(&greeting) {
                return Err(anyhow!("Failed to send greeting: {}", e));
            }

            // Read response
            let mut response = [0u8; 2];
            if let Err(e) = stream.read_exact(&mut response) {
                return Err(anyhow!("Failed to read response: {}", e));
            }

            // Check response: should be [0x05, 0x00] for no auth required
            if response[0] != 0x05 || response[1] != 0x00 {
                return Err(anyhow!("Unexpected SOCKS5 response: {:?}", response));
            }

            Ok(())
        }).await;

        match test_result {
            Ok(Ok(())) => {
                crate::util::logger::log_info(&format!("✓ SOCKS5 proxy on port {} accepts connections", port)).await;

                // Try to get IP through proxy using curl (fallback method)
                let curl_result = tokio::process::Command::new("curl")
                    .args([
                        "-s",
                        "--socks5", &format!("localhost:{}", port),
                        "--max-time", "10",
                        "https://checkip.amazonaws.com"
                    ])
                    .output()
                    .await;

                match curl_result {
                    Ok(output) if output.status.success() => {
                        let ip = String::from_utf8_lossy(&output.stdout).trim().to_string();
                        if Self::is_valid_ip(&ip) {
                            crate::util::logger::log_info(&format!("✓ Got IP via proxy: {}", ip)).await;
                            return Ok(Some(ip));
                        } else {
                            crate::util::logger::log_info(&format!("✓ Proxy works, invalid IP format: {}", ip)).await;
                            return Ok(None);
                        }
                    }
                    _ => {
                        // Proxy accepts connections but curl failed - still acceptable
                        crate::util::logger::log_info(&format!("✓ Proxy accepts connections (curl test failed)")).await;
                        return Ok(None);
                    }
                }
            }
            Ok(Err(e)) => {
                return Err(anyhow!("SOCKS5 test failed: {}", e));
            }
            Err(e) => {
                return Err(anyhow!("Task failed: {}", e));
            }
        }
    }

    /// Clean up a failed container
    async fn cleanup_container(container_name: &str) -> Result<()> {
        let _ = Command::new("docker")
            .args(["stop", container_name])
            .status()
            .await;

        let _ = Command::new("docker")
            .args(["rm", container_name])
            .status()
            .await;

        Ok(())
    }

    fn is_valid_ip(ip: &str) -> bool {
        let parts: Vec<&str> = ip.split('.').collect();
        if parts.len() != 4 {
            return false;
        }

        for part in parts {
            if let Ok(num) = part.parse::<u8>() {
                if part != num.to_string() {
                    return false;
                }
            } else {
                return false;
            }
        }

        true
    }

    pub fn get_proxy_url(&self, index: usize) -> String {
        let port = self.proxy_ports[index % self.proxy_ports.len()];
        format!("socks5h://localhost:{}", port)
    }

    pub fn num_proxies(&self) -> usize {
        self.proxy_ports.len()
    }

    pub async fn shutdown(&self) -> Result<()> {
        crate::util::logger::log_info(&format!("Shutting down {} Docker proxy containers...",
            self.container_names.len())).await;

        for name in &self.container_names {
            let _ = Command::new("docker")
                .args(["stop", name])
                .status()
                .await;
            let _ = Command::new("docker")
                .args(["rm", name])
                .status()
                .await;
        }
        Ok(())
    }

    /// Get ProxyInfo for monitoring dashboard
    pub fn get_proxy_info(&self, index: usize) -> Option<crate::monitoring::ProxyInfo> {
        if index >= self.container_names.len() {
            return None;
        }

        Some(crate::monitoring::ProxyInfo {
            container_name: self.container_names[index].clone(),
            ip_address: "127.0.0.1".to_string(), // SOCKS5 proxy on localhost
            port: self.proxy_ports[index],
            status: crate::monitoring::ProxyStatus::Connected,
        })
    }

    /// Get container name by index
    pub fn get_container_name(&self, index: usize) -> Option<String> {
        self.container_names.get(index).cloned()
    }

    // Get a healthy proxy URL (skips dead proxies)
    pub async fn get_healthy_proxy_url(&self, start_index: usize) -> Option<(usize, String)> {
        let dead = match self.dead_proxies.read() {
            Ok(value) => value,
            Err(_) => return None,
        };
        let total = self.proxy_ports.len();

        // Try up to 'total' proxies starting from start_index
        for attempt in 0..total {
            let index = (start_index + attempt) % total;

            // Skip if dead
            if dead.contains(&index) {
                continue;
            }

            let port = self.proxy_ports[index];
            return Some((index, format!("socks5h://localhost:{}", port)));
        }

        None
    }

    // Mark a proxy as dead
    pub async fn mark_proxy_dead(&self, index: usize) -> Option<bool> {
        // Acquire lock, perform mutation, and get values for logging
        let (port, remaining, total) = {
            let mut dead = match self.dead_proxies.write() {
                Ok(value) => value,
                Err(_) => return None,
            };
            dead.insert(index);

            let port = self.proxy_ports.get(index).copied().unwrap_or(0);
            let remaining = self.proxy_ports.len() - dead.len();
            let total = self.proxy_ports.len();

            // Lock is automatically dropped here when the scope ends
            (port, remaining, total)
        };

        // Now we can await without holding the lock
        crate::util::logger::log_warn(&format!(
            "⚠ Marked proxy {} (port {}) as DEAD ({}/{} proxies remaining)",
            index,
            port,
            remaining,
            total
        )).await;

        Some(true)
    }

    // Get count of healthy proxies
    pub async fn num_healthy_proxies(&self) -> Option<usize> {
        let dead = match self.dead_proxies.read() {
            Ok(value) => value,
            Err(_) => return None,
        };
        Some(self.proxy_ports.len() - dead.len())
    }
}

pub async fn cleanup_all_proxy_containers() -> Result<()> {
    // Step 1: List all container IDs that match our pattern
    let output = Command::new("docker")
        .args(["ps", "-a", "--format", "{{.ID}} {{.Names}} {{.Image}}"])
        .output()
        .await?;

    let stdout = String::from_utf8_lossy(&output.stdout);

    let mut containers_to_kill = Vec::new();

    for line in stdout.lines() {
        let parts: Vec<&str> = line.split_whitespace().collect();
        if parts.len() >= 2 {
            let name_or_id = parts[0];
            let name = parts[1];
            let image = if parts.len() >= 3 { parts[2] } else { "" };

            // Match by name prefix OR by image name
            if name.starts_with("vpn-proxy-") || image.contains("rust-vpn-proxy") {
                containers_to_kill.push(name_or_id.to_string());
            }
        }
    }

    if containers_to_kill.is_empty() {
        crate::util::logger::log_info("No old rust-vpn-proxy containers found").await;
        return Ok(());
    }

    // Step 2: Kill and remove them all at once
    let status = Command::new("docker")
        .arg("rm")
        .arg("-f")
        .args(&containers_to_kill)
        .status()
        .await?;

    if status.success() {
        crate::util::logger::log_info(&format!(
            "Successfully removed {} old rust-vpn-proxy container(s)",
            containers_to_kill.len()
        ))
        .await;
    } else {
        crate::util::logger::log_warn("Some containers may still remain (non-critical)").await;
    }

    Ok(())
}
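A minimal sketch of routing an HTTP client through one of the pool's SOCKS5 proxies and retiring the proxy on a connection failure. It assumes reqwest is built with the "socks" feature; the starting index and target URL are illustrative.

async fn fetch_via_proxy(pool: &DockerVpnProxyPool) -> anyhow::Result<String> {
    // Pick the first proxy that has not been marked dead.
    let (index, proxy_url) = pool
        .get_healthy_proxy_url(0)
        .await
        .ok_or_else(|| anyhow::anyhow!("no healthy proxies left"))?;

    let client = reqwest::Client::builder()
        .proxy(reqwest::Proxy::all(proxy_url.as_str())?)
        .build()?;

    match client.get("https://checkip.amazonaws.com").send().await {
        Ok(resp) => Ok(resp.text().await?),
        Err(e) => {
            // On a connection-level failure, take this proxy out of rotation.
            let _ = pool.mark_proxy_dead(index).await;
            Err(e.into())
        }
    }
}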
377
src/scraper/hard_reset.rs
Normal file
@@ -0,0 +1,377 @@
// src/scraper/hard_reset.rs - FIXED: Proper cleanup without Arc leaks
use std::sync::{Arc, atomic::{AtomicBool, AtomicUsize, Ordering}};

use crate::{ChromeDriverPool, Config, logger, scraper::docker_vpn_proxy::{DockerVpnProxyPool, cleanup_all_proxy_containers}, util::directories::DataPaths};

/// Simple error counter for triggering hard resets
pub struct HardResetController {
    consecutive_errors: AtomicUsize,
}

impl HardResetController {
    pub fn new() -> Self {
        Self {
            consecutive_errors: AtomicUsize::new(0),
        }
    }

    /// Record success - resets counter
    pub fn record_success(&self) {
        self.consecutive_errors.store(0, Ordering::SeqCst);
    }

    /// Record error - returns new count
    pub fn record_error(&self) -> usize {
        self.consecutive_errors.fetch_add(1, Ordering::SeqCst) + 1
    }

    /// Reset counter
    pub fn reset(&self) {
        self.consecutive_errors.store(0, Ordering::SeqCst);
    }

    /// Get current count
    pub fn get_count(&self) -> usize {
        self.consecutive_errors.load(Ordering::SeqCst)
    }
}

/// ✅ FIXED: Perform hard reset without Arc reference leaks
///
/// Key improvements:
/// 1. Don't clone old_pool - just shutdown through mutex guard
/// 2. Verify all processes killed before creating new pool
/// 3. Explicitly shutdown temp pools with error handling
/// 4. Add process counting/verification
pub async fn perform_hard_reset(
    pool_mutex: &Arc<tokio::sync::Mutex<Arc<ChromeDriverPool>>>,
    config: &Config,
    paths: &DataPaths,
    monitoring: &Option<crate::monitoring::MonitoringHandle>,
    shutdown_flag: &Arc<AtomicBool>,
) -> anyhow::Result<()> {
    //let number_proxy_instances = config.proxy_instances_per_certificate.unwrap_or(1);
    logger::log_error("🔴 STARTING HARD RESET SEQUENCE").await;

    // Check if shutdown was requested
    if shutdown_flag.load(Ordering::SeqCst) {
        logger::log_warn("Shutdown requested during hard reset, aborting").await;
        return Ok(());
    }

    // ===== STEP 1: ACQUIRE POOL LOCK (NO CLONING!) =====
    logger::log_info(" [1/12] Acquiring pool lock...").await;
    let mut pool_guard = pool_mutex.lock().await;

    // Get instance count before shutdown for verification
    let old_instance_count = pool_guard.get_number_of_instances();
    logger::log_info(&format!(" [1/12] Pool has {} instances", old_instance_count)).await;

    // ===== STEP 2: SHUTDOWN OLD POOL (NO ARC CLONE!) =====
    logger::log_info(" [2/12] Shutting down old pool (NO Arc clone)...").await;

    // Shutdown through the Arc without cloning it
    // This is safe because we hold the mutex lock
    match pool_guard.shutdown().await {
        Ok(()) => {
            logger::log_info(" [2/12] ✓ Pool shutdown complete").await;
        }
        Err(e) => {
            logger::log_error(&format!(" [2/12] ✗ Pool shutdown error: {}", e)).await;
            // Continue anyway - we'll force-kill processes
        }
    }

    // ===== STEP 3: FORCE-KILL ANY REMAINING CHROME PROCESSES =====
    logger::log_info(" [3/12] Force-killing any remaining Chrome/ChromeDriver processes...").await;

    #[cfg(target_os = "windows")]
    {
        // Kill all chrome.exe processes
        let chrome_result = tokio::process::Command::new("taskkill")
            .args(["/F", "/IM", "chrome.exe"])
            .output()
            .await;

        match chrome_result {
            Ok(output) if output.status.success() => {
                logger::log_info(" [3/12] ✓ Chrome processes killed").await;
            }
            _ => {
                logger::log_info(" [3/12] ⊘ No Chrome processes found").await;
            }
        }

        // Kill all chromedriver.exe processes
        let chromedriver_result = tokio::process::Command::new("taskkill")
            .args(["/F", "/IM", "chromedriver.exe"])
            .output()
            .await;

        match chromedriver_result {
            Ok(output) if output.status.success() => {
                logger::log_info(" [3/12] ✓ ChromeDriver processes killed").await;
            }
            _ => {
                logger::log_info(" [3/12] ⊘ No ChromeDriver processes found").await;
            }
        }
    }

    #[cfg(not(target_os = "windows"))]
    {
        // Kill all chrome processes
        let _ = tokio::process::Command::new("pkill")
            .arg("chrome")
            .output()
            .await;

        let _ = tokio::process::Command::new("pkill")
            .arg("chromedriver")
            .output()
            .await;

        logger::log_info(" [3/12] ✓ Force-killed Chrome/ChromeDriver").await;
    }

    // ===== STEP 4: SHUTDOWN PROXIES =====
    logger::log_info(" [4/12] Shutting down proxy containers...").await;
    cleanup_all_proxy_containers().await.ok();

    // ===== STEP 5: WAIT FOR CLEANUP =====
    logger::log_info(" [5/12] Waiting 30 seconds for cleanup...").await;
    tokio::time::sleep(tokio::time::Duration::from_secs(30)).await;

    // ===== STEP 6: VERIFY CLEANUP =====
    logger::log_info(" [6/12] Verifying process cleanup...").await;

    #[cfg(target_os = "windows")]
    {
        let check_chrome = tokio::process::Command::new("tasklist")
            .args(["/FI", "IMAGENAME eq chrome.exe"])
            .output()
            .await;

        if let Ok(output) = check_chrome {
            let stdout = String::from_utf8_lossy(&output.stdout);
            let chrome_count = stdout.lines().filter(|line| line.contains("chrome.exe")).count();

            if chrome_count > 0 {
                logger::log_warn(&format!(" [6/12] ⚠️ {} Chrome processes still running!", chrome_count)).await;
            } else {
                logger::log_info(" [6/12] ✓ No Chrome processes running").await;
            }
        }
    }

    // Check shutdown again
    if shutdown_flag.load(Ordering::SeqCst) {
        logger::log_warn("Shutdown requested during cleanup, aborting reset").await;
        return Ok(());
    }

    // ===== STEP 7: RECREATE PROXY POOL =====
    logger::log_info(" [7/12] Recreating proxy pool...").await;
    let new_proxy_pool = if config.enable_vpn_rotation {
        match recreate_proxy_pool_with_fresh_credentials(config, paths, monitoring, shutdown_flag).await {
            Ok(pool) => {
                logger::log_info(&format!(
                    " [7/12] ✓ Proxy pool created with {} proxies",
                    pool.num_proxies()
                )).await;
                Some(pool)
            }
            Err(e) => {
                logger::log_warn(&format!(
                    " [7/12] ⚠️ Proxy creation failed: {}. Continuing without proxies.",
                    e
                )).await;
                None
            }
        }
    } else {
        logger::log_info(" [7/12] ⊘ VPN rotation disabled, skipping proxy pool").await;
        None
    };

    // ===== STEP 8: RECREATE CHROMEDRIVER POOL =====
    logger::log_info(" [8/12] Recreating ChromeDriver pool...").await;
    let new_pool = Arc::new(
        ChromeDriverPool::new_with_proxy_and_task_limit(
            new_proxy_pool,
            config,
            monitoring.clone(),
        ).await?
    );

    logger::log_info(&format!(
        " [8/12] ✓ ChromeDriver pool created with {} instances",
        new_pool.get_number_of_instances()
    )).await;

    // ===== STEP 9: RESET ERROR COUNTER =====
    logger::log_info(" [9/12] Resetting error counter...").await;
    new_pool.get_reset_controller().reset();
    logger::log_info(" [9/12] ✓ Error counter cleared").await;

    // ===== STEP 10: REPLACE POOL ATOMICALLY =====
    logger::log_info(" [10/12] Activating new pool...").await;
    *pool_guard = new_pool;
    drop(pool_guard);
    logger::log_info(" [10/12] ✓ New pool activated").await;

    // ===== STEP 11: EMIT MONITORING EVENT =====
    logger::log_info(" [11/12] Updating monitoring...").await;
    if let Some(mon) = monitoring {
        mon.emit(crate::monitoring::MonitoringEvent::PoolInitialized {
            pool_size: config.max_parallel_instances,
            with_proxy: config.enable_vpn_rotation,
            with_rotation: config.max_tasks_per_instance > 0,
        });
    }

    // ===== STEP 12: FINAL VERIFICATION =====
    logger::log_info(" [12/12] Final verification...").await;

    #[cfg(target_os = "windows")]
    {
        let check_chrome = tokio::process::Command::new("tasklist")
            .args(["/FI", "IMAGENAME eq chrome.exe"])
            .output()
            .await;

        if let Ok(output) = check_chrome {
            let stdout = String::from_utf8_lossy(&output.stdout);
            let chrome_count = stdout.lines().filter(|line| line.contains("chrome.exe")).count();
            logger::log_info(&format!(" [12/12] Chrome processes: {}", chrome_count)).await;
        }

        let check_chromedriver = tokio::process::Command::new("tasklist")
            .args(["/FI", "IMAGENAME eq chromedriver.exe"])
            .output()
            .await;

        if let Ok(output) = check_chromedriver {
            let stdout = String::from_utf8_lossy(&output.stdout);
            let chromedriver_count = stdout.lines().filter(|line| line.contains("chromedriver.exe")).count();
            logger::log_info(&format!(" [12/12] ChromeDriver processes: {}", chromedriver_count)).await;
        }
    }

    logger::log_info("✅ HARD RESET COMPLETE").await;

    Ok(())
}

/// ✅ FIXED: Recreate proxy pool with temp pool that's properly shut down
async fn recreate_proxy_pool_with_fresh_credentials(
    config: &Config,
    paths: &DataPaths,
    monitoring: &Option<crate::monitoring::MonitoringHandle>,
    shutdown_flag: &Arc<AtomicBool>,
) -> anyhow::Result<Arc<DockerVpnProxyPool>> {

    let number_proxy_instances = config.proxy_instances_per_certificate.unwrap_or(1);

    // Check shutdown
    if shutdown_flag.load(Ordering::SeqCst) {
        return Err(anyhow::anyhow!("Shutdown requested during proxy recreation"));
    }

    logger::log_info(" [7.1] Creating temporary ChromeDriver pool for credential fetch...").await;

    // Create temporary pool WITHOUT proxy
    let temp_pool = Arc::new(
        ChromeDriverPool::new_with_proxy_and_task_limit(
            None, // No proxy for temp pool
            config,
            monitoring.clone(),
        ).await?
    );

    logger::log_info(" [7.2] Fetching fresh VPNBook credentials...").await;

    // Fetch fresh VPNBook credentials
    let (username, password, _files) = crate::util::opnv::fetch_vpnbook_configs(
        &temp_pool,
        paths.cache_dir()
    ).await?;

    logger::log_info(&format!(" [7.3] Got credentials → User: {}", username)).await;

    // ✅ FIXED: Properly shutdown temp pool with error handling
    logger::log_info(" [7.4] Shutting down temporary pool...").await;
    match temp_pool.shutdown().await {
        Ok(()) => {
            logger::log_info(" [7.4] ✓ Temp pool shut down successfully").await;
        }
        Err(e) => {
            logger::log_error(&format!(" [7.4] ✗ Temp pool shutdown error: {}", e)).await;
            // Force-kill processes as backup
            #[cfg(target_os = "windows")]
            {
                let _ = tokio::process::Command::new("taskkill")
                    .args(["/F", "/IM", "chrome.exe"])
                    .output()
                    .await;
                let _ = tokio::process::Command::new("taskkill")
                    .args(["/F", "/IM", "chromedriver.exe"])
                    .output()
                    .await;
            }
        }
    }

    // Wait a moment for temp pool cleanup
    tokio::time::sleep(tokio::time::Duration::from_secs(2)).await;

    // Check shutdown again
    if shutdown_flag.load(Ordering::SeqCst) {
        return Err(anyhow::anyhow!("Shutdown requested during proxy recreation"));
    }

    // Check if we have VPN server configs
    let server_count = std::fs::read_dir(paths.cache_openvpn_dir())?
        .filter(|e| e.as_ref().unwrap().path().is_dir())
        .count();

    if server_count == 0 {
        return Err(anyhow::anyhow!("No VPN servers found after credential fetch"));
    }

    logger::log_info(&format!(
        " [7.5] Found {} VPN servers → Creating proxy pool with {} instances per server...",
        server_count,
        number_proxy_instances
    )).await;

    // Create new proxy pool
    let proxy_pool = Arc::new(
        DockerVpnProxyPool::new(
            paths.cache_openvpn_dir(),
            username,
            password,
            number_proxy_instances,
        ).await?
    );

    logger::log_info(&format!(
        " [7.6] ✓ Proxy pool ready with {} total proxies",
        proxy_pool.num_proxies()
    )).await;

    // Emit proxy connected events for monitoring
    if let Some(mon) = monitoring {
        for i in 0..proxy_pool.num_proxies() {
            if let Some(proxy_info) = proxy_pool.get_proxy_info(i) {
                mon.emit(crate::monitoring::MonitoringEvent::ProxyConnected {
                    container_name: proxy_info.container_name.clone(),
                    ip_address: proxy_info.ip_address.clone(),
                    port: proxy_info.port,
                });
            }
        }
    }

    Ok(proxy_pool)
}
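A sketch of how the error counter could gate the reset inside the scraping loop. The threshold of 5 is illustrative, and it assumes that get_reset_controller() (used in step 9 above) exposes the HardResetController methods; the surrounding types are the same as in perform_hard_reset.

const HARD_RESET_THRESHOLD: usize = 5; // illustrative value

async fn handle_task_result(
    result: &anyhow::Result<()>,
    pool_mutex: &Arc<tokio::sync::Mutex<Arc<ChromeDriverPool>>>,
    config: &Config,
    paths: &DataPaths,
    monitoring: &Option<crate::monitoring::MonitoringHandle>,
    shutdown_flag: &Arc<AtomicBool>,
) -> anyhow::Result<()> {
    match result {
        // A success clears the consecutive-error streak.
        Ok(()) => pool_mutex.lock().await.get_reset_controller().record_success(),
        Err(_) => {
            let count = pool_mutex.lock().await.get_reset_controller().record_error();
            if count >= HARD_RESET_THRESHOLD {
                perform_hard_reset(pool_mutex, config, paths, monitoring, shutdown_flag).await?;
            }
        }
    }
    Ok(())
}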
14
src/scraper/helpers.rs
Normal file
@@ -0,0 +1,14 @@
use rand::rngs::StdRng;
use rand::prelude::{Rng, SeedableRng, IndexedRandom};

/// Send-safe random range
pub fn random_range(min: u64, max: u64) -> u64 {
    let mut rng = StdRng::from_rng(&mut rand::rng());
    rng.random_range(min..max)
}

/// Send-safe random choice
pub fn choose_random<T: Clone>(items: &[T]) -> T {
    let mut rng = StdRng::from_rng(&mut rand::rng());
    items.choose(&mut rng).unwrap().clone()
}
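A small usage sketch of the helpers for humanised delays and user-agent picking; the bounds and agent strings are illustrative, not taken from this diff.

async fn humanised_pause_and_agent() -> String {
    // Sleep a random 0.5-1.5 s between requests.
    let pause_ms = random_range(500, 1_500);
    tokio::time::sleep(std::time::Duration::from_millis(pause_ms)).await;

    // Pick one user agent at random (illustrative strings).
    let agents = vec![
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64)".to_string(),
        "Mozilla/5.0 (X11; Linux x86_64)".to_string(),
    ];
    choose_random(&agents)
}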
@@ -1 +1,6 @@
pub mod webdriver;
pub mod docker_vpn_proxy;
pub mod helpers;
pub mod hard_reset;
pub mod yahoo;
pub mod openfigi;
367
src/scraper/openfigi.rs
Normal file
@@ -0,0 +1,367 @@
// src/scraper/openfigi.rs - STREAMING VERSION
// Key changes: Never load entire GLEIF CSV or FIGI maps into memory

use crate::util::directories::DataPaths;
use crate::util::logger;
use crate::corporate::{types::*};
use reqwest::Client as HttpClient;
use reqwest::header::{HeaderMap, HeaderValue};
use serde_json::{json, Value};
use std::path::Path;
use tokio::time::{sleep, Duration};
use tokio::fs as tokio_fs;
use anyhow::{Context, anyhow};

#[derive(Clone)]
pub struct OpenFigiClient {
    pub client: HttpClient,
    pub has_key: bool,
}

impl OpenFigiClient {
    pub async fn new() -> anyhow::Result<Self> {
        let api_key = dotenvy::var("OPENFIGI_API_KEY").ok();
        let has_key = api_key.is_some();

        let mut builder = HttpClient::builder()
            .user_agent("Mozilla/5.0 (compatible; OpenFIGI-Rust/1.0)")
            .timeout(Duration::from_secs(30));

        if let Some(key) = &api_key {
            let mut headers = HeaderMap::new();
            headers.insert("X-OPENFIGI-APIKEY", HeaderValue::from_str(key)?);
            builder = builder.default_headers(headers);
        }

        let client = builder.build().context("Failed to build HTTP client")?;
        logger::log_info(&format!("OpenFIGI client: {}",
            if has_key { "with API key" } else { "no key" })).await;

        Ok(Self { client, has_key })
    }

    pub async fn map_isins_to_figi_infos(&self, isins: &[String]) -> anyhow::Result<Vec<FigiData>> {
        if isins.is_empty() {
            return Ok(vec![]);
        }

        let mut all_figi_infos = Vec::new();
        let chunk_size = if self.has_key { 100 } else { 5 };
        let inter_sleep = if self.has_key {
            Duration::from_millis(240)
        } else {
            Duration::from_millis(2400)
        };

        for chunk in isins.chunks(chunk_size) {
            let jobs: Vec<Value> = chunk.iter()
                .map(|isin| json!({
                    "idType": "ID_ISIN",
                    "idValue": isin,
                }))
                .collect();

            let mut retry_count = 0;
            let max_retries = 5;
            let mut backoff_ms = 1000u64;

            loop {
                let resp_result = self.client
                    .post("https://api.openfigi.com/v3/mapping")
                    .header("Content-Type", "application/json")
                    .json(&jobs)
                    .send()
                    .await;

                let resp = match resp_result {
                    Ok(r) => r,
                    Err(e) => {
                        retry_count += 1;
                        if retry_count >= max_retries {
                            let err_msg = format!("Failed to send mapping request after {} retries: {}", max_retries, e);
                            logger::log_error(&err_msg).await;
                            return Err(anyhow!(err_msg));
                        }
                        let warn_msg = format!("Transient error sending mapping request (attempt {}/{}): {}", retry_count, max_retries, e);
                        logger::log_warn(&warn_msg).await;
                        let retry_msg = format!(" Retrying in {}ms...", backoff_ms);
                        logger::log_info(&retry_msg).await;
                        sleep(Duration::from_millis(backoff_ms)).await;
                        backoff_ms = (backoff_ms * 2).min(60000); // Cap at 60s
                        continue;
                    }
                };

                let status = resp.status();
                let headers = resp.headers().clone();
                let body = resp.text().await?;

                if status == 429 {
|
||||||
|
let reset_sec = headers
|
||||||
|
.get("ratelimit-reset")
|
||||||
|
.and_then(|v| v.to_str().ok())
|
||||||
|
.and_then(|s| s.parse::<u64>().ok())
|
||||||
|
.unwrap_or(10);
|
||||||
|
sleep(Duration::from_secs(reset_sec.max(10))).await;
|
||||||
|
continue;
|
||||||
|
} else if !status.is_success() {
|
||||||
|
if status.is_server_error() && retry_count < max_retries {
|
||||||
|
retry_count += 1;
|
||||||
|
sleep(Duration::from_millis(backoff_ms)).await;
|
||||||
|
backoff_ms = (backoff_ms * 2).min(60000);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
return Err(anyhow!("OpenFIGI error {}: {}", status, body));
|
||||||
|
}
|
||||||
|
|
||||||
|
let results: Vec<Value> = serde_json::from_str(&body)?;
|
||||||
|
|
||||||
|
for (isin, result) in chunk.iter().zip(results) {
|
||||||
|
if let Some(data) = result["data"].as_array() {
|
||||||
|
for item in data {
|
||||||
|
if let Some(figi) = item["figi"].as_str() {
|
||||||
|
all_figi_infos.push(FigiData {
|
||||||
|
isin: isin.clone(),
|
||||||
|
figi: figi.to_string(),
|
||||||
|
name: item["name"].as_str().unwrap_or("").to_string(),
|
||||||
|
ticker: item["ticker"].as_str().unwrap_or("").to_string(),
|
||||||
|
exch_code: item["exchCode"].as_str().unwrap_or("").to_string(),
|
||||||
|
composite_figi: item["compositeFIGI"].as_str().unwrap_or("").to_string(),
|
||||||
|
security_type: item["securityType"].as_str().unwrap_or("").to_string(),
|
||||||
|
market_sector: item["marketSector"].as_str().unwrap_or("").to_string(),
|
||||||
|
share_class_figi: item["shareClassFIGI"].as_str().unwrap_or("").to_string(),
|
||||||
|
security_type2: item["securityType2"].as_str().unwrap_or("").to_string(),
|
||||||
|
security_description: item["securityDescription"].as_str().unwrap_or("").to_string(),
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
sleep(inter_sleep).await;
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(all_figi_infos)
|
||||||
|
}
|
||||||
|
}
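// Usage sketch (assumed call site, not part of the diff): map a batch of ISINs and print the
// FIGI results; FigiData comes from crate::corporate::types as imported at the top of this file.
async fn example_map_isins(isins: Vec<String>) -> anyhow::Result<()> {
    let client = OpenFigiClient::new().await?;
    let infos = client.map_isins_to_figi_infos(&isins).await?;
    for info in &infos {
        println!("{} -> {} ({} on {})", info.isin, info.figi, info.ticker, info.exch_code);
    }
    Ok(())
}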
|
||||||
|
|
||||||
|
/// Fetches and caches the list of valid securityType values.
|
||||||
|
///
|
||||||
|
/// # Arguments
|
||||||
|
/// * `client` - The OpenFIGI client instance.
|
||||||
|
/// * `cache_dir` - Directory to save the cached JSON file.
|
||||||
|
///
|
||||||
|
/// # Returns
|
||||||
|
/// Ok(()) on success.
|
||||||
|
///
|
||||||
|
/// # Errors
|
||||||
|
/// Returns an error if the API request fails or file I/O fails.
|
||||||
|
async fn get_figi_security_type(client: &OpenFigiClient, cache_dir: &Path) -> anyhow::Result<()> {
|
||||||
|
let cache_file = cache_dir.join("securityType.json");
|
||||||
|
|
||||||
|
if should_use_cache(&cache_file).await? {
|
||||||
|
logger::log_info(" Using cached securityType values").await;
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
|
||||||
|
logger::log_info(" Fetching securityType values from OpenFIGI API...").await;
|
||||||
|
|
||||||
|
let resp = client.client
|
||||||
|
.get("https://api.openfigi.com/v3/mapping/values/securityType")
|
||||||
|
.send()
|
||||||
|
.await
|
||||||
|
.context("Failed to fetch securityType values")?;
|
||||||
|
|
||||||
|
handle_rate_limit(&resp).await?;
|
||||||
|
|
||||||
|
let values: Value = resp.json().await
|
||||||
|
.context("Failed to parse securityType response")?;
|
||||||
|
|
||||||
|
let json_str = serde_json::to_string_pretty(&values)?;
|
||||||
|
tokio_fs::write(&cache_file, json_str).await
|
||||||
|
.context("Failed to write securityType cache")?;
|
||||||
|
|
||||||
|
logger::log_info(" ✓ Cached securityType values").await;
|
||||||
|
|
||||||
|
sleep(Duration::from_millis(if client.has_key { 240 } else { 2400 })).await;
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/// Loads all OpenFIGI mapping value lists (marketSecDes, micCode, securityType).
|
||||||
|
///
|
||||||
|
/// This function fetches the available values for each mapping parameter from the OpenFIGI API
|
||||||
|
/// and caches them as JSON files in `data/openfigi/`. If the files already exist and are recent
|
||||||
|
/// (less than 30 days old), they are reused instead of re-fetching.
|
||||||
|
///
|
||||||
|
/// # Returns
|
||||||
|
/// Ok(()) on success.
|
||||||
|
///
|
||||||
|
/// # Errors
|
||||||
|
/// Returns an error if API requests fail, JSON parsing fails, or file I/O fails.
|
||||||
|
pub async fn load_figi_type_lists(paths: &DataPaths) -> anyhow::Result<()> {
|
||||||
|
logger::log_info("Loading OpenFIGI mapping value lists...").await;
|
||||||
|
|
||||||
|
let cache_openfigi_dir = paths.cache_openfigi_dir();
|
||||||
|
tokio_fs::create_dir_all(cache_openfigi_dir).await
|
||||||
|
.context("Failed to create data/openfigi directory")?;
|
||||||
|
|
||||||
|
let client = OpenFigiClient::new().await?;
|
||||||
|
|
||||||
|
// Fetch each type list
|
||||||
|
get_figi_market_sec_des(&client, cache_openfigi_dir).await?;
|
||||||
|
get_figi_mic_code(&client, cache_openfigi_dir).await?;
|
||||||
|
get_figi_security_type(&client, cache_openfigi_dir).await?;
|
||||||
|
|
||||||
|
logger::log_info("OpenFIGI mapping value lists loaded successfully").await;
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Fetches and caches the list of valid marketSecDes values.
|
||||||
|
///
|
||||||
|
/// # Arguments
|
||||||
|
/// * `client` - The OpenFIGI client instance.
|
||||||
|
/// * `cache_dir` - Directory to save the cached JSON file.
|
||||||
|
///
|
||||||
|
/// # Returns
|
||||||
|
/// Ok(()) on success.
|
||||||
|
///
|
||||||
|
/// # Errors
|
||||||
|
/// Returns an error if the API request fails or file I/O fails.
|
||||||
|
async fn get_figi_market_sec_des(client: &OpenFigiClient, cache_dir: &Path) -> anyhow::Result<()> {
|
||||||
|
let cache_file = cache_dir.join("marketSecDes.json");
|
||||||
|
|
||||||
|
// Check if cache exists and is recent (< 30 days old)
|
||||||
|
if should_use_cache(&cache_file).await? {
|
||||||
|
logger::log_info(" Using cached marketSecDes values").await;
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
|
||||||
|
logger::log_info(" Fetching marketSecDes values from OpenFIGI API...").await;
|
||||||
|
|
||||||
|
let resp = client.client
|
||||||
|
.get("https://api.openfigi.com/v3/mapping/values/marketSecDes")
|
||||||
|
.send()
|
||||||
|
.await
|
||||||
|
.context("Failed to fetch marketSecDes values")?;
|
||||||
|
|
||||||
|
handle_rate_limit(&resp).await?;
|
||||||
|
|
||||||
|
let values: Value = resp.json().await
|
||||||
|
.context("Failed to parse marketSecDes response")?;
|
||||||
|
|
||||||
|
// Save to cache
|
||||||
|
let json_str = serde_json::to_string_pretty(&values)?;
|
||||||
|
tokio_fs::write(&cache_file, json_str).await
|
||||||
|
.context("Failed to write marketSecDes cache")?;
|
||||||
|
|
||||||
|
logger::log_info(" ✓ Cached marketSecDes values").await;
|
||||||
|
|
||||||
|
// Respect rate limits
|
||||||
|
sleep(Duration::from_millis(if client.has_key { 240 } else { 2400 })).await;
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Fetches and caches the list of valid micCode values.
|
||||||
|
///
|
||||||
|
/// # Arguments
|
||||||
|
/// * `client` - The OpenFIGI client instance.
|
||||||
|
/// * `cache_dir` - Directory to save the cached JSON file.
|
||||||
|
///
|
||||||
|
/// # Returns
|
||||||
|
/// Ok(()) on success.
|
||||||
|
///
|
||||||
|
/// # Errors
|
||||||
|
/// Returns an error if the API request fails or file I/O fails.
|
||||||
|
async fn get_figi_mic_code(client: &OpenFigiClient, cache_dir: &Path) -> anyhow::Result<()> {
|
||||||
|
let cache_file = cache_dir.join("micCode.json");
|
||||||
|
|
||||||
|
if should_use_cache(&cache_file).await? {
|
||||||
|
logger::log_info(" Using cached micCode values").await;
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
|
||||||
|
logger::log_info(" Fetching micCode values from OpenFIGI API...").await;
|
||||||
|
|
||||||
|
let resp = client.client
|
||||||
|
.get("https://api.openfigi.com/v3/mapping/values/micCode")
|
||||||
|
.send()
|
||||||
|
.await
|
||||||
|
.context("Failed to fetch micCode values")?;
|
||||||
|
|
||||||
|
handle_rate_limit(&resp).await?;
|
||||||
|
|
||||||
|
let values: Value = resp.json().await
|
||||||
|
.context("Failed to parse micCode response")?;
|
||||||
|
|
||||||
|
let json_str = serde_json::to_string_pretty(&values)?;
|
||||||
|
tokio_fs::write(&cache_file, json_str).await
|
||||||
|
.context("Failed to write micCode cache")?;
|
||||||
|
|
||||||
|
logger::log_info(" ✓ Cached micCode values").await;
|
||||||
|
|
||||||
|
sleep(Duration::from_millis(if client.has_key { 240 } else { 2400 })).await;
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Handles rate limit responses from the OpenFIGI API.
|
||||||
|
///
|
||||||
|
/// If a 429 status is received, this function sleeps for the duration specified
|
||||||
|
/// in the `ratelimit-reset` header (or 10 seconds by default).
|
||||||
|
///
|
||||||
|
/// # Arguments
|
||||||
|
/// * `resp` - The HTTP response to check.
|
||||||
|
///
|
||||||
|
/// # Returns
|
||||||
|
/// Ok(()) if no rate limit, or after waiting for the reset period.
|
||||||
|
///
|
||||||
|
/// # Errors
|
||||||
|
/// Returns an error if the response status indicates a non-rate-limit error.
|
||||||
|
async fn handle_rate_limit(resp: &reqwest::Response) -> anyhow::Result<()> {
|
||||||
|
let status = resp.status();
|
||||||
|
|
||||||
|
if status == 429 {
|
||||||
|
let headers = resp.headers();
|
||||||
|
let reset_sec = headers
|
||||||
|
.get("ratelimit-reset")
|
||||||
|
.and_then(|v| v.to_str().ok())
|
||||||
|
.and_then(|s| s.parse::<u64>().ok())
|
||||||
|
.unwrap_or(10);
|
||||||
|
|
||||||
|
logger::log_info(&format!(" Rate limited—waiting {}s", reset_sec)).await;
|
||||||
|
sleep(std::time::Duration::from_secs(reset_sec.max(10))).await;
|
||||||
|
|
||||||
|
return Err(anyhow!("Rate limited, please retry"));
|
||||||
|
} else if status.is_client_error() || status.is_server_error() {
|
||||||
|
return Err(anyhow!("OpenFIGI API error: {}", status));
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Checks if a cache file exists and is less than 30 days old.
|
||||||
|
///
|
||||||
|
/// # Arguments
|
||||||
|
/// * `path` - Path to the cache file.
|
||||||
|
///
|
||||||
|
/// # Returns
|
||||||
|
/// True if the cache should be used, false if it needs refreshing.
|
||||||
|
async fn should_use_cache(path: &Path) -> anyhow::Result<bool> {
|
||||||
|
if !path.exists() {
|
||||||
|
return Ok(false);
|
||||||
|
}
|
||||||
|
|
||||||
|
let metadata = tokio_fs::metadata(path).await?;
|
||||||
|
let modified = metadata.modified()?;
|
||||||
|
let age = modified.elapsed().unwrap_or(std::time::Duration::from_secs(u64::MAX));
|
||||||
|
|
||||||
|
// Cache is valid for 30 days
|
||||||
|
Ok(age < std::time::Duration::from_secs(30 * 24 * 60 * 60))
|
||||||
|
}
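// Usage sketch (assumed call site): refresh the cached mapping value lists once at startup;
// DataPaths::new(".") is an assumption about where the base directory lives.
async fn example_refresh_figi_lists() -> anyhow::Result<()> {
    let paths = DataPaths::new(".")?;
    load_figi_type_lists(&paths).await
}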
|
||||||
File diff suppressed because it is too large
Load Diff
1519
src/scraper/yahoo.rs
Normal file
File diff suppressed because it is too large
Load Diff
22
src/util.rs
@@ -1,22 +0,0 @@
// src/util.rs (or put it directly in main.rs if you prefer)
use tokio::fs;
use std::path::Path;

/// Create the required data folders if they do not exist yet.
pub async fn _ensure_data_dirs() -> anyhow::Result<()> {
    let dirs = [
        "economic_events",
        "economic_event_changes",
        "corporate_events",
        "corporate_prices",
        "data",
    ];
    for dir in dirs {
        let path = Path::new(dir);
        if !path.exists() {
            tokio::fs::create_dir_all(path).await?;
            println!("Created directory: {dir}");
        }
    }
    Ok(())
}
175
src/util/directories.rs
Normal file
@@ -0,0 +1,175 @@
|
|||||||
|
use std::path::{Path, PathBuf};
|
||||||
|
use std::fs;
|
||||||
|
|
||||||
|
/// Central configuration for all data paths
|
||||||
|
#[derive(Clone)]
|
||||||
|
pub struct DataPaths {
|
||||||
|
base_dir: PathBuf,
|
||||||
|
data_dir: PathBuf,
|
||||||
|
cache_dir: PathBuf,
|
||||||
|
logs_dir: PathBuf,
|
||||||
|
integrity_dir: PathBuf,
|
||||||
|
// Cache data subdirectories
|
||||||
|
cache_gleif_dir: PathBuf,
|
||||||
|
cache_openfigi_dir: PathBuf,
|
||||||
|
cache_gleif_openfigi_map_dir: PathBuf,
|
||||||
|
cache_openvpn_dir: PathBuf,
|
||||||
|
// Figi Securities data subdirectories
|
||||||
|
figi_securities_dir: PathBuf,
|
||||||
|
// Economic data subdirectories
|
||||||
|
economic_events_dir: PathBuf,
|
||||||
|
economic_changes_dir: PathBuf,
|
||||||
|
economic_currency_dir: PathBuf,
|
||||||
|
// Corporate data subdirectories
|
||||||
|
corporate_dir: PathBuf,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl DataPaths {
|
||||||
|
/// Initialize paths from a base directory
|
||||||
|
pub fn new(base_dir: impl AsRef<Path>) -> std::io::Result<Self> {
|
||||||
|
let base_dir = base_dir.as_ref().to_path_buf();
|
||||||
|
|
||||||
|
let data_dir = base_dir.join("data");
|
||||||
|
let cache_dir = base_dir.join("cache");
|
||||||
|
let logs_dir = base_dir.join("logs");
|
||||||
|
let integrity_dir = base_dir.join("integrity");
|
||||||
|
|
||||||
|
// Cache subdirectories
|
||||||
|
let cache_gleif_dir = cache_dir.join("gleif");
|
||||||
|
let cache_openfigi_dir = cache_dir.join("openfigi");
|
||||||
|
let cache_gleif_openfigi_map_dir = cache_dir.join("glei_openfigi");
|
||||||
|
let cache_openvpn_dir = cache_dir.join("openvpn");
|
||||||
|
|
||||||
|
// Figi Securities subdirectories
|
||||||
|
let figi_securities_dir = data_dir.join("figi_securities");
|
||||||
|
|
||||||
|
// Economic subdirectories
|
||||||
|
let economic_events_dir = data_dir.join("economic").join("events");
|
||||||
|
let economic_changes_dir = economic_events_dir.join("changes");
|
||||||
|
let economic_currency_dir = data_dir.join("economic").join("currency");
|
||||||
|
|
||||||
|
// Corporate subdirectories
|
||||||
|
let corporate_dir = data_dir.join("corporate");
|
||||||
|
|
||||||
|
// Create all directories if they don't exist
|
||||||
|
fs::create_dir_all(&data_dir)?;
|
||||||
|
fs::create_dir_all(&cache_dir)?;
|
||||||
|
fs::create_dir_all(&logs_dir)?;
|
||||||
|
fs::create_dir_all(&integrity_dir)?;
|
||||||
|
fs::create_dir_all(&cache_gleif_dir)?;
|
||||||
|
fs::create_dir_all(&cache_openfigi_dir)?;
|
||||||
|
fs::create_dir_all(&cache_gleif_openfigi_map_dir)?;
|
||||||
|
fs::create_dir_all(&cache_openvpn_dir)?;
|
||||||
|
fs::create_dir_all(&figi_securities_dir)?;
|
||||||
|
fs::create_dir_all(&economic_events_dir)?;
|
||||||
|
fs::create_dir_all(&economic_changes_dir)?;
|
||||||
|
fs::create_dir_all(&economic_currency_dir)?;
|
||||||
|
fs::create_dir_all(&corporate_dir)?;
|
||||||
|
|
||||||
|
Ok(Self {
|
||||||
|
base_dir,
|
||||||
|
data_dir,
|
||||||
|
cache_dir,
|
||||||
|
logs_dir,
|
||||||
|
integrity_dir,
|
||||||
|
cache_gleif_dir,
|
||||||
|
cache_openfigi_dir,
|
||||||
|
cache_gleif_openfigi_map_dir,
|
||||||
|
cache_openvpn_dir,
|
||||||
|
figi_securities_dir,
|
||||||
|
economic_events_dir,
|
||||||
|
economic_changes_dir,
|
||||||
|
economic_currency_dir,
|
||||||
|
corporate_dir,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn base_dir(&self) -> &Path {
|
||||||
|
&self.base_dir
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn data_dir(&self) -> &Path {
|
||||||
|
&self.data_dir
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn cache_dir(&self) -> &Path {
|
||||||
|
&self.cache_dir
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn integrity_dir(&self) -> &Path {
|
||||||
|
&self.integrity_dir
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn logs_dir(&self) -> &Path {
|
||||||
|
&self.logs_dir
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn cache_gleif_dir(&self) -> &Path {
|
||||||
|
&self.cache_gleif_dir
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn cache_openfigi_dir(&self) -> &Path {
|
||||||
|
&self.cache_openfigi_dir
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn cache_gleif_openfigi_map_dir(&self) -> &Path {
|
||||||
|
&self.cache_gleif_openfigi_map_dir
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn cache_openvpn_dir(&self) -> &Path {
|
||||||
|
&self.cache_openvpn_dir
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn figi_securities_dir(&self) -> &Path {
|
||||||
|
&self.figi_securities_dir
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get the economic events directory
|
||||||
|
pub fn economic_events_dir(&self) -> &Path {
|
||||||
|
&self.economic_events_dir
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get the economic changes directory
|
||||||
|
pub fn economic_changes_dir(&self) -> &Path {
|
||||||
|
&self.economic_changes_dir
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn economic_currency_dir(&self) -> &Path {
|
||||||
|
&self.economic_currency_dir
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get the corporate events directory
|
||||||
|
pub fn corporate_dir(&self) -> &Path {
|
||||||
|
&self.corporate_dir
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get a specific file path within data directory
|
||||||
|
pub fn data_file(&self, filename: &str) -> PathBuf {
|
||||||
|
self.data_dir.join(filename)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get a specific file path within cache directory
|
||||||
|
pub fn cache_file(&self, filename: &str) -> PathBuf {
|
||||||
|
self.cache_dir.join(filename)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get a specific file path within logs directory
|
||||||
|
pub fn log_file(&self, filename: &str) -> PathBuf {
|
||||||
|
self.logs_dir.join(filename)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
mod tests {
|
||||||
|
use super::*;
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_paths_creation() {
|
||||||
|
let paths = DataPaths::new("./test_base").unwrap();
|
||||||
|
assert!(paths.data_dir().exists());
|
||||||
|
assert!(paths.cache_dir().exists());
|
||||||
|
assert!(paths.logs_dir().exists());
|
||||||
|
assert!(paths.economic_events_dir().exists());
|
||||||
|
assert!(paths.economic_changes_dir().exists());
|
||||||
|
}
|
||||||
|
}
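// Usage sketch (assumed call site): build the path layout once and derive concrete file paths
// from it; the filenames below are illustrative only.
fn example_paths() -> std::io::Result<()> {
    let paths = DataPaths::new(".")?;
    let security_types = paths.cache_openfigi_dir().join("securityType.json");
    let log_path = paths.log_file("scraper.log");
    println!("cache: {}  log: {}", security_types.display(), log_path.display());
    Ok(())
}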
|
||||||
911
src/util/integrity.rs
Normal file
@@ -0,0 +1,911 @@
|
|||||||
|
// src/util/integrity.rs
|
||||||
|
//! Content integrity and state lifecycle management module
|
||||||
|
//!
|
||||||
|
//! Features:
|
||||||
|
//! - File and directory hashing (SHA-256)
|
||||||
|
//! - Hash validation against content references
|
||||||
|
//! - State invalidation based on time or validation failures
|
||||||
|
//! - 3-stage data lifecycle: cache → data → storage
|
||||||
|
//! - Inline vs. external hash storage based on size
|
||||||
|
//! - Centralized dependency configuration (Single Source of Truth)
|
||||||
|
//! - Support for checkpoint groups and hierarchies
|
||||||
|
//! - Automatic transitive dependency resolution
|
||||||
|
//! - Cycle detection in dependency graph
|
||||||
|
|
||||||
|
use anyhow::{Context, Result, bail};
|
||||||
|
use chrono::{DateTime, Duration, Utc};
|
||||||
|
use serde::{Deserialize, Serialize};
|
||||||
|
use sha2::{Digest, Sha256};
|
||||||
|
use std::collections::{HashMap, HashSet};
|
||||||
|
use std::fs;
|
||||||
|
use std::io::{BufReader, Read};
|
||||||
|
use std::path::{Path, PathBuf};
|
||||||
|
use tokio::fs as async_fs;
|
||||||
|
use tokio::io::AsyncWriteExt;
|
||||||
|
|
||||||
|
// ============================================================================
|
||||||
|
// CONSTANTS
|
||||||
|
// ============================================================================
|
||||||
|
|
||||||
|
const INLINE_HASH_THRESHOLD: usize = 1024;
|
||||||
|
const HASH_STORAGE_DIR: &str = ".integrity_hashes";
|
||||||
|
const HASH_FILE_EXT: &str = ".hash";
|
||||||
|
const DEFAULT_DEPENDENCY_CONFIG: &str = "checkpoint_dependencies.toml";
|
||||||
|
|
||||||
|
// ============================================================================
|
||||||
|
// DEPENDENCY CONFIGURATION
|
||||||
|
// ============================================================================
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
|
||||||
|
pub struct DependencyConfig {
|
||||||
|
#[serde(default)]
|
||||||
|
pub checkpoints: HashMap<String, CheckpointConfig>,
|
||||||
|
#[serde(default)]
|
||||||
|
pub groups: HashMap<String, GroupConfig>,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
|
pub struct CheckpointConfig {
|
||||||
|
#[serde(default)]
|
||||||
|
pub description: String,
|
||||||
|
#[serde(default)]
|
||||||
|
pub depends_on: Vec<String>,
|
||||||
|
#[serde(default)]
|
||||||
|
pub group: Option<String>,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
|
pub struct GroupConfig {
|
||||||
|
#[serde(default)]
|
||||||
|
pub description: String,
|
||||||
|
pub members: Vec<String>,
|
||||||
|
#[serde(default)]
|
||||||
|
pub depends_on: Vec<String>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl DependencyConfig {
|
||||||
|
/// Load from file or return empty config
|
||||||
|
pub async fn from_file<P: AsRef<Path>>(path: P) -> Result<Self> {
|
||||||
|
let path = path.as_ref();
|
||||||
|
if !path.exists() {
|
||||||
|
return Ok(Self::default());
|
||||||
|
}
|
||||||
|
|
||||||
|
let content = async_fs::read_to_string(path).await
|
||||||
|
.with_context(|| format!("Failed to read: {}", path.display()))?;
|
||||||
|
|
||||||
|
let config: Self = toml::from_str(&content)
|
||||||
|
.context("Failed to parse dependency config")?;
|
||||||
|
|
||||||
|
config.validate()?;
|
||||||
|
Ok(config)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Validate configuration (checks for cycles and invalid references)
|
||||||
|
pub fn validate(&self) -> Result<()> {
|
||||||
|
// Check for cycles
|
||||||
|
for checkpoint in self.checkpoints.keys() {
|
||||||
|
self.detect_cycle(checkpoint)?;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Validate group memberships
|
||||||
|
for (group_name, group) in &self.groups {
|
||||||
|
for member in &group.members {
|
||||||
|
if !self.checkpoints.contains_key(member) {
|
||||||
|
bail!("Group '{}' references unknown checkpoint: {}", group_name, member);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Validate checkpoint group declarations
|
||||||
|
for (checkpoint_name, checkpoint) in &self.checkpoints {
|
||||||
|
if let Some(group_name) = &checkpoint.group {
|
||||||
|
let group = self.groups.get(group_name)
|
||||||
|
.ok_or_else(|| anyhow::anyhow!("Checkpoint '{}' references unknown group: {}", checkpoint_name, group_name))?;
|
||||||
|
|
||||||
|
if !group.members.contains(checkpoint_name) {
|
||||||
|
bail!("Checkpoint '{}' claims group '{}' but group doesn't list it",
|
||||||
|
checkpoint_name, group_name);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Detect cycles using DFS
|
||||||
|
fn detect_cycle(&self, start: &str) -> Result<()> {
|
||||||
|
let mut visited = HashSet::new();
|
||||||
|
let mut stack = HashSet::new();
|
||||||
|
self.dfs_cycle_check(start, &mut visited, &mut stack)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn dfs_cycle_check(&self, node: &str, visited: &mut HashSet<String>, stack: &mut HashSet<String>) -> Result<()> {
|
||||||
|
if stack.contains(node) {
|
||||||
|
bail!("Cycle detected at checkpoint: {}", node);
|
||||||
|
}
|
||||||
|
if visited.contains(node) {
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
|
||||||
|
visited.insert(node.to_string());
|
||||||
|
stack.insert(node.to_string());
|
||||||
|
|
||||||
|
if let Some(config) = self.checkpoints.get(node) {
|
||||||
|
for dep in &config.depends_on {
|
||||||
|
self.dfs_cycle_check(dep, visited, stack)?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
stack.remove(node);
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get all dependencies (including transitive and group dependencies)
|
||||||
|
pub fn get_all_dependencies(&self, checkpoint: &str) -> Result<Vec<String>> {
|
||||||
|
let mut deps = Vec::new();
|
||||||
|
let mut visited = HashSet::new();
|
||||||
|
self.collect_deps(checkpoint, &mut deps, &mut visited)?;
|
||||||
|
|
||||||
|
// Remove duplicates while preserving order
|
||||||
|
let mut seen = HashSet::new();
|
||||||
|
deps.retain(|d| seen.insert(d.clone()));
|
||||||
|
|
||||||
|
Ok(deps)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn collect_deps(&self, node: &str, deps: &mut Vec<String>, visited: &mut HashSet<String>) -> Result<()> {
|
||||||
|
if visited.contains(node) {
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
visited.insert(node.to_string());
|
||||||
|
|
||||||
|
let config = self.checkpoints.get(node)
|
||||||
|
.ok_or_else(|| anyhow::anyhow!("Unknown checkpoint: {}", node))?;
|
||||||
|
|
||||||
|
// Add group dependencies first
|
||||||
|
if let Some(group_name) = &config.group {
|
||||||
|
if let Some(group) = self.groups.get(group_name) {
|
||||||
|
for dep in &group.depends_on {
|
||||||
|
if !visited.contains(dep) {
|
||||||
|
deps.push(dep.clone());
|
||||||
|
self.collect_deps(dep, deps, visited)?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Add direct dependencies
|
||||||
|
for dep in &config.depends_on {
|
||||||
|
if !visited.contains(dep) {
|
||||||
|
deps.push(dep.clone());
|
||||||
|
self.collect_deps(dep, deps, visited)?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Generate DOT format for visualization
|
||||||
|
pub fn to_dot(&self) -> String {
|
||||||
|
let mut dot = String::from("digraph Dependencies {\n rankdir=LR;\n node [shape=box];\n\n");
|
||||||
|
|
||||||
|
// Nodes
|
||||||
|
for (name, config) in &self.checkpoints {
|
||||||
|
let label = if config.description.is_empty() {
|
||||||
|
name.clone()
|
||||||
|
} else {
|
||||||
|
format!("{}\\n{}", name, config.description)
|
||||||
|
};
|
||||||
|
dot.push_str(&format!(" \"{}\" [label=\"{}\"];\n", name, label));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Edges
|
||||||
|
dot.push_str("\n");
|
||||||
|
for (name, config) in &self.checkpoints {
|
||||||
|
// Group dependencies
|
||||||
|
if let Some(group_name) = &config.group {
|
||||||
|
if let Some(group) = self.groups.get(group_name) {
|
||||||
|
for dep in &group.depends_on {
|
||||||
|
dot.push_str(&format!(" \"{}\" -> \"{}\" [label=\"via {}\"];\n", name, dep, group_name));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Direct dependencies
|
||||||
|
for dep in &config.depends_on {
|
||||||
|
dot.push_str(&format!(" \"{}\" -> \"{}\";\n", name, dep));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
dot.push_str("}\n");
|
||||||
|
dot
|
||||||
|
}
|
||||||
|
}
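// Sketch (assumed TOML shape, derived from the serde derives above): a minimal
// checkpoint_dependencies.toml and how it is loaded and queried. The checkpoint
// names are illustrative only.
//
// [checkpoints.gleif_download]
// description = "Download GLEIF CSV"
//
// [checkpoints.figi_mapping]
// description = "Map ISINs to FIGIs"
// depends_on = ["gleif_download"]
async fn example_load_dependency_config() -> Result<()> {
    let config = DependencyConfig::from_file("checkpoint_dependencies.toml").await?;
    let deps = config.get_all_dependencies("figi_mapping")?;
    println!("figi_mapping depends on: {:?}", deps);
    Ok(())
}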
|
||||||
|
|
||||||
|
// ============================================================================
|
||||||
|
// DATA STRUCTURES
|
||||||
|
// ============================================================================
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
|
||||||
|
#[serde(tag = "type", rename_all = "lowercase")]
|
||||||
|
pub enum ContentReference {
|
||||||
|
File { path: PathBuf },
|
||||||
|
Directory {
|
||||||
|
path: PathBuf,
|
||||||
|
include_patterns: Option<Vec<String>>,
|
||||||
|
exclude_patterns: Option<Vec<String>>,
|
||||||
|
},
|
||||||
|
Composite { references: Vec<ContentReference> },
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
|
||||||
|
#[serde(tag = "storage", rename_all = "lowercase")]
|
||||||
|
pub enum HashStorage {
|
||||||
|
Inline { hash: String },
|
||||||
|
External { hash_file: PathBuf },
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Hash)]
|
||||||
|
#[serde(rename_all = "lowercase")]
|
||||||
|
pub enum DataStage {
|
||||||
|
Cache,
|
||||||
|
Data,
|
||||||
|
Storage,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl DataStage {
|
||||||
|
pub fn default_ttl(&self) -> Duration {
|
||||||
|
match self {
|
||||||
|
Self::Cache => Duration::hours(24),
|
||||||
|
Self::Data => Duration::days(7),
|
||||||
|
Self::Storage => Duration::days(365),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn revalidation_interval(&self) -> Duration {
|
||||||
|
match self {
|
||||||
|
Self::Cache => Duration::hours(6),
|
||||||
|
Self::Data => Duration::days(1),
|
||||||
|
Self::Storage => Duration::days(30),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
|
pub struct StateEntry {
|
||||||
|
pub step_name: String,
|
||||||
|
pub completed: bool,
|
||||||
|
pub completed_at: Option<DateTime<Utc>>,
|
||||||
|
pub content_reference: Option<ContentReference>,
|
||||||
|
pub content_hash: Option<HashStorage>,
|
||||||
|
pub data_stage: Option<DataStage>,
|
||||||
|
pub ttl_override: Option<Duration>,
|
||||||
|
pub last_validated_at: Option<DateTime<Utc>>,
|
||||||
|
pub validation_status: ValidationStatus,
|
||||||
|
#[serde(default)]
|
||||||
|
pub dependencies: Vec<String>,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
|
||||||
|
#[serde(rename_all = "lowercase")]
|
||||||
|
pub enum ValidationStatus {
|
||||||
|
Unknown,
|
||||||
|
Valid,
|
||||||
|
Invalid { reason: String },
|
||||||
|
Expired,
|
||||||
|
DependencyFailed { failed_dependency: String },
|
||||||
|
}
|
||||||
|
|
||||||
|
// ============================================================================
|
||||||
|
// HASH COMPUTATION
|
||||||
|
// ============================================================================
|
||||||
|
|
||||||
|
/// Hash a single file using SHA-256
|
||||||
|
pub fn hash_file<P: AsRef<Path>>(path: P) -> Result<String> {
|
||||||
|
let path = path.as_ref();
|
||||||
|
let file = fs::File::open(path)
|
||||||
|
.with_context(|| format!("Failed to open: {}", path.display()))?;
|
||||||
|
|
||||||
|
let mut reader = BufReader::new(file);
|
||||||
|
let mut hasher = Sha256::new();
|
||||||
|
let mut buffer = [0u8; 8192];
|
||||||
|
|
||||||
|
loop {
|
||||||
|
let bytes_read = reader.read(&mut buffer)?;
|
||||||
|
if bytes_read == 0 { break; }
|
||||||
|
hasher.update(&buffer[..bytes_read]);
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(format!("{:x}", hasher.finalize()))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Hash a directory recursively
|
||||||
|
pub fn hash_directory<P: AsRef<Path>>(
|
||||||
|
path: P,
|
||||||
|
include_patterns: Option<&[String]>,
|
||||||
|
exclude_patterns: Option<&[String]>,
|
||||||
|
) -> Result<String> {
|
||||||
|
let path = path.as_ref();
|
||||||
|
if !path.is_dir() {
|
||||||
|
bail!("Not a directory: {}", path.display());
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut files = Vec::new();
|
||||||
|
collect_files_recursive(path, &mut files, include_patterns, exclude_patterns)?;
|
||||||
|
files.sort();
|
||||||
|
|
||||||
|
    if files.is_empty() {
        // No files matched: return the SHA-256 of empty input so the digest stays consistent with hash_file
        return Ok(format!("{:x}", Sha256::new().finalize()));
    }
|
||||||
|
|
||||||
|
let mut hasher = Sha256::new();
|
||||||
|
for file_path in files {
|
||||||
|
let rel_path = file_path.strip_prefix(path)
|
||||||
|
.unwrap_or(&file_path)
|
||||||
|
.to_string_lossy();
|
||||||
|
hasher.update(rel_path.as_bytes());
|
||||||
|
hasher.update(hash_file(&file_path)?.as_bytes());
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(format!("{:x}", hasher.finalize()))
|
||||||
|
}
|
||||||
|
|
||||||
|
fn collect_files_recursive(
|
||||||
|
dir: &Path,
|
||||||
|
files: &mut Vec<PathBuf>,
|
||||||
|
include: Option<&[String]>,
|
||||||
|
exclude: Option<&[String]>,
|
||||||
|
) -> Result<()> {
|
||||||
|
for entry in fs::read_dir(dir)? {
|
||||||
|
let path = entry?.path();
|
||||||
|
|
||||||
|
// Skip hidden files
|
||||||
|
if path.file_name()
|
||||||
|
.and_then(|n| n.to_str())
|
||||||
|
.map_or(false, |n| n.starts_with('.')) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if path.is_dir() {
|
||||||
|
collect_files_recursive(&path, files, include, exclude)?;
|
||||||
|
} else if path.is_file() && should_include(&path, include, exclude) {
|
||||||
|
files.push(path);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn should_include(path: &Path, include: Option<&[String]>, exclude: Option<&[String]>) -> bool {
|
||||||
|
let path_str = path.to_string_lossy();
|
||||||
|
|
||||||
|
// Check exclusions first
|
||||||
|
if let Some(patterns) = exclude {
|
||||||
|
if patterns.iter().any(|p| glob_match(&path_str, p)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check inclusions
|
||||||
|
match include {
|
||||||
|
Some(patterns) => patterns.iter().any(|p| glob_match(&path_str, p)),
|
||||||
|
None => true,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn glob_match(path: &str, pattern: &str) -> bool {
|
||||||
|
if pattern.contains('*') {
|
||||||
|
let parts: Vec<&str> = pattern.split('*').collect();
|
||||||
|
if parts.len() == 2 {
|
||||||
|
path.contains(parts[0]) && path.ends_with(parts[1])
|
||||||
|
} else {
|
||||||
|
false
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
path.ends_with(pattern)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Hash content based on reference type
|
||||||
|
pub fn hash_content_reference(reference: &ContentReference) -> Result<String> {
|
||||||
|
match reference {
|
||||||
|
ContentReference::File { path } => hash_file(path),
|
||||||
|
ContentReference::Directory { path, include_patterns, exclude_patterns } => {
|
||||||
|
hash_directory(path, include_patterns.as_deref(), exclude_patterns.as_deref())
|
||||||
|
}
|
||||||
|
ContentReference::Composite { references } => {
|
||||||
|
let mut hasher = Sha256::new();
|
||||||
|
for ref_item in references {
|
||||||
|
hasher.update(hash_content_reference(ref_item)?.as_bytes());
|
||||||
|
}
|
||||||
|
Ok(format!("{:x}", hasher.finalize()))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
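// Usage sketch: combine a single file and a filtered directory into one composite reference
// and hash it; the paths are illustrative, and the helper constructors are defined further
// down in this module.
fn example_composite_hash() -> Result<String> {
    let reference = composite_reference(vec![
        file_reference("data/corporate/events.jsonl"),
        directory_reference("data/economic/events", Some(vec!["*.jsonl".to_string()]), None),
    ]);
    hash_content_reference(&reference)
}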
|
||||||
|
|
||||||
|
// ============================================================================
|
||||||
|
// HASH STORAGE
|
||||||
|
// ============================================================================
|
||||||
|
|
||||||
|
fn determine_storage(hash: &str, base_dir: &Path) -> HashStorage {
|
||||||
|
if hash.len() > INLINE_HASH_THRESHOLD {
|
||||||
|
let hash_dir = base_dir.join(HASH_STORAGE_DIR);
|
||||||
|
let hash_file = hash_dir.join(format!("{}{}", &hash[..16], HASH_FILE_EXT));
|
||||||
|
HashStorage::External { hash_file }
|
||||||
|
} else {
|
||||||
|
HashStorage::Inline { hash: hash.to_string() }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn store_hash(hash: &str, storage: &HashStorage) -> Result<()> {
|
||||||
|
if let HashStorage::External { hash_file } = storage {
|
||||||
|
if let Some(parent) = hash_file.parent() {
|
||||||
|
async_fs::create_dir_all(parent).await?;
|
||||||
|
}
|
||||||
|
async_fs::write(hash_file, hash.as_bytes()).await?;
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn load_hash(storage: &HashStorage) -> Result<String> {
|
||||||
|
match storage {
|
||||||
|
HashStorage::Inline { hash } => Ok(hash.clone()),
|
||||||
|
HashStorage::External { hash_file } => {
|
||||||
|
Ok(async_fs::read_to_string(hash_file).await?.trim().to_string())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ============================================================================
|
||||||
|
// VALIDATION
|
||||||
|
// ============================================================================
|
||||||
|
|
||||||
|
/// Validate a single state entry
|
||||||
|
async fn validate_entry(entry: &StateEntry) -> Result<ValidationStatus> {
|
||||||
|
// Check if completed
|
||||||
|
if !entry.completed {
|
||||||
|
return Ok(ValidationStatus::Unknown);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Get content reference and hash
|
||||||
|
let (content_ref, hash_storage) = match (&entry.content_reference, &entry.content_hash) {
|
||||||
|
(Some(r), Some(h)) => (r, h),
|
||||||
|
_ => return Ok(ValidationStatus::Unknown),
|
||||||
|
};
|
||||||
|
|
||||||
|
// Load stored hash
|
||||||
|
let stored_hash = load_hash(hash_storage).await?;
|
||||||
|
|
||||||
|
// Compute current hash
|
||||||
|
let current_hash = match hash_content_reference(content_ref) {
|
||||||
|
Ok(h) => h,
|
||||||
|
Err(e) => return Ok(ValidationStatus::Invalid {
|
||||||
|
reason: format!("Failed to compute hash: {}", e)
|
||||||
|
}),
|
||||||
|
};
|
||||||
|
|
||||||
|
// Check hash match
|
||||||
|
if stored_hash != current_hash {
|
||||||
|
return Ok(ValidationStatus::Invalid { reason: "Hash mismatch".to_string() });
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check TTL
|
||||||
|
if let Some(stage) = entry.data_stage {
|
||||||
|
let ttl = entry.ttl_override.unwrap_or_else(|| stage.default_ttl());
|
||||||
|
if let Some(completed_at) = entry.completed_at {
|
||||||
|
if Utc::now() - completed_at > ttl {
|
||||||
|
return Ok(ValidationStatus::Expired);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(ValidationStatus::Valid)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Validate all entries with cascade invalidation
|
||||||
|
async fn validate_all_entries(entries: &mut HashMap<String, StateEntry>) -> Result<ValidationReport> {
|
||||||
|
let mut report = ValidationReport::default();
|
||||||
|
|
||||||
|
// Validate each entry
|
||||||
|
for (name, entry) in entries.iter_mut() {
|
||||||
|
let status = validate_entry(entry).await?;
|
||||||
|
entry.validation_status = status.clone();
|
||||||
|
entry.last_validated_at = Some(Utc::now());
|
||||||
|
|
||||||
|
match status {
|
||||||
|
ValidationStatus::Valid => report.valid_count += 1,
|
||||||
|
ValidationStatus::Invalid { .. } => {
|
||||||
|
report.invalid_count += 1;
|
||||||
|
report.invalid_entries.push(name.clone());
|
||||||
|
}
|
||||||
|
ValidationStatus::Expired => {
|
||||||
|
report.expired_count += 1;
|
||||||
|
report.expired_entries.push(name.clone());
|
||||||
|
}
|
||||||
|
ValidationStatus::Unknown => report.unknown_count += 1,
|
||||||
|
ValidationStatus::DependencyFailed { .. } => {}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Cascade invalidation
|
||||||
|
let mut invalidated: HashSet<String> = report.invalid_entries.iter().cloned().collect();
|
||||||
|
|
||||||
|
loop {
|
||||||
|
let mut newly_invalidated = Vec::new();
|
||||||
|
|
||||||
|
for (name, entry) in entries.iter() {
|
||||||
|
if invalidated.contains(name) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check if any dependency is invalidated
|
||||||
|
if let Some(failed_dep) = entry.dependencies.iter().find(|d| invalidated.contains(*d)) {
|
||||||
|
newly_invalidated.push((name.clone(), failed_dep.clone()));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if newly_invalidated.is_empty() {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (name, failed_dep) in newly_invalidated {
|
||||||
|
invalidated.insert(name.clone());
|
||||||
|
report.cascaded_invalidations.push(name.clone());
|
||||||
|
|
||||||
|
if let Some(entry) = entries.get_mut(&name) {
|
||||||
|
entry.validation_status = ValidationStatus::DependencyFailed { failed_dependency: failed_dep };
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(report)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Default)]
|
||||||
|
pub struct ValidationReport {
|
||||||
|
pub valid_count: usize,
|
||||||
|
pub invalid_count: usize,
|
||||||
|
pub expired_count: usize,
|
||||||
|
pub unknown_count: usize,
|
||||||
|
pub invalid_entries: Vec<String>,
|
||||||
|
pub expired_entries: Vec<String>,
|
||||||
|
pub cascaded_invalidations: Vec<String>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl ValidationReport {
|
||||||
|
pub fn print_summary(&self) {
|
||||||
|
println!("=== Validation Report ===");
|
||||||
|
println!("Valid: {}", self.valid_count);
|
||||||
|
println!("Invalid: {}", self.invalid_count);
|
||||||
|
println!("Expired: {}", self.expired_count);
|
||||||
|
println!("Unknown: {}", self.unknown_count);
|
||||||
|
|
||||||
|
if !self.invalid_entries.is_empty() {
|
||||||
|
println!("\nInvalid entries:");
|
||||||
|
for entry in &self.invalid_entries {
|
||||||
|
println!(" - {}", entry);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if !self.expired_entries.is_empty() {
|
||||||
|
println!("\nExpired entries:");
|
||||||
|
for entry in &self.expired_entries {
|
||||||
|
println!(" - {}", entry);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if !self.cascaded_invalidations.is_empty() {
|
||||||
|
println!("\nCascaded invalidations:");
|
||||||
|
for entry in &self.cascaded_invalidations {
|
||||||
|
println!(" - {}", entry);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
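// Usage sketch (assumed call site): run a full validation pass and print the outcome.
// StateManager::validate_all() is defined later in this module.
async fn example_validate(manager: &StateManager) -> Result<()> {
    let report = manager.validate_all().await?;
    report.print_summary();
    // Steps reported as invalid or expired (plus anything listed under cascaded_invalidations)
    // should be re-run by the caller.
    Ok(())
}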
|
||||||
|
|
||||||
|
// ============================================================================
|
||||||
|
// STATE MANAGEMENT
|
||||||
|
// ============================================================================
|
||||||
|
|
||||||
|
/// State manager with centralized dependency configuration
|
||||||
|
///
|
||||||
|
/// # Orchestration: Shutdown Flag + State Management
|
||||||
|
///
|
||||||
|
/// ## Happy Path (Normal Completion)
|
||||||
|
/// 1. Work completes successfully
|
||||||
|
/// 2. Call `mark_valid()`, which sets `completed: true` and stores the content hash
|
||||||
|
/// 3. StateEntry saved with timestamp and valid hash
|
||||||
|
/// 4. On next run: skips already-completed step
|
||||||
|
///
|
||||||
|
/// ## Shutdown Path (Interrupted Work)
|
||||||
|
/// 1. Shutdown flag is set via Ctrl+C handler
|
||||||
|
/// 2. Long-running code checks: `if shutdown_flag.load(Ordering::SeqCst) { break }`
|
||||||
|
/// 3. Before returning, call `mark_invalid()`
|
||||||
|
/// 4. StateEntry saved with `completed: false` and ValidationStatus::Invalid
|
||||||
|
/// 5. On next run: retries invalid step
|
||||||
|
///
|
||||||
|
/// ## Usage Pattern
|
||||||
|
///
|
||||||
|
/// ```rust
/// let manager = StateManager::new(&paths.integrity_dir()).await?;
/// let content_ref = directory_reference(&output_dir, None, None);
/// let entry = manager.create_entry(step_name.to_string(), content_ref, DataStage::Data).await?;
///
/// loop {
///     if shutdown_flag.load(Ordering::SeqCst) {
///         manager.mark_invalid(entry, "invalid due to shutdown".to_string()).await?;
///         return Ok(());
///     }
///     // Do work...
/// }
///
/// // Completed successfully
/// manager.mark_valid(entry).await?;
/// ```
pub struct StateManager {
|
||||||
|
base_dir: PathBuf,
|
||||||
|
dependency_config: DependencyConfig,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl StateManager {
|
||||||
|
/// Create new state manager and load dependency configuration
|
||||||
|
pub async fn new<P: AsRef<Path>>(base_dir: P) -> Result<Self> {
|
||||||
|
let base_dir = base_dir.as_ref().to_path_buf();
|
||||||
|
let config_path = base_dir.join(DEFAULT_DEPENDENCY_CONFIG);
|
||||||
|
let dependency_config = DependencyConfig::from_file(config_path).await?;
|
||||||
|
|
||||||
|
Ok(Self { base_dir, dependency_config })
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Create with explicit dependency configuration
|
||||||
|
pub fn with_config<P: AsRef<Path>>(base_dir: P, dependency_config: DependencyConfig) -> Result<Self> {
|
||||||
|
dependency_config.validate()?;
|
||||||
|
Ok(Self {
|
||||||
|
base_dir: base_dir.as_ref().to_path_buf(),
|
||||||
|
dependency_config,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get the dependency configuration
|
||||||
|
pub fn get_dependency_config(&self) -> &DependencyConfig {
|
||||||
|
&self.dependency_config
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Load all state entries from state.jsonl
|
||||||
|
pub async fn load_entries(&self) -> Result<HashMap<String, StateEntry>> {
|
||||||
|
let state_file = self.base_dir.join("state.jsonl");
|
||||||
|
if !state_file.exists() {
|
||||||
|
return Ok(HashMap::new());
|
||||||
|
}
|
||||||
|
|
||||||
|
let content = async_fs::read_to_string(&state_file).await?;
|
||||||
|
let mut entries = HashMap::new();
|
||||||
|
|
||||||
|
for line in content.lines() {
|
||||||
|
if line.trim().is_empty() {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if let Ok(entry) = serde_json::from_str::<StateEntry>(line) {
|
||||||
|
entries.insert(entry.step_name.clone(), entry);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(entries)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Save all state entries to state.jsonl
|
||||||
|
pub async fn save_entries(&self, entries: &HashMap<String, StateEntry>) -> Result<()> {
|
||||||
|
        async_fs::create_dir_all(&self.base_dir).await?;
|
||||||
|
|
||||||
|
let mut file = async_fs::File::create(self.base_dir.join("state.jsonl")).await?;
|
||||||
|
|
||||||
|
for entry in entries.values() {
|
||||||
|
file.write_all((serde_json::to_string(&entry)? + "\n").as_bytes()).await?;
|
||||||
|
}
|
||||||
|
|
||||||
|
file.sync_all().await?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Create an empty entry for a step (can be updated later)
|
||||||
|
///
|
||||||
|
/// Creates a placeholder entry that marks the step as incomplete and unknown,
|
||||||
|
/// allowing you to later mark it as valid or invalid via `mark_valid()` or `mark_invalid()`.
|
||||||
|
///
|
||||||
|
/// # Example
|
||||||
|
/// ```rust
|
||||||
|
/// let manager = StateManager::new(&paths.integrity_dir()).await?;
|
||||||
|
///
|
||||||
|
/// // Start tracking a long step
|
||||||
|
/// let mut entry = manager.create_entry("long_operation".to_string()).await?;
|
||||||
|
///
|
||||||
|
/// // Do work...
|
||||||
|
///
|
||||||
|
/// // Mark as valid when done
|
||||||
|
/// entry.content_reference = Some(content_ref);
|
||||||
|
/// entry.data_stage = Some(DataStage::Data);
|
||||||
|
/// manager.mark_valid(entry).await?;
|
||||||
|
/// ```
|
||||||
|
pub async fn create_entry(&self, step_name: String, content_reference: ContentReference, data_stage: DataStage) -> Result<StateEntry> {
|
||||||
|
// Resolve dependencies from configuration
|
||||||
|
let dependencies = self.dependency_config
|
||||||
|
.get_all_dependencies(&step_name)
|
||||||
|
.unwrap_or_default();
|
||||||
|
|
||||||
|
// Create empty entry with Unknown status
|
||||||
|
let entry = StateEntry {
|
||||||
|
step_name: step_name.clone(),
|
||||||
|
completed: false,
|
||||||
|
completed_at: None,
|
||||||
|
content_reference: Some(content_reference),
|
||||||
|
content_hash: None,
|
||||||
|
data_stage: Some(data_stage),
|
||||||
|
ttl_override: None,
|
||||||
|
last_validated_at: Some(Utc::now()),
|
||||||
|
validation_status: ValidationStatus::Unknown,
|
||||||
|
dependencies,
|
||||||
|
};
|
||||||
|
|
||||||
|
// Update and save
|
||||||
|
let mut entries = self.load_entries().await?;
|
||||||
|
entries.insert(step_name, entry.clone());
|
||||||
|
self.save_entries(&entries).await?;
|
||||||
|
|
||||||
|
Ok(entry)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Mark a StateEntry as valid and save to disk
|
||||||
|
///
|
||||||
|
/// Updates the entry with:
|
||||||
|
/// - `completed: true`
|
||||||
|
/// - `completed_at: now`
|
||||||
|
/// - `validation_status: Valid`
|
||||||
|
/// - Computes and stores content hash
|
||||||
|
///
|
||||||
|
/// # Requires
|
||||||
|
/// - `entry.content_reference` must be `Some()`
|
||||||
|
/// - `entry.data_stage` must be `Some()`
|
||||||
|
pub async fn mark_valid(&self, mut entry: StateEntry) -> Result<StateEntry> {
|
||||||
|
// Get content reference and data stage (required)
|
||||||
|
let content_reference = entry.content_reference.as_ref()
|
||||||
|
.ok_or_else(|| anyhow::anyhow!("content_reference is required to mark entry valid"))?;
|
||||||
|
let data_stage = entry.data_stage
|
||||||
|
.ok_or_else(|| anyhow::anyhow!("data_stage is required to mark entry valid"))?;
|
||||||
|
|
||||||
|
// Compute and store hash
|
||||||
|
let hash = hash_content_reference(content_reference)?;
|
||||||
|
let storage = determine_storage(&hash, &self.base_dir);
|
||||||
|
store_hash(&hash, &storage).await?;
|
||||||
|
|
||||||
|
// Update entry
|
||||||
|
entry.completed = true;
|
||||||
|
entry.completed_at = Some(Utc::now());
|
||||||
|
entry.content_hash = Some(storage);
|
||||||
|
entry.data_stage = Some(data_stage);
|
||||||
|
entry.last_validated_at = Some(Utc::now());
|
||||||
|
entry.validation_status = ValidationStatus::Valid;
|
||||||
|
|
||||||
|
// Save
|
||||||
|
let mut entries = self.load_entries().await?;
|
||||||
|
entries.insert(entry.step_name.clone(), entry.clone());
|
||||||
|
self.save_entries(&entries).await?;
|
||||||
|
|
||||||
|
Ok(entry)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Mark a StateEntry as invalid and save to disk
|
||||||
|
///
|
||||||
|
/// Updates the entry with:
|
||||||
|
/// - `completed: false`
|
||||||
|
/// - `completed_at: None`
|
||||||
|
/// - `validation_status: Invalid { reason }`
|
||||||
|
pub async fn mark_invalid(&self, mut entry: StateEntry, reason: String) -> Result<StateEntry> {
|
||||||
|
// Update entry
|
||||||
|
entry.completed = false;
|
||||||
|
entry.completed_at = None;
|
||||||
|
entry.last_validated_at = Some(Utc::now());
|
||||||
|
entry.validation_status = ValidationStatus::Invalid { reason };
|
||||||
|
|
||||||
|
// Save
|
||||||
|
let mut entries = self.load_entries().await?;
|
||||||
|
entries.insert(entry.step_name.clone(), entry.clone());
|
||||||
|
self.save_entries(&entries).await?;
|
||||||
|
|
||||||
|
Ok(entry)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Check if a step is valid and completed
|
||||||
|
pub async fn is_step_valid(&self, step_name: &str) -> Result<bool> {
|
||||||
|
let entries = self.load_entries().await?;
|
||||||
|
|
||||||
|
if let Some(entry) = entries.get(step_name) {
|
||||||
|
let status = validate_entry(entry).await?;
|
||||||
|
Ok(matches!(status, ValidationStatus::Valid))
|
||||||
|
} else {
|
||||||
|
Ok(false)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Run full validation on all entries
|
||||||
|
pub async fn validate_all(&self) -> Result<ValidationReport> {
|
||||||
|
let mut entries = self.load_entries().await?;
|
||||||
|
let report = validate_all_entries(&mut entries).await?;
|
||||||
|
self.save_entries(&entries).await?;
|
||||||
|
Ok(report)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Print dependency graph information
|
||||||
|
pub fn print_dependency_graph(&self) {
|
||||||
|
println!("=== Dependency Configuration ===");
|
||||||
|
println!("\nCheckpoints: {}", self.dependency_config.checkpoints.len());
|
||||||
|
println!("Groups: {}", self.dependency_config.groups.len());
|
||||||
|
|
||||||
|
println!("\n--- Checkpoints ---");
|
||||||
|
for (name, config) in &self.dependency_config.checkpoints {
|
||||||
|
println!("{}", name);
|
||||||
|
if !config.description.is_empty() {
|
||||||
|
println!(" Description: {}", config.description);
|
||||||
|
}
|
||||||
|
if let Some(group) = &config.group {
|
||||||
|
println!(" Group: {}", group);
|
||||||
|
}
|
||||||
|
if !config.depends_on.is_empty() {
|
||||||
|
println!(" Depends on: {}", config.depends_on.join(", "));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Show resolved dependencies
|
||||||
|
if let Ok(all_deps) = self.dependency_config.get_all_dependencies(name) {
|
||||||
|
if !all_deps.is_empty() {
|
||||||
|
println!(" Resolved (including transitive): {}", all_deps.join(", "));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
println!();
|
||||||
|
}
|
||||||
|
|
||||||
|
println!("\n--- Groups ---");
|
||||||
|
for (name, group) in &self.dependency_config.groups {
|
||||||
|
println!("{}", name);
|
||||||
|
if !group.description.is_empty() {
|
||||||
|
println!(" Description: {}", group.description);
|
||||||
|
}
|
||||||
|
println!(" Members: {}", group.members.join(", "));
|
||||||
|
if !group.depends_on.is_empty() {
|
||||||
|
println!(" Group dependencies: {}", group.depends_on.join(", "));
|
||||||
|
}
|
||||||
|
println!();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ============================================================================
|
||||||
|
// HELPER FUNCTIONS
|
||||||
|
// ============================================================================
|
||||||
|
|
||||||
|
/// Create a simple file reference
|
||||||
|
pub fn file_reference<P: AsRef<Path>>(path: P) -> ContentReference {
|
||||||
|
ContentReference::File { path: path.as_ref().to_path_buf() }
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Create a directory reference
|
||||||
|
pub fn directory_reference<P: AsRef<Path>>(
|
||||||
|
path: P,
|
||||||
|
include_patterns: Option<Vec<String>>,
|
||||||
|
exclude_patterns: Option<Vec<String>>,
|
||||||
|
) -> ContentReference {
|
||||||
|
ContentReference::Directory {
|
||||||
|
path: path.as_ref().to_path_buf(),
|
||||||
|
include_patterns,
|
||||||
|
exclude_patterns,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Create a composite reference
|
||||||
|
pub fn composite_reference(references: Vec<ContentReference>) -> ContentReference {
|
||||||
|
ContentReference::Composite { references }
|
||||||
|
}
|
||||||
78
src/util/logger.rs
Normal file
@@ -0,0 +1,78 @@
// src/util/logger.rs
use chrono::Local;
use once_cell::sync::Lazy;
use tokio::sync::Mutex;
use std::fs::{self, OpenOptions};
use std::io::Write;
use std::path::PathBuf;

static LOGGER: Lazy<Mutex<Option<DebugLogger>>> = Lazy::new(|| Mutex::new(None));

pub struct DebugLogger {
    file: std::fs::File,
    log_path: PathBuf,
}

impl DebugLogger {
    fn new(log_dir: &std::path::Path) -> std::io::Result<Self> {
        fs::create_dir_all(log_dir)?;
        let filename = format!("backtest_{}.log", Local::now().format("%Y%m%d_%H%M%S"));
        let log_path = log_dir.join(&filename);
        let file = OpenOptions::new()
            .create(true)
            .append(true)
            .open(&log_path)?;
        Ok(Self { file, log_path })
    }

    async fn log(&mut self, msg: &str) {
        let line = format!("[{}] {}\n", Local::now().format("%H:%M:%S"), msg);
        let _ = self.file.write_all(line.as_bytes());
        let _ = self.file.flush();
        println!("{}", line.trim_end());
    }
}

pub async fn init_debug_logger(log_dir: &std::path::Path) -> Result<(), String> {
    let mut logger = LOGGER.lock().await;
    match DebugLogger::new(log_dir) {
        Ok(l) => {
            let log_path = l.log_path.clone();
            *logger = Some(l);
            println!("✓ Logger initialized at: {:?}", log_path);
            Ok(())
        }
        Err(e) => {
            let err_msg = format!("Failed to initialize logger: {}", e);
            eprintln!("{}", err_msg);
            Err(err_msg)
        }
    }
}

pub async fn log_message(msg: &str) {
    let mut logger = LOGGER.lock().await;
    if let Some(l) = logger.as_mut() {
        l.log(msg).await;
    } else {
        println!("[LOG] {}", msg);
    }
}

pub async fn log_detailed(level: &str, msg: &str) {
    let formatted = format!("[{}] {}", level, msg);
    log_message(&formatted).await;
}

pub async fn log_info(msg: &str) {
    log_detailed("INFO", msg).await;
}

pub async fn log_warn(msg: &str) {
    log_detailed("WARN", msg).await;
}

pub async fn log_error(msg: &str) {
    log_detailed("ERROR", msg).await;
}
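A minimal usage sketch (not part of the diff) for the logger module. It assumes the crate name `event_backtest_engine` taken from the opnv.rs doc example below, that `util::logger` is publicly reachable, and that `./data/logs` is an acceptable log directory; all three are assumptions.

// Hypothetical usage of the debug logger; the crate name and log directory
// are illustrative assumptions.
use std::path::Path;

use event_backtest_engine::util::logger;

#[tokio::main]
async fn main() {
    // Initialize once at startup; logging falls back to plain println! if this fails.
    if let Err(e) = logger::init_debug_logger(Path::new("./data/logs")).await {
        eprintln!("continuing without file logging: {}", e);
    }
    logger::log_info("scraper starting").await;
    logger::log_warn("VPN rotation disabled").await;
    logger::log_error("example error message").await;
}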
28
src/util/macros.rs
Normal file
@@ -0,0 +1,28 @@
// src/util/macros.rs
#[macro_export]
macro_rules! check_shutdown {
    ($shutdown_flag:expr) => {
        if $shutdown_flag.load(std::sync::atomic::Ordering::SeqCst) {
            logger::log_warn("Shutdown detected, stopping processes").await;
            return Ok(());
        }
    };
}

/// Mark incomplete state on shutdown
/// Usage: mark_incomplete_on_shutdown!(&manager, "step_name", content_ref, DataStage::Data, &shutdown_flag);
#[macro_export]
macro_rules! mark_incomplete_on_shutdown {
    ($manager:expr, $step_name:expr, $content_ref:expr, $data_stage:expr, $shutdown_flag:expr) => {
        if $shutdown_flag.load(std::sync::atomic::Ordering::SeqCst) {
            $manager
                .mark_incomplete(
                    $step_name.to_string(),
                    $content_ref,
                    $data_stage,
                    "Incomplete due to shutdown".to_string(),
                )
                .await?;
        }
    };
}
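A hypothetical call site (not part of the diff) showing how these macros expand inside an async step that returns a Result. The `CheckpointManager`, `ContentReference`, and `DataStage` names, the step name, and the function itself are illustrative assumptions; the macros are referenced via `crate::` because `#[macro_export]` places them at the crate root.

// Illustrative only. `check_shutdown!` calls `logger::log_warn` unqualified,
// so the logger module must be imported at the call site, and the surrounding
// function must be async and return a Result so `return Ok(())` and `?` work.
use std::sync::atomic::AtomicBool;
use std::sync::Arc;

async fn scrape_phase(
    manager: &CheckpointManager,   // assumed checkpoint-manager type
    content_ref: ContentReference, // e.g. file_reference("data/economic_events.jsonl")
    shutdown_flag: Arc<AtomicBool>,
) -> anyhow::Result<()> {
    use crate::util::logger; // required by the expansion of check_shutdown!

    crate::check_shutdown!(shutdown_flag); // returns Ok(()) early on shutdown
    // ... perform one unit of scraping work here ...
    crate::mark_incomplete_on_shutdown!(
        manager,
        "scrape_economic_events",
        content_ref,
        DataStage::Data,
        shutdown_flag
    );
    Ok(())
}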
6
src/util/mod.rs
Normal file
@@ -0,0 +1,6 @@
// src/util/mod.rs
pub mod logger;
pub mod directories;
pub mod opnv;
pub mod macros;
pub mod integrity;
281
src/util/opnv.rs
Normal file
@@ -0,0 +1,281 @@
// src/util/opnv.rs

//! Module for fetching, downloading, and extracting OpenVPN configurations from VPNBook.
//!
//! This module provides functionality to scrape the VPNBook free VPN page using
//! a headless browser, handle potential consent popups, extract current credentials,
//! collect download URLs for OpenVPN ZIP files, download them, and then extract
//! the .ovpn files into a structured directory: cache/openvpn/<hostname>/<ovpn_filename>.
//! It is designed to fetch the most recent data on every run, as credentials and
//! server configurations change periodically.

use anyhow::{anyhow, Context, Result};
use fantoccini::{Client, Locator};
use reqwest;
use std::io::Read;
use std::path::{Path, PathBuf};
use tokio::fs::File;
use tokio::io::AsyncWriteExt;
use url::Url;
use zip::ZipArchive;
use crate::scraper::webdriver::{ChromeDriverPool, ScrapeTask};
use crate::util::directories::DataPaths;

/// Fetches, downloads, and extracts the latest OpenVPN configurations from VPNBook.
///
/// This asynchronous function uses the provided `ChromeDriverPool` to scrape the
/// VPNBook free VPN page. It dismisses any consent popup if present, extracts the
/// current username and password, collects all OpenVPN ZIP download URLs, downloads
/// the ZIP files temporarily, extracts the .ovpn files into the specified directory
/// structure under `cache_dir`/openvpn/<hostname>/, and cleans up the ZIP files.
///
/// The directory structure is: cache/openvpn/<hostname>/<ovpn_filename>, where
/// <hostname> is derived from the ZIP filename (e.g., "ca149.vpnbook.com").
///
/// The function ensures fresh data is fetched each time it runs, making it suitable
/// for periodic updates where credentials may change.
///
/// # Arguments
///
/// * `pool` - A reference to the `ChromeDriverPool` for managing browser instances.
/// * `cache_dir` - The path to the base cache directory. The OpenVPN files will be saved
///   under `cache_dir`/openvpn/<hostname>/.
///
/// # Returns
///
/// A `Result` containing a tuple with:
/// - `String`: The scraped username.
/// - `String`: The scraped password.
/// - `Vec<PathBuf>`: Paths to the extracted .ovpn files.
///
/// # Errors
///
/// Returns an `anyhow::Error` if:
/// - Navigation to the page fails.
/// - The consent popup cannot be dismissed (if present).
/// - Credentials cannot be parsed from the page.
/// - Download URLs cannot be found or are invalid.
/// - HTTP downloads fail or file writing errors occur.
/// - ZIP extraction fails (e.g., invalid ZIP or I/O errors).
///
/// # Dependencies
///
/// This function requires the following crates (add to Cargo.toml if not present):
/// - `anyhow` for error handling.
/// - `fantoccini` for browser automation.
/// - `reqwest` (with `tokio` features) for HTTP downloads.
/// - `tokio` for asynchronous file operations.
/// - `url` for URL manipulation.
/// - `zip` for ZIP extraction.
///
/// # Examples
///
/// ```no_run
/// use anyhow::Result;
/// use event_backtest_engine::util::opnv::fetch_vpnbook_configs;
/// use event_backtest_engine::scraper::webdriver::ChromeDriverPool;
/// use std::path::Path;
///
/// #[tokio::main]
/// async fn main() -> Result<()> {
///     let pool = ChromeDriverPool::new(1).await?;
///     let (username, password, files) =
///         fetch_vpnbook_configs(&pool, Path::new("./cache")).await?;
///     println!("Username: {}, Password: {}", username, password);
///     for file in files {
///         println!("Extracted: {:?}", file);
///     }
///     Ok(())
/// }
/// ```
pub async fn fetch_vpnbook_configs(
    pool: &ChromeDriverPool,
    cache_dir: &Path,
) -> Result<(String, String, Vec<PathBuf>)> {
    // Prepare the openvpn directory
    let dir = DataPaths::new(".")?;
    let vpn_dir = dir.cache_openvpn_dir();
    tokio::fs::create_dir_all(&vpn_dir)
        .await
        .context("Failed to create openvpn directory")?;

    // Temporary directory for ZIP downloads (under cache for consistency)
    let temp_dir = cache_dir.join("temp_vpn_zips");
    tokio::fs::create_dir_all(&temp_dir)
        .await
        .context("Failed to create temp directory")?;

    let url = "https://www.vpnbook.com/freevpn".to_string();

    // Define the scraping task
    let task = ScrapeTask::new(url, |client: Client| async move {
        // Attempt to dismiss consent popup if present
        let consent_selector = r#"body > div.fc-consent-root > div.fc-dialog-container > div.fc-dialog.fc-choice-dialog > div.fc-footer-buttons-container > div.fc-footer-buttons > button.fc-button.fc-cta-do-not-consent.fc-secondary-button > p"#;
        if let Ok(consent_elem) = client.find(Locator::Css(consent_selector)).await {
            consent_elem
                .click()
                .await
                .context("Failed to click consent dismissal button")?;
            // Brief delay to allow popup to close
            tokio::time::sleep(std::time::Duration::from_secs(1)).await;
        }

        // Find all <code> elements
        let codes = client
            .find_all(Locator::Css("code"))
            .await
            .context("Failed to find code elements")?;

        if codes.len() < 2 {
            return Err(anyhow!("Insufficient code elements found for credentials"));
        }

        // The first <code> is username, second is password
        let username = codes[0]
            .text()
            .await
            .context("Failed to get username text")?;

        let password = codes[1]
            .text()
            .await
            .context("Failed to get password text")?;

        // Locate all download links for OpenVPN ZIP files
        let links = client
            .find_all(Locator::Css(r#"a[href^="/free-openvpn-account/"][download=""]"#))
            .await
            .context("Failed to find download links")?;

        // Collect relative hrefs
        let mut rel_urls = Vec::new();
        for link in links {
            if let Some(href) = link.attr("href").await.context("Failed to get href attribute")? {
                rel_urls.push(href);
            }
        }

        Ok((username, password, rel_urls))
    });

    // Execute the scraping task using the pool
    let (username, password, rel_urls) = task.execute_with_pool(pool).await?;

    // Base URL for resolving relative paths
    let base_url = Url::parse("https://www.vpnbook.com/")?;

    // Download each ZIP file to temp_dir
    let mut zip_paths = Vec::new();
    for rel in &rel_urls {
        let full_url = base_url.join(rel).context("Failed to join URL")?;
        let filename = rel
            .split('/')
            .last()
            .ok_or_else(|| anyhow!("Invalid filename in URL"))?
            .to_string();
        let out_path = temp_dir.join(&filename);

        // Perform HTTP GET request
        let resp = reqwest::get(full_url.clone())
            .await
            .with_context(|| format!("Failed to send download request for {}", full_url))?;

        if resp.status().is_success() {
            let bytes = resp
                .bytes()
                .await
                .context("Failed to read response bytes")?;

            // Write to file asynchronously
            let mut file = File::create(&out_path)
                .await
                .context("Failed to create output file")?;
            file.write_all(&bytes)
                .await
                .context("Failed to write to file")?;

            zip_paths.push(out_path);
        } else {
            return Err(anyhow!(
                "Download failed with status: {} for URL: {}",
                resp.status(),
                full_url
            ));
        }
    }

    // Now extract .ovpn files from each ZIP
    let mut extracted_paths = Vec::new();
    for zip_path in zip_paths {
        let hostname = get_hostname_from_zip_filename(
            zip_path.file_name().unwrap().to_str().unwrap(),
        );
        let hostname_dir = vpn_dir.join(&hostname);
        tokio::fs::create_dir_all(&hostname_dir)
            .await
            .context("Failed to create hostname directory")?;

        // Use spawn_blocking for sync ZIP operations
        let zip_path_clone = zip_path.clone();
        let hostname_dir_clone = hostname_dir.clone();
        let extract_result = tokio::task::spawn_blocking(move || {
            let file = std::fs::File::open(&zip_path_clone)
                .with_context(|| format!("Failed to open ZIP file: {:?}", zip_path_clone))?;
            let mut archive = ZipArchive::new(file)
                .with_context(|| format!("Failed to read ZIP archive: {:?}", zip_path_clone))?;

            let mut paths = Vec::new();
            for i in 0..archive.len() {
                let mut zip_file = archive.by_index(i)?;
                if zip_file.name().ends_with(".ovpn") {
                    // Get just the filename, stripping any path
                    let file_name = Path::new(zip_file.name()).file_name()
                        .ok_or_else(|| anyhow!("Invalid file name in ZIP: {}", zip_file.name()))?
                        .to_str()
                        .ok_or_else(|| anyhow!("Invalid UTF-8 in file name: {}", zip_file.name()))?
                        .to_string();
                    let target_path = hostname_dir_clone.join(file_name);
                    let mut content = Vec::new();
                    zip_file.read_to_end(&mut content)?;

                    std::fs::write(&target_path, &content)
                        .with_context(|| format!("Failed to write .ovpn file: {:?}", target_path))?;
                    paths.push(target_path);
                }
            }
            Ok::<Vec<PathBuf>, anyhow::Error>(paths)
        })
        .await
        .context("Spawn blocking failed")??;

        extracted_paths.extend(extract_result);

        // Clean up the ZIP file after extraction
        tokio::fs::remove_file(&zip_path)
            .await
            .context("Failed to remove temp ZIP file")?;
    }

    // Optional: Clean up temp_dir if empty
    let _ = tokio::fs::remove_dir(&temp_dir).await;

    Ok((username, password, extracted_paths))
}

/// Derives the hostname from the ZIP filename.
///
/// For example, "vpnbook-openvpn-ca149.zip" -> "ca149.vpnbook.com"
///
/// If the format doesn't match, returns "unknown.vpnbook.com".
fn get_hostname_from_zip_filename(filename: &str) -> String {
    if filename.starts_with("vpnbook-openvpn-") && filename.ends_with(".zip") {
        let code = filename
            .strip_prefix("vpnbook-openvpn-")
            .unwrap()
            .strip_suffix(".zip")
            .unwrap();
        format!("{}.vpnbook.com", code)
    } else {
        "unknown.vpnbook.com".to_string()
    }
}
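A small test sketch (not part of the diff) pinning down the filename-to-hostname mapping documented above; both the example mapping and the fallback value are taken from the function itself.

#[cfg(test)]
mod tests {
    use super::get_hostname_from_zip_filename;

    #[test]
    fn maps_vpnbook_zip_names_to_hostnames() {
        // Pattern documented above: "vpnbook-openvpn-ca149.zip" -> "ca149.vpnbook.com"
        assert_eq!(
            get_hostname_from_zip_filename("vpnbook-openvpn-ca149.zip"),
            "ca149.vpnbook.com"
        );
        // Anything outside the expected pattern falls back to the placeholder host.
        assert_eq!(
            get_hostname_from_zip_filename("random-archive.zip"),
            "unknown.vpnbook.com"
        );
    }
}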