Compare commits
14 Commits
feature/br...1bda78897b
| SHA1 |
|---|
| 1bda78897b |
| 470f0922ed |
| c9da56e8e9 |
| dde859b071 |
| 2416947e9d |
| 3ab5d0dcc3 |
| c2408d9a56 |
| f95e9e2427 |
| c00bfd8687 |
| 0f89c8c0ce |
| a6823dc938 |
| 58a498e694 |
| f7083bf9f0 |
| f05df0b5ee |
48
.env.example
Normal file
@@ -0,0 +1,48 @@
# WebScraper Configuration File (.env)
# ====================================
# This file configures the behavior of the WebScraper application
# Copy to .env and adjust values as needed

# ===== ECONOMIC DATA =====
# Start date for economic event scraping
ECONOMIC_START_DATE=2007-02-13

# How far into the future to look ahead for economic events (in months)
ECONOMIC_LOOKAHEAD_MONTHS=3

# ===== CORPORATE DATA =====
# Start date for corporate earnings/data scraping
CORPORATE_START_DATE=2010-01-01

# ===== PERFORMANCE & CONCURRENCY =====
# Maximum number of parallel ChromeDriver instances
# Higher = more concurrent tasks, but higher resource usage
MAX_PARALLEL_INSTANCES=3

# Maximum tasks per ChromeDriver instance before recycling
# 0 = unlimited (instance lives for entire application runtime)
MAX_TASKS_PER_INSTANCE=0

# ===== VPN ROTATION (ProtonVPN Integration) =====
# Enable automatic VPN rotation between sessions?
# If false, all traffic goes through system without VPN tunneling
ENABLE_VPN_ROTATION=false

# Comma-separated list of ProtonVPN servers to rotate through
# Examples:
# "US-Free#1,US-Free#2,UK-Free#1"
# "US,UK,JP,DE,NL"
# NOTE: Must have ENABLE_VPN_ROTATION=true for this to take effect
VPN_SERVERS=

# Number of tasks per VPN session before rotating to new server/IP
# 0 = rotate between economic and corporate phases (one phase = one IP)
# 5 = rotate every 5 tasks
# NOTE: Must have ENABLE_VPN_ROTATION=true for this to take effect
TASKS_PER_VPN_SESSION=0

# ===== LOGGING =====
# Set via RUST_LOG environment variable:
# RUST_LOG=info cargo run
# RUST_LOG=debug cargo run
# Leave empty or unset for default logging level
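These variables are consumed at startup by the reworked `Config::load` in `src/config.rs` further down in this diff. As a minimal standalone sketch of the same pattern, assuming only the `dotenvy` crate added in Cargo.toml (the fallback values here are illustrative, not authoritative):

```rust
use anyhow::Context;

fn main() -> anyhow::Result<()> {
    // Load .env from the working directory; a missing file is fine, process env still applies.
    let _ = dotenvy::dotenv();

    // Each variable falls back to a documented default when unset.
    let max_parallel_instances: usize = dotenvy::var("MAX_PARALLEL_INSTANCES")
        .unwrap_or_else(|_| "3".to_string())
        .parse()
        .context("MAX_PARALLEL_INSTANCES must be an integer")?;

    let enable_vpn_rotation: bool = dotenvy::var("ENABLE_VPN_ROTATION")
        .unwrap_or_else(|_| "false".to_string())
        .parse()
        .context("ENABLE_VPN_ROTATION must be true or false")?;

    println!("instances={max_parallel_instances}, vpn={enable_vpn_rotation}");
    Ok(())
}
```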
21
.gitignore
vendored
@@ -27,10 +27,17 @@ target/

# /chromedriver-win64/*

# data folders
/economic_events*
/economic_event_changes*
/corporate_events*
/corporate_prices*
/corporate_event_changes*
/data*
# data files
**/*.json
**/*.jsonl
**/*.csv
**/*.zip
**/*.log
**/*.ovpn

#/economic_events*
#/economic_event_changes*
#/corporate_events*
#/corporate_prices*
#/corporate_event_changes*
#/data*
63
Cargo.lock
generated
@@ -671,16 +671,19 @@ dependencies = [
"fantoccini",
"flate2",
"futures",
"once_cell",
"rand 0.9.2",
"rayon",
"regex",
"reqwest",
"scraper",
"serde",
"serde_json",
"tokio",
"toml",
"tracing",
"tracing-subscriber",
"url",
"walkdir",
"yfinance-rs",
"zip",
]
@@ -2527,6 +2530,15 @@ version = "1.0.20"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f"

[[package]]
name = "same-file"
version = "1.0.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502"
dependencies = [
"winapi-util",
]

[[package]]
name = "schannel"
version = "0.1.28"
@@ -2672,15 +2684,6 @@ dependencies = [
"serde_core",
]

[[package]]
name = "serde_spanned"
version = "1.0.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e24345aa0fe688594e73770a5f6d1b216508b4f93484c0026d521acd30134392"
dependencies = [
"serde_core",
]

[[package]]
name = "serde_urlencoded"
version = "0.7.1"
@@ -3126,21 +3129,6 @@ dependencies = [
"tokio",
]

[[package]]
name = "toml"
version = "0.9.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f0dc8b1fb61449e27716ec0e1bdf0f6b8f3e8f6b05391e8497b8b6d7804ea6d8"
dependencies = [
"indexmap",
"serde_core",
"serde_spanned",
"toml_datetime",
"toml_parser",
"toml_writer",
"winnow",
]

[[package]]
name = "toml_datetime"
version = "0.7.3"
@@ -3171,12 +3159,6 @@ dependencies = [
"winnow",
]

[[package]]
name = "toml_writer"
version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "df8b2b54733674ad286d16267dcfc7a71ed5c776e4ac7aa3c3e2561f7c637bf2"

[[package]]
name = "tower"
version = "0.5.2"
@@ -3390,6 +3372,16 @@ version = "0.9.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a"

[[package]]
name = "walkdir"
version = "2.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b"
dependencies = [
"same-file",
"winapi-util",
]

[[package]]
name = "want"
version = "0.3.1"
@@ -3521,6 +3513,15 @@ dependencies = [
"rustls-pki-types",
]

[[package]]
name = "winapi-util"
version = "0.1.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22"
dependencies = [
"windows-sys 0.61.2",
]

[[package]]
name = "windows-core"
version = "0.62.2"
@@ -1,7 +1,7 @@
[package]
name = "event_backtest_engine"
version = "0.1.0"
edition = "2021"
edition = "2024"
authors = ["Your Name <you@example.com>"]
description = "High-impact economic & corporate earnings data collector for short-event backtesting (overnight/weekend gaps)"
license = "MIT OR Apache-2.0"
@@ -21,6 +21,7 @@ reqwest = { version = "0.12", features = ["json", "gzip", "brotli", "deflate", "
scraper = "0.19" # HTML parsing for Yahoo earnings pages
fantoccini = { version = "0.20", features = ["rustls-tls"] } # Headless Chrome for finanzen.net
yfinance-rs = "0.7.2"
url = "2.5.7"

# Serialization
serde = { version = "1.0", features = ["derive"] }
@@ -29,12 +30,15 @@ csv = "1.3"
zip = "6.0.0"
flate2 = "1.1.5"

# Formatting
regex = "1.12.2"
walkdir = "2"

# Generating
rand = "0.9.2"

# Environment handling
dotenvy = "0.15"
toml = "0.9.8"

# Date & time
chrono = { version = "0.4", features = ["serde"] }
@@ -45,6 +49,7 @@ anyhow = "1.0"
# Logging (optional but recommended)
tracing = "0.1"
tracing-subscriber = { version = "0.3", features = ["fmt", "env-filter"] }
once_cell = "1.21.3"

# Parallel processing (for batch tickers)
futures = "0.3"
15
cache/openfigi/INFO.md
vendored
Normal file
@@ -0,0 +1,15 @@
# Openfigi Data

## Market Security Description
| Code | Meaning |
| ---------- | --------------------------------------------------------- |
| **Comdty** | Commodity (e.g., oil, gold futures, physical commodities) |
| **Corp** | Corporate bond / corporate debt security |
| **Curncy** | Currency or FX pair (e.g., EURUSD) |
| **Equity** | Stocks / shares |
| **Govt** | Government bond (Treasuries, Bunds, Gilts, etc.) |
| **Index** | Market indices (S&P 500, DAX, NYSE Composite…) |
| **M-Mkt** | Money market instruments (commercial paper, CDs, T-bills) |
| **Mtge** | Mortgage-backed securities (MBS) |
| **Muni** | Municipal bonds (US state/local government debt) |
| **Pfd** | Preferred shares |
15
data/INFO.md
Normal file
@@ -0,0 +1,15 @@
# Global Data Info

## Exchanges

Source: Wikipedia

## Gleif

Data download [.zip] via the website

## OpenFigi

Data scraping via the open API

API key: .env
6
data/economic/INFO.md
Normal file
@@ -0,0 +1,6 @@
# Economic Info

## Sources

* continents: finanzen.net
* countries: finanzen.net
BIN
event_backtest_engine.exe
Normal file
Binary file not shown.
@@ -12,21 +12,40 @@ pub struct Config {
pub economic_lookahead_months: u32, // default: 3
/// Maximum number of parallel scraping tasks (default: 10).
/// This limits concurrency to protect system load and prevent website spamming.
#[serde(default = "default_max_parallel")]
pub max_parallel_tasks: usize,
#[serde(default = "default_max_parallel_instances")]
pub max_parallel_instances: usize,

pub max_tasks_per_instance: usize,

/// VPN rotation configuration
/// If set to "true", enables automatic VPN rotation between sessions
#[serde(default)]
pub enable_vpn_rotation: bool,

/// Number of tasks per session before rotating VPN
/// If set to 0, rotates VPN between economic and corporate phases
#[serde(default = "default_tasks_per_session")]
pub tasks_per_vpn_session: usize,
}

fn default_max_parallel() -> usize {
fn default_max_parallel_instances() -> usize {
10
}

fn default_tasks_per_session() -> usize {
0 // 0 = rotate between economic/corporate
}

impl Default for Config {
fn default() -> Self {
Self {
economic_start_date: "2007-02-13".to_string(),
corporate_start_date: "2010-01-01".to_string(),
economic_lookahead_months: 3,
max_parallel_tasks: default_max_parallel(),
max_parallel_instances: default_max_parallel_instances(),
max_tasks_per_instance: 0,
enable_vpn_rotation: false,
tasks_per_vpn_session: default_tasks_per_session(),
}
}
}
@@ -59,16 +78,34 @@ impl Config {
.parse()
.context("Failed to parse ECONOMIC_LOOKAHEAD_MONTHS as u32")?;

let max_parallel_tasks: usize = dotenvy::var("MAX_PARALLEL_TASKS")
let max_parallel_instances: usize = dotenvy::var("MAX_PARALLEL_INSTANCES")
.unwrap_or_else(|_| "10".to_string())
.parse()
.context("Failed to parse MAX_PARALLEL_TASKS as usize")?;
.context("Failed to parse MAX_PARALLEL_INSTANCES as usize")?;

let max_tasks_per_instance: usize = dotenvy::var("MAX_TASKS_PER_INSTANCE")
.unwrap_or_else(|_| "0".to_string())
.parse()
.context("Failed to parse MAX_TASKS_PER_INSTANCE as usize")?;

let enable_vpn_rotation = dotenvy::var("ENABLE_VPN_ROTATION")
.unwrap_or_else(|_| "false".to_string())
.parse::<bool>()
.context("Failed to parse ENABLE_VPN_ROTATION as bool")?;

let tasks_per_vpn_session: usize = dotenvy::var("TASKS_PER_VPN_SESSION")
.unwrap_or_else(|_| "0".to_string())
.parse()
.context("Failed to parse TASKS_PER_VPN_SESSION as usize")?;

Ok(Self {
economic_start_date,
corporate_start_date,
economic_lookahead_months,
max_parallel_tasks,
max_parallel_instances,
max_tasks_per_instance,
enable_vpn_rotation,
tasks_per_vpn_session,
})
}
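For reference, a small illustrative check of what the `Default` impl above yields. The field values are taken from the diff; the test itself is a sketch and not part of this PR:

```rust
#[cfg(test)]
mod tests {
    use super::Config;

    #[test]
    fn default_config_matches_documented_values() {
        let cfg = Config::default();
        assert_eq!(cfg.economic_start_date, "2007-02-13");
        assert_eq!(cfg.economic_lookahead_months, 3);
        assert_eq!(cfg.max_parallel_instances, 10); // default_max_parallel_instances()
        assert_eq!(cfg.max_tasks_per_instance, 0);  // 0 = unlimited
        assert!(!cfg.enable_vpn_rotation);
        assert_eq!(cfg.tasks_per_vpn_session, 0);   // 0 = rotate per phase
    }
}
```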
@@ -1,6 +1,7 @@
// src/corporate/aggregation.rs
use super::types::CompanyPrice;
use super::storage::*;
use crate::util::directories::DataPaths;
use tokio::fs;
use std::collections::HashMap;

@@ -16,8 +17,8 @@ struct DayData {
}

/// Aggregate price data from multiple exchanges, converting all to USD
pub async fn aggregate_best_price_data(lei: &str) -> anyhow::Result<()> {
let company_dir = get_company_dir(lei);
pub async fn aggregate_best_price_data(paths: &DataPaths, lei: &str) -> anyhow::Result<()> {
let company_dir = get_company_dir(paths, lei);

for timeframe in ["daily", "5min"].iter() {
let source_dir = company_dir.join(timeframe);
File diff suppressed because it is too large
@@ -1,7 +1,7 @@
// src/corporate/scraper.rs
use super::{types::*, helpers::*, openfigi::*};
//use crate::corporate::openfigi::OpenFigiClient;
use crate::{scraper::webdriver::*};
use crate::{scraper::webdriver::*, util::directories::DataPaths, util::logger};
use fantoccini::{Client, Locator};
use scraper::{Html, Selector};
use chrono::{DateTime, Duration, NaiveDate, Utc};
@@ -15,160 +15,6 @@ use anyhow::{anyhow, Result};

const USER_AGENT: &str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36";

/// Discover all exchanges where this ISIN trades by querying Yahoo Finance and enriching with OpenFIGI API calls.
///
/// # Arguments
/// * `isin` - The ISIN to search for.
/// * `known_ticker` - A known ticker symbol for fallback or initial check.
///
/// # Returns
/// A vector of FigiInfo structs containing enriched data from API calls.
///
/// # Errors
/// Returns an error if HTTP requests fail, JSON parsing fails, or OpenFIGI API responds with an error.
pub async fn discover_available_exchanges(isin: &str, known_ticker: &str) -> anyhow::Result<Vec<FigiInfo>> {
println!(" Discovering exchanges for ISIN {}", isin);

let mut potential: Vec<(String, PrimaryInfo)> = Vec::new();

// Try the primary ticker first
if let Ok(info) = check_ticker_exists(known_ticker).await {
potential.push((known_ticker.to_string(), info));
}

// Search for ISIN directly on Yahoo to find other listings
let search_url = format!(
"https://query2.finance.yahoo.com/v1/finance/search?q={}&quotesCount=20&newsCount=0",
isin
);

let resp = HttpClient::new()
.get(&search_url)
.header("User-Agent", USER_AGENT)
.send()
.await?;

let json = resp.json::<Value>().await?;

if let Some(quotes) = json["quotes"].as_array() {
for quote in quotes {
// First: filter by quoteType directly from search results (faster rejection)
let quote_type = quote["quoteType"].as_str().unwrap_or("");
if quote_type.to_uppercase() != "EQUITY" {
continue; // Skip bonds, ETFs, mutual funds, options, etc.
}

if let Some(symbol) = quote["symbol"].as_str() {
// Avoid duplicates
if potential.iter().any(|(s, _)| s == symbol) {
continue;
}

// Double-check with full quote data (some search results are misleading)
if let Ok(info) = check_ticker_exists(symbol).await {
potential.push((symbol.to_string(), info));
}
}
}
}

if potential.is_empty() {
return Ok(vec![]);
}

// Enrich with OpenFIGI API
let client = OpenFigiClient::new()?;

let mut discovered_figis = Vec::new();

if !client.has_key() {
// Fallback without API key - create FigiInfo with default/empty fields
for (symbol, info) in potential {
println!(" Found equity listing: {} on {} ({}) - no FIGI (fallback mode)", symbol, info.exchange_mic, info.currency);
let figi_info = FigiInfo {
isin: info.isin,
figi: String::new(),
name: info.name,
ticker: symbol,
mic_code: info.exchange_mic,
currency: info.currency,
compositeFIGI: String::new(),
securityType: String::new(),
marketSector: String::new(),
shareClassFIGI: String::new(),
securityType2: String::new(),
securityDescription: String::new(),
};
discovered_figis.push(figi_info);
}
return Ok(discovered_figis);
}

// With API key, batch the mapping requests
let chunk_size = 100;
for chunk in potential.chunks(chunk_size) {
let mut jobs = vec![];
for (symbol, info) in chunk {
jobs.push(json!({
"idType": "TICKER",
"idValue": symbol,
"micCode": info.exchange_mic,
"marketSecDes": "Equity",
}));
}

let resp = client.get_figi_client()
.post("https://api.openfigi.com/v3/mapping")
.header("Content-Type", "application/json")
.json(&jobs)
.send()
.await?;

if !resp.status().is_success() {
return Err(anyhow::anyhow!("OpenFIGI mapping failed with status: {}", resp.status()));
}

let parsed: Vec<Value> = resp.json().await?;

for (i, item) in parsed.iter().enumerate() {
let (symbol, info) = &chunk[i];
if let Some(data) = item["data"].as_array() {
if let Some(entry) = data.first() {
let market_sec = entry["marketSector"].as_str().unwrap_or("");
if market_sec != "Equity" {
continue;
}
println!(" Found equity listing: {} on {} ({}) - FIGI: {}", symbol, info.exchange_mic, info.currency, entry["figi"]);
let figi_info = FigiInfo {
isin: info.isin.clone(),
figi: entry["figi"].as_str().unwrap_or("").to_string(),
name: entry["name"].as_str().unwrap_or(&info.name).to_string(),
ticker: symbol.clone(),
mic_code: info.exchange_mic.clone(),
currency: info.currency.clone(),
compositeFIGI: entry["compositeFIGI"].as_str().unwrap_or("").to_string(),
securityType: entry["securityType"].as_str().unwrap_or("").to_string(),
marketSector: market_sec.to_string(),
shareClassFIGI: entry["shareClassFIGI"].as_str().unwrap_or("").to_string(),
securityType2: entry["securityType2"].as_str().unwrap_or("").to_string(),
securityDescription: entry["securityDescription"].as_str().unwrap_or("").to_string(),
};
discovered_figis.push(figi_info);
} else {
println!(" No data returned for ticker {} on MIC {}", symbol, info.exchange_mic);
}
} else if let Some(error) = item["error"].as_str() {
println!(" OpenFIGI error for ticker {}: {}", symbol, error);
}
}

// Respect rate limit (6 seconds between requests with key)
sleep(TokioDuration::from_secs(6)).await;
}

Ok(discovered_figis)
}

/// Check if a ticker exists on Yahoo Finance and return core metadata.
///
/// This function calls the public Yahoo Finance quoteSummary endpoint and extracts:
@@ -190,7 +36,7 @@ pub async fn discover_available_exchanges(isin: &str, known_ticker: &str) -> any
/// - Not an equity (ETF, bond, etc.)
/// - Missing critical fields
/// - Network or JSON parsing errors
pub async fn check_ticker_exists(ticker: &str) -> anyhow::Result<PrimaryInfo> {
/*pub async fn check_ticker_exists(ticker: &str) -> anyhow::Result<PrimaryInfo> {
let url = format!(
"https://query1.finance.yahoo.com/v10/finance/quoteSummary/{}?modules=price%2CassetProfile",
ticker
@@ -303,34 +149,7 @@ pub async fn check_ticker_exists(ticker: &str) -> anyhow::Result<PrimaryInfo> {
exchange_mic,
currency,
})
}

/// Convert Yahoo's exchange name to MIC code (best effort)
fn exchange_name_to_mic(name: &str) -> String {
match name {
"NMS" | "NasdaqGS" | "NASDAQ" => "XNAS",
"NYQ" | "NYSE" => "XNYS",
"LSE" | "London" => "XLON",
"FRA" | "Frankfurt" | "GER" | "XETRA" => "XFRA",
"PAR" | "Paris" => "XPAR",
"AMS" | "Amsterdam" => "XAMS",
"MIL" | "Milan" => "XMIL",
"JPX" | "Tokyo" => "XJPX",
"HKG" | "Hong Kong" => "XHKG",
"SHH" | "Shanghai" => "XSHG",
"SHZ" | "Shenzhen" => "XSHE",
"TOR" | "Toronto" => "XTSE",
"ASX" | "Australia" => "XASX",
"SAU" | "Saudi" => "XSAU",
"SWX" | "Switzerland" => "XSWX",
"BSE" | "Bombay" => "XBSE",
"NSE" | "NSI" => "XNSE",
"TAI" | "Taiwan" => "XTAI",
"SAO" | "Sao Paulo" => "BVMF",
"MCE" | "Madrid" => "XMAD",
_ => name, // Fallback to name itself
}.to_string()
}
}*/

/// Fetches earnings events for a ticker using a dedicated ScrapeTask.
///
@@ -670,60 +489,164 @@ pub async fn _fetch_latest_gleif_isin_lei_mapping_url(client: &Client) -> anyhow

pub async fn download_isin_lei_csv() -> anyhow::Result<Option<String>> {
let url = "https://mapping.gleif.org/api/v2/isin-lei/9315e3e3-305a-4e71-b062-46714740fa8d/download";
let zip_path = "data/gleif/isin_lei.zip";
let csv_path = "data/gleif/isin_lei.csv";

if let Err(e) = std::fs::create_dir_all("data") {
println!("Failed to create data directory: {e}");
// Initialize DataPaths and create cache/gleif directory
let paths = DataPaths::new(".")?;
let gleif_cache_dir = paths.cache_gleif_dir();

if let Err(e) = std::fs::create_dir_all(&gleif_cache_dir) {
let msg = format!("Failed to create cache/gleif directory: {}", e);
logger::log_error(&msg).await;
println!("{}", msg);
return Ok(None);
}

// Download ZIP
let bytes = match reqwest::Client::builder()
logger::log_info("Corporate Scraper: Downloading ISIN/LEI mapping from GLEIF...").await;

// Download ZIP and get the filename from Content-Disposition header
let client = match reqwest::Client::builder()
.user_agent(USER_AGENT)
.timeout(std::time::Duration::from_secs(30))
.build()
.and_then(|c| Ok(c))
{
Ok(client) => match client.get(url).send().await {
Ok(resp) if resp.status().is_success() => match resp.bytes().await {
Ok(b) => b,
Err(e) => {
println!("Failed to read ZIP bytes: {e}");
return Ok(None);
}
},
Ok(resp) => {
println!("Server returned HTTP {}", resp.status());
return Ok(None);
}
Err(e) => {
println!("Failed to download ISIN/LEI ZIP: {e}");
return Ok(None);
}
},
Ok(c) => c,
Err(e) => {
println!("Failed to create HTTP client: {e}");
let msg = format!("Failed to create HTTP client: {}", e);
logger::log_error(&msg).await;
println!("{}", msg);
return Ok(None);
}
};

if let Err(e) = tokio::fs::write(zip_path, &bytes).await {
println!("Failed to write ZIP file: {e}");
return Ok(None);
let resp = match client.get(url).send().await {
Ok(r) if r.status().is_success() => r,
Ok(resp) => {
let msg = format!("Server returned HTTP {}", resp.status());
logger::log_error(&msg).await;
println!("{}", msg);
return Ok(None);
}
Err(e) => {
let msg = format!("Failed to download ISIN/LEI ZIP: {}", e);
logger::log_error(&msg).await;
println!("{}", msg);
return Ok(None);
}
};

// Extract filename from Content-Disposition header or use default
let filename = resp
.headers()
.get("content-disposition")
.and_then(|h| h.to_str().ok())
.and_then(|s| s.split("filename=").nth(1).map(|f| f.trim_matches('"').to_string()))
.unwrap_or_else(|| "isin_lei.zip".to_string());

// Parse timestamp from filename and convert to DDMMYYYY format
let parsed_filename = parse_gleif_filename(&filename);
logger::log_info(&format!("Corporate Scraper: Downloaded file: {} -> {}", filename, parsed_filename)).await;

// Determine date (DDMMYYYY) from parsed filename: "isin-lei-DDMMYYYY.csv"
let mut date_str = String::new();
if let Some(start_idx) = parsed_filename.find("isin-lei-") {
let rest = &parsed_filename[start_idx + 9..];
if rest.len() >= 8 {
date_str = rest[0..8].to_string();
}
}

// If we parsed a date, use/create a date folder under cache/gleif and operate inside it; otherwise use cache root.
let date_dir = if !date_str.is_empty() {
let p = gleif_cache_dir.join(&date_str);
// Ensure the date folder exists (create if necessary)
if let Err(e) = std::fs::create_dir_all(&p) {
let msg = format!("Failed to create date directory {:?}: {}", p, e);
logger::log_warn(&msg).await;
None
} else {
Some(p)
}
} else {
None
};

// Choose the directory where we'll look for existing files and where we'll save the new ones
let target_dir = date_dir.clone().unwrap_or_else(|| gleif_cache_dir.to_path_buf());

// If the date folder exists (or was created), prefer any *_clean.csv inside it and return that immediately
if let Some(ref ddir) = date_dir {
if let Ok(entries) = std::fs::read_dir(ddir) {
for entry in entries.flatten() {
if let Some(name) = entry.file_name().to_str() {
if name.to_lowercase().ends_with("_clean.csv") {
let path = ddir.join(name);
logger::log_info(&format!("Found existing clean GLEIF CSV: {}", path.display())).await;
return Ok(Some(path.to_string_lossy().to_string()));
}
}
}
}
}

// If no clean file found in the date folder (or date folder doesn't exist), check whether the csv/zip already exist in the target dir
let csv_candidate_name = parsed_filename.replace(".zip", ".csv");
let csv_candidate = target_dir.join(&csv_candidate_name);
let zip_candidate = target_dir.join(&parsed_filename);

if csv_candidate.exists() {
logger::log_info(&format!("Found existing GLEIF CSV: {}", csv_candidate.display())).await;
return Ok(Some(csv_candidate.to_string_lossy().to_string()));
}
if zip_candidate.exists() {
// If zip exists but csv does not, extract later; for now prefer returning csv path (may be created by extraction step)
let inferred_csv = target_dir.join(csv_candidate_name);
if inferred_csv.exists() {
logger::log_info(&format!("Found existing extracted CSV next to ZIP: {}", inferred_csv.display())).await;
return Ok(Some(inferred_csv.to_string_lossy().to_string()));
}
// otherwise we'll overwrite/extract into target_dir below
}

let bytes = match resp.bytes().await {
Ok(b) => b,
Err(e) => {
let msg = format!("Failed to read ZIP bytes: {}", e);
logger::log_error(&msg).await;
println!("{}", msg);
return Ok(None);
}
};
// Ensure target directory exists (create if it's the date folder and was absent earlier)
if let Some(ref ddir) = date_dir {
let _ = std::fs::create_dir_all(ddir);
}

let zip_path = target_dir.join(&parsed_filename);
let csv_path = target_dir.join(parsed_filename.replace(".zip", ".csv"));

if let Err(e) = tokio::fs::write(&zip_path, &bytes).await {
let msg = format!("Failed to write ZIP file: {}", e);
logger::log_error(&msg).await;
println!("{}", msg);
return Ok(None);
}
logger::log_info(&format!("Corporate Scraper: Saved ZIP to {:?}", zip_path)).await;

// Extract CSV
let archive = match std::fs::File::open(zip_path)
let archive = match std::fs::File::open(&zip_path)
.map(ZipArchive::new)
{
Ok(Ok(a)) => a,
Ok(Err(e)) => {
println!("Invalid ZIP: {e}");
let msg = format!("Invalid ZIP: {}", e);
logger::log_error(&msg).await;
println!("{}", msg);
return Ok(None);
}
Err(e) => {
println!("Cannot open ZIP file: {e}");
let msg = format!("Cannot open ZIP file: {}", e);
logger::log_error(&msg).await;
println!("{}", msg);
return Ok(None);
}
};
@@ -737,7 +660,9 @@ pub async fn download_isin_lei_csv() -> anyhow::Result<Option<String>> {
}) {
Some(i) => i,
None => {
println!("ZIP did not contain a CSV file");
let msg = "ZIP did not contain a CSV file";
logger::log_error(msg).await;
println!("{}", msg);
return Ok(None);
}
};
@@ -745,23 +670,55 @@ pub async fn download_isin_lei_csv() -> anyhow::Result<Option<String>> {
let mut csv_file = match archive.by_index(idx) {
Ok(f) => f,
Err(e) => {
println!("Failed to read CSV entry: {e}");
let msg = format!("Failed to read CSV entry: {}", e);
logger::log_error(&msg).await;
println!("{}", msg);
return Ok(None);
}
};

let mut csv_bytes = Vec::new();
if let Err(e) = csv_file.read_to_end(&mut csv_bytes) {
println!("Failed to extract CSV: {e}");
let msg = format!("Failed to extract CSV: {}", e);
logger::log_error(&msg).await;
return Ok(None);
}

if let Err(e) = tokio::fs::write(csv_path, &csv_bytes).await {
println!("Failed to save CSV file: {e}");
if let Err(e) = tokio::fs::write(&csv_path, &csv_bytes).await {
let msg = format!("Failed to save CSV file: {}", e);
logger::log_error(&msg).await;
return Ok(None);
}

Ok(Some(csv_path.to_string()))
let msg = format!("✓ ISIN/LEI CSV extracted: {:?}", csv_path);
logger::log_info(&msg).await;

Ok(Some(csv_path.to_string_lossy().to_string()))
}

/// Parse GLEIF filename and convert timestamp to DDMMYYYY format
/// Example: "isin-lei-20251124T080254.csv" -> "isin-lei-24112025.csv"
fn parse_gleif_filename(filename: &str) -> String {
// Try to find pattern: isin-lei-YYYYMMDDTHHMMSS.zip/csv
if let Some(start_idx) = filename.find("isin-lei-") {
let rest = &filename[start_idx + 9..]; // After "isin-lei-"

// Extract the 8 digits (YYYYMMDD)
if rest.len() >= 8 && rest[0..8].chars().all(|c| c.is_numeric()) {
let date_part = &rest[0..8];
// date_part is YYYYMMDD, convert to DDMMYYYY
if date_part.len() == 8 {
let year = &date_part[0..4];
let month = &date_part[4..6];
let day = &date_part[6..8];
let extension = if filename.ends_with(".zip") { ".zip" } else { ".csv" };
return format!("isin-lei-{}{}{}{}", day, month, year, extension);
}
}
}

// Fallback: return original filename if parsing fails
filename.to_string()
}
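The doc comment above already gives the intended mapping; as an illustration of the behavior (a sketch, not an existing test in this PR), the conversion could be checked like this:

```rust
#[cfg(test)]
mod gleif_filename_tests {
    use super::parse_gleif_filename;

    #[test]
    fn converts_gleif_timestamp_to_ddmmyyyy() {
        // Example taken from the function's doc comment.
        assert_eq!(
            parse_gleif_filename("isin-lei-20251124T080254.csv"),
            "isin-lei-24112025.csv"
        );
        // Names without the expected pattern fall back to the original string.
        assert_eq!(parse_gleif_filename("whatever.zip"), "whatever.zip");
    }
}
```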
@@ -1,20 +1,24 @@
// src/corporate/storage.rs
use super::{types::*, helpers::*};
use crate::config;
use crate::util::directories::DataPaths;
use crate::util::logger;

use tokio::fs;
use tokio::io::AsyncWriteExt;
use chrono::{Datelike, NaiveDate};
use std::collections::{HashMap, HashSet};
use std::path::{Path, PathBuf};
use std::collections::{HashMap};
use std::path::{PathBuf};

pub async fn load_existing_events() -> anyhow::Result<HashMap<String, CompanyEvent>> {
pub async fn load_existing_events(paths: &DataPaths) -> anyhow::Result<HashMap<String, CompanyEvent>> {
let mut map = HashMap::new();
let dir = std::path::Path::new("corporate_events");
let dir = paths.corporate_events_dir();
if !dir.exists() {
logger::log_info("Corporate Storage: No existing events directory found").await;
return Ok(map);
}

let mut entries = fs::read_dir(dir).await?;
let mut loaded_count = 0;
while let Some(entry) = entries.next_entry().await? {
let path = entry.path();
if path.extension().and_then(|s| s.to_str()) == Some("json") {
@@ -25,25 +29,32 @@ pub async fn load_existing_events() -> anyhow::Result<HashMap<String, CompanyEve
for event in events {
map.insert(event_key(&event), event);
}
loaded_count += 1;
}
}
}
logger::log_info(&format!("Corporate Storage: Loaded {} events from {} files", map.len(), loaded_count)).await;
Ok(map)
}

pub async fn save_optimized_events(events: HashMap<String, CompanyEvent>) -> anyhow::Result<()> {
let dir = std::path::Path::new("corporate_events");
pub async fn save_optimized_events(paths: &DataPaths, events: HashMap<String, CompanyEvent>) -> anyhow::Result<()> {
let dir = paths.corporate_events_dir();
fs::create_dir_all(dir).await?;

logger::log_info("Corporate Storage: Removing old event files...").await;
let mut removed_count = 0;
let mut entries = fs::read_dir(dir).await?;
while let Some(entry) = entries.next_entry().await? {
let path = entry.path();
let name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
if name.starts_with("events_") && path.extension().map(|e| e == "json").unwrap_or(false) {
fs::remove_file(&path).await?;
removed_count += 1;
}
}
logger::log_info(&format!("Corporate Storage: Removed {} old event files", removed_count)).await;

let total_events = events.len();
let mut sorted: Vec<_> = events.into_values().collect();
sorted.sort_by_key(|e| (e.ticker.clone(), e.date.clone()));

@@ -55,18 +66,26 @@ pub async fn save_optimized_events(events: HashMap<String, CompanyEvent>) -> any
}
}

let total_months = by_month.len();
for (month, list) in by_month {
let path = dir.join(format!("events_{}.json", month));
fs::write(&path, serde_json::to_string_pretty(&list)?).await?;
logger::log_info(&format!("Corporate Storage: Saved {} events for month {}", list.len(), month)).await;
}
logger::log_info(&format!("Corporate Storage: Saved {} total events in {} month files", total_events, total_months)).await;
Ok(())
}

pub async fn save_changes(changes: &[CompanyEventChange]) -> anyhow::Result<()> {
if changes.is_empty() { return Ok(()); }
let dir = std::path::Path::new("corporate_event_changes");
pub async fn save_changes(paths: &DataPaths, changes: &[CompanyEventChange]) -> anyhow::Result<()> {
if changes.is_empty() {
logger::log_info("Corporate Storage: No changes to save").await;
return Ok(());
}
let dir = paths.corporate_changes_dir();
fs::create_dir_all(dir).await?;

logger::log_info(&format!("Corporate Storage: Saving {} changes", changes.len())).await;

let mut by_month: HashMap<String, Vec<CompanyEventChange>> = HashMap::new();
for c in changes {
if let Ok(d) = NaiveDate::parse_from_str(&c.date, "%Y-%m-%d") {
@@ -81,14 +100,16 @@ pub async fn save_changes(changes: &[CompanyEventChange]) -> anyhow::Result<()>
let s = fs::read_to_string(&path).await?;
serde_json::from_str(&s).unwrap_or_default()
} else { vec![] };
all.extend(list);
all.extend(list.clone());
fs::write(&path, serde_json::to_string_pretty(&all)?).await?;
logger::log_info(&format!("Corporate Storage: Saved {} changes for month {}", list.len(), month)).await;
}
logger::log_info("Corporate Storage: All changes saved successfully").await;
Ok(())
}

pub async fn save_prices_for_ticker(ticker: &str, timeframe: &str, mut prices: Vec<CompanyPrice>) -> anyhow::Result<()> {
let base_dir = Path::new("corporate_prices");
pub async fn save_prices_for_ticker(paths: &DataPaths, ticker: &str, timeframe: &str, mut prices: Vec<CompanyPrice>) -> anyhow::Result<()> {
let base_dir = paths.corporate_prices_dir();
let company_dir = base_dir.join(ticker.replace(".", "_"));
let timeframe_dir = company_dir.join(timeframe);

@@ -102,35 +123,35 @@ pub async fn save_prices_for_ticker(ticker: &str, timeframe: &str, mut prices: V
Ok(())
}

pub fn get_company_dir(lei: &str) -> PathBuf {
PathBuf::from("corporate_prices").join(lei)
pub fn get_company_dir(paths: &DataPaths, lei: &str) -> PathBuf {
paths.corporate_prices_dir().join(lei)
}

pub async fn ensure_company_dirs(isin: &str) -> anyhow::Result<()> {
let base = get_company_dir(isin);
let paths = [
pub async fn ensure_company_dirs(paths: &DataPaths, isin: &str) -> anyhow::Result<()> {
let base = get_company_dir(paths, isin);
let paths_to_create = [
base.clone(),
base.join("5min"),
base.join("daily"),
base.join("aggregated").join("5min"),
base.join("aggregated").join("daily"),
];
for p in paths {
for p in paths_to_create {
fs::create_dir_all(&p).await?;
}
Ok(())
}

pub async fn save_available_exchanges(isin: &str, exchanges: Vec<AvailableExchange>) -> anyhow::Result<()> {
let dir = get_company_dir(isin);
pub async fn save_available_exchanges(paths: &DataPaths, isin: &str, exchanges: Vec<AvailableExchange>) -> anyhow::Result<()> {
let dir = get_company_dir(paths, isin);
fs::create_dir_all(&dir).await?;
let path = dir.join("available_exchanges.json");
fs::write(&path, serde_json::to_string_pretty(&exchanges)?).await?;
Ok(())
}

pub async fn load_available_exchanges(lei: &str) -> anyhow::Result<Vec<AvailableExchange>> {
let path = get_company_dir(lei).join("available_exchanges.json");
pub async fn load_available_exchanges(paths: &DataPaths, lei: &str) -> anyhow::Result<Vec<AvailableExchange>> {
let path = get_company_dir(paths, lei).join("available_exchanges.json");
if path.exists() {
let content = fs::read_to_string(&path).await?;
Ok(serde_json::from_str(&content)?)
@@ -140,13 +161,14 @@ pub async fn load_available_exchanges(lei: &str) -> anyhow::Result<Vec<Available
}

pub async fn save_prices_by_source(
paths: &DataPaths,
lei: &str,
source_ticker: &str,
timeframe: &str,
prices: Vec<CompanyPrice>,
) -> anyhow::Result<()> {
let source_safe = source_ticker.replace(".", "_").replace("/", "_");
let dir = get_company_dir(lei).join(timeframe).join(&source_safe);
let dir = get_company_dir(paths, lei).join(timeframe).join(&source_safe);
fs::create_dir_all(&dir).await?;
let path = dir.join("prices.json");
let mut prices = prices;
@@ -156,14 +178,15 @@ pub async fn save_prices_by_source(
}

/// Update available_exchanges.json with fetch results
pub async fn update_available_exchange(
/*pub async fn update_available_exchange(
paths: &DataPaths,
isin: &str,
ticker: &str,
exchange_mic: &str,
has_daily: bool,
has_5min: bool,
) -> anyhow::Result<()> {
let mut exchanges = load_available_exchanges(isin).await?;
let mut exchanges = load_available_exchanges(paths, isin).await?;

if let Some(entry) = exchanges.iter_mut().find(|e| e.ticker == ticker) {
// Update existing entry
@@ -181,39 +204,8 @@ pub async fn update_available_exchange(
exchanges.push(new_entry);
}

save_available_exchanges(isin, exchanges).await
}

/// Add a newly discovered exchange before fetching
///
/// # Arguments
/// * `isin` - The ISIN associated with the exchange.
/// * `figi_info` - The FigiInfo containing ticker, mic_code, and currency.
///
/// # Returns
/// Ok(()) on success.
///
/// # Errors
/// Returns an error if loading or saving available exchanges fails.
pub async fn add_discovered_exchange(
isin: &str,
figi_info: &FigiInfo,
) -> anyhow::Result<()> {
let mut exchanges = load_available_exchanges(isin).await?;

// Only add if not already present
if !exchanges.iter().any(|e| e.ticker == figi_info.ticker && e.exchange_mic == figi_info.mic_code) {
let new_entry = AvailableExchange::new(
figi_info.ticker.clone(),
figi_info.mic_code.clone(),
figi_info.currency.clone(),
);
exchanges.push(new_entry);
save_available_exchanges(isin, exchanges).await?;
}

Ok(())
}
save_available_exchanges(paths, isin, exchanges).await
}*/

/// Infer currency from ticker suffix
fn infer_currency_from_ticker(ticker: &str) -> String {
@@ -234,4 +226,42 @@ fn infer_currency_from_ticker(ticker: &str) -> String {
if ticker.ends_with(".BO") || ticker.ends_with(".NS") { return "INR".to_string(); }

"USD".to_string() // Default
}

/// Saves companies data to a JSONL file.
///
/// # Arguments
/// * `paths` - Reference to DataPaths for directory management
/// * `companies` - HashMap of company names to their securities (ISIN, Ticker pairs)
///
/// # Errors
/// Returns an error if file operations or serialization fails.
pub async fn save_companies_to_jsonl(
paths: &DataPaths,
companies: &HashMap<String, HashMap<String, String>>,
) -> anyhow::Result<()> {
let file_path = paths.data_dir().join("companies.jsonl");

logger::log_info(&format!("Corporate Storage: Saving {} companies to JSONL", companies.len())).await;

// Create parent directory if it doesn't exist
if let Some(parent) = file_path.parent() {
tokio::fs::create_dir_all(parent).await?;
}

let mut file = tokio::fs::File::create(&file_path).await?;

for (name, securities) in companies.iter() {
let line = serde_json::json!({
"name": name,
"securities": securities
});
file.write_all(line.to_string().as_bytes()).await?;
file.write_all(b"\n").await?;
}

let msg = format!("✓ Saved {} companies to {:?}", companies.len(), file_path);
println!("{}", msg);
logger::log_info(&msg).await;
Ok(())
}
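Each line written by `save_companies_to_jsonl` is one JSON object with a `name` field and a `securities` map (ISIN to ticker). A minimal sketch of reading that file back; the `CompanyLine` struct here is hypothetical and not part of this PR:

```rust
use serde::Deserialize;
use std::collections::HashMap;

// Hypothetical reader-side type; the PR only defines the writer above.
#[derive(Deserialize)]
struct CompanyLine {
    name: String,
    securities: HashMap<String, String>, // ISIN -> Ticker
}

fn read_companies_jsonl(path: &std::path::Path) -> anyhow::Result<Vec<CompanyLine>> {
    let text = std::fs::read_to_string(path)?;
    text.lines()
        .filter(|l| !l.trim().is_empty())
        .map(|l| serde_json::from_str::<CompanyLine>(l).map_err(Into::into))
        .collect()
}
```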
@@ -1,6 +1,5 @@
use std::collections::HashMap;

// src/corporate/types.rs
use std::collections::HashMap;
use serde::{Deserialize, Serialize};

#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
@@ -53,24 +52,19 @@ pub struct FigiInfo {
pub figi: String,
pub name: String,
pub ticker: String,
pub mic_code: String,
pub currency: String,
pub compositeFIGI: String,
pub securityType: String,
pub marketSector: String,
pub shareClassFIGI: String,
pub securityType2: String,
pub securityDescription: String,
}

/// Company Meta Data
/// # Attributes
/// * lei: Structuring the companies by legal dependencies [LEI -> Vec<ISIN>]
/// * figi: metadata with ISIN as key
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CompanyMetadata {
pub lei: String,
pub figi: Option<Vec<FigiInfo>>,
pub exch_code: String,
#[serde(rename = "compositeFIGI")]
pub composite_figi: String,
#[serde(rename = "securityType")]
pub security_type: String,
#[serde(rename = "marketSector")]
pub market_sector: String,
#[serde(rename = "shareClassFIGI")]
pub share_class_figi: String,
#[serde(rename = "securityType2")]
pub security_type2: String,
#[serde(rename = "securityDescription")]
pub security_description: String,
}

/// Company Info
@@ -85,6 +79,15 @@ pub struct CompanyInfo{
pub securities: HashMap<String, Vec<FigiInfo>>, // ISIN -> Vec<FigiInfo>
}

/// Company Meta Data
/// # Attributes
/// * lei: Structuring the companies by legal dependencies [LEI -> Vec<ISIN>]
/// * figi: metadata with ISIN as key
/*#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CompanyMetadata {
pub lei: String,
pub figi: Option<Vec<FigiInfo>>,
}*/

/// Warrant Info
///
@@ -115,13 +118,13 @@ pub struct OptionInfo {
pub options: HashMap<String, Vec<FigiInfo>>, // ISIN -> Vec<FigiInfo> (grouped by ISIN)
}

#[derive(Debug, Clone, Serialize, Deserialize)]
/*#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PrimaryInfo {
pub isin: String,
pub name: String,
pub exchange_mic: String,
pub currency: String,
}
}*/

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AvailableExchange {
@@ -136,28 +139,4 @@ pub struct AvailableExchange {
pub discovered_at: Option<String>, // When this exchange was first discovered
#[serde(default)]
pub fetch_count: u32, // How many times successfully fetched
}

impl AvailableExchange {
pub fn new(ticker: String, exchange_mic: String, currency: String) -> Self {
Self {
exchange_mic,
ticker,
has_daily: false,
has_5min: false,
last_successful_fetch: None,
currency,
discovered_at: Some(chrono::Local::now().format("%Y-%m-%d").to_string()),
fetch_count: 0,
}
}

pub fn record_success(&mut self, has_daily: bool, has_5min: bool) {
let today = chrono::Local::now().format("%Y-%m-%d").to_string();

self.has_daily |= has_daily;
self.has_5min |= has_5min;
self.last_successful_fetch = Some(today);
self.fetch_count += 1;
}
}
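A short illustration of the `new`/`record_success` lifecycle defined above (a sketch with example values, not an existing test in this PR):

```rust
#[test]
fn available_exchange_lifecycle() {
    let mut exchange = AvailableExchange::new(
        "AAPL".to_string(), // ticker (example value)
        "XNAS".to_string(), // exchange MIC
        "USD".to_string(),  // currency
    );
    // A successful daily fetch flips has_daily, stamps the date, and bumps the counter.
    exchange.record_success(true, false);
    assert!(exchange.has_daily && !exchange.has_5min);
    assert_eq!(exchange.fetch_count, 1);
    assert!(exchange.last_successful_fetch.is_some());
}
```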
@@ -1,6 +1,8 @@
// src/corporate/update.rs
use super::{scraper::*, storage::*, helpers::*, types::*, aggregation::*, openfigi::*};
use crate::config::Config;
use crate::util::directories::DataPaths;
use crate::util::logger;
use crate::scraper::webdriver::ChromeDriverPool;

use chrono::Local;
@@ -24,50 +26,109 @@ use std::sync::Arc;
/// # Errors
/// Returns an error if any step in the update process fails.
pub async fn run_full_update(config: &Config, pool: &Arc<ChromeDriverPool>) -> anyhow::Result<()> {
println!("=== Starting LEI-based corporate full update ===");
let msg = "=== Starting LEI-based corporate full update ===";
println!("{}", msg);
logger::log_info(msg).await;

// Initialize paths
let paths = DataPaths::new(".")?;

// 1. Load fresh GLEIF ISIN ↔ LEI mapping
logger::log_info("Corporate Update: Loading GLEIF ISIN ↔ LEI mapping...").await;
let lei_to_isins: HashMap<String, Vec<String>> = match load_isin_lei_csv().await {
Ok(map) => map,
Ok(map) => {
let msg = format!("Corporate Update: Loaded GLEIF mapping with {} LEI entries", map.len());
println!("{}", msg);
logger::log_info(&msg).await;
map
}
Err(e) => {
eprintln!("Warning: Could not load GLEIF ISIN↔LEI mapping: {}", e);
let msg = format!("Corporate Update: Warning - Could not load GLEIF ISIN↔LEI mapping: {}", e);
eprintln!("{}", msg);
logger::log_warn(&msg).await;
HashMap::new()
}
};

// 2. Load OpenFIGI mapping value lists (cached)
logger::log_info("Corporate Update: Loading OpenFIGI type lists...").await;
if let Err(e) = load_figi_type_lists().await {
eprintln!("Warning: Could not load OpenFIGI type lists: {}", e);
let msg = format!("Corporate Update: Warning - Could not load OpenFIGI type lists: {}", e);
eprintln!("{}", msg);
logger::log_warn(&msg).await;
}
logger::log_info("Corporate Update: OpenFIGI type lists loaded").await;

// 3. Build FIGI → LEI map
// # Attributes
// * lei: Structuring the companies by legal dependencies [LEI -> Vec<ISIN>]
// * figi: metadata with ISIN as key
let figi_to_lei:HashMap<String, Vec<FigiInfo>> = match build_lei_to_figi_infos(&lei_to_isins).await {
Ok(map) => map,
logger::log_info("Corporate Update: Building FIGI → LEI map...").await;
let figi_to_lei:HashMap<String, Vec<FigiInfo>> = match build_lei_to_figi_infos(&lei_to_isins, None).await {
Ok(map) => {
let msg = format!("Corporate Update: Built FIGI map with {} entries", map.len());
println!("{}", msg);
logger::log_info(&msg).await;
map
}
Err(e) => {
eprintln!("Warning: Could not build FIGI→LEI map: {}", e);
let msg = format!("Corporate Update: Warning - Could not build FIGI→LEI map: {}", e);
eprintln!("{}", msg);
logger::log_warn(&msg).await;
HashMap::new()
}
};

// 4. Load or build companies
let mut companies = load_or_build_all_securities(&figi_to_lei).await?;
println!("Processing {} companies", companies.0.len());
logger::log_info("Corporate Update: Loading/building company securities...").await;
let securities = load_or_build_all_securities(&figi_to_lei).await?;
let msg = format!("Corporate Update: Processing {} companies", securities.0.len());
println!("{}", msg);
logger::log_info(&msg).await;

// HashMap<Name, HashMap<ISIN, Ticker>> - unique pairs only
let companies: HashMap<String, HashMap<String, String>> = securities.0
.iter()
.fold(HashMap::new(), |mut acc, security| {
let mut isin_ticker_pairs: HashMap<String, String> = HashMap::new();

// Collect all unique ISIN-Ticker pairs
for figi_infos in security.1.securities.values() {
for figi_info in figi_infos {
if !figi_info.isin.is_empty() && !figi_info.ticker.is_empty() {
isin_ticker_pairs.insert(figi_info.isin.clone(), figi_info.ticker.clone());
}
}
}

// Only add if there are pairs
if !isin_ticker_pairs.is_empty() {
acc.insert(security.1.name.clone(), isin_ticker_pairs);
}
acc
});

logger::log_info(&format!("Corporate Update: Saving {} companies to JSONL", companies.len())).await;
save_companies_to_jsonl(&paths, &companies).await.expect("Failed to save companies List.");
logger::log_info("Corporate Update: Companies saved successfully").await;

// 5. Load existing earnings events (for change detection)
let today = Local::now().format("%Y-%m-%d").to_string();
let mut existing_events = match load_existing_events().await {
Ok(events) => events,
logger::log_info("Corporate Update: Loading existing events...").await;
let existing_events = match load_existing_events(&paths).await {
Ok(events) => {
let msg = format!("Corporate Update: Loaded {} existing events", events.len());
println!("{}", msg);
logger::log_info(&msg).await;
events
}
Err(e) => {
eprintln!("Warning: Could not load existing events: {}", e);
let msg = format!("Corporate Update: Warning - Could not load existing events: {}", e);
eprintln!("{}", msg);
logger::log_warn(&msg).await;
HashMap::new()
}
};

// 5. Use the provided pool (no need to create a new one)
let pool_size = pool.get_number_of_instances(); // Use the size from the shared pool
logger::log_info(&format!("Corporate Update: Using pool size: {}", pool_size)).await;

// Process companies in parallel using the shared pool
/*let results: Vec<_> = stream::iter(companies.into_iter())
@@ -88,10 +149,14 @@ pub async fn run_full_update(config: &Config, pool: &Arc<ChromeDriverPool>) -> a
}
}*/

save_optimized_events(existing_events).await?;
logger::log_info(&format!("Corporate Update: Saving {} events to optimized storage", existing_events.len())).await;
save_optimized_events(&paths, existing_events).await?;
logger::log_info("Corporate Update: Events saved successfully").await;
//save_changes(&all_changes).await?;

//println!("Corporate update complete — {} changes detected", all_changes.len());
let msg = "✓ Corporate update complete";
println!("{}", msg);
logger::log_info(msg).await;
Ok(())
}
@@ -7,39 +7,10 @@ const EXTRACTION_JS: &str = include_str!("extraction_script.js");

pub async fn goto_and_prepare(client: &Client) -> anyhow::Result<()> {
client.goto("https://www.finanzen.net/termine/wirtschaftsdaten/").await?;
//dismiss_overlays(client).await?;

/*if let Ok(tab) = client.find(fantoccini::Locator::Css(r#"div[data-sg-tab-item="teletrader-dates-three-stars"]"#)).await {
tab.click().await?;
println!("High importance tab selected");
sleep(Duration::from_secs(2)).await;
}*/
Ok(())
}

/*pub async fn dismiss_overlays(client: &Client) -> anyhow::Result<()> {
for _ in 0..10 {
let removed: bool = client
.execute(
r#"(() => {
const iframe = document.querySelector('iframe[title="Contentpass First Layer"]');
if (iframe && iframe.parentNode) {
iframe.parentNode.removeChild(iframe);
return true;
}
return false;
})()"#,
vec![],
)
.await?
.as_bool()
.unwrap_or(false);
if removed { break; }
sleep(Duration::from_millis(500)).await;
}
Ok(())
}*/

pub async fn set_date_range(client: &Client, start: &str, end: &str) -> anyhow::Result<()> {
let script = format!(
r#"
@@ -1,12 +1,14 @@
// src/economic/storage.rs
use super::types::*;
use super::helpers::*;
use crate::util::directories::DataPaths;
use crate::util::logger;
use tokio::fs;
use chrono::{NaiveDate, Datelike};
use std::collections::HashMap;

pub async fn scan_existing_chunks() -> anyhow::Result<Vec<ChunkInfo>> {
let dir = std::path::Path::new("data/economic/events");
pub async fn scan_existing_chunks(paths: &DataPaths) -> anyhow::Result<Vec<ChunkInfo>> {
let dir = paths.economic_events_dir();
let mut chunks = Vec::new();

if dir.exists() {
@@ -29,6 +31,7 @@ pub async fn scan_existing_chunks() -> anyhow::Result<Vec<ChunkInfo>> {
}
}
chunks.sort_by_key(|c| c.start_date.clone());
logger::log_info(&format!("Economic Storage: Scanned {} event chunks", chunks.len())).await;
Ok(chunks)
}

@@ -41,25 +44,28 @@ pub async fn load_existing_events(chunks: &[ChunkInfo]) -> anyhow::Result<HashMa
map.insert(event_key(&e), e);
}
}
logger::log_info(&format!("Economic Storage: Loaded {} events from {} chunks", map.len(), chunks.len())).await;
Ok(map)
}

pub async fn save_optimized_chunks(events: HashMap<String, EconomicEvent>) -> anyhow::Result<()> {
let dir = std::path::Path::new("data/economic/events");
pub async fn save_optimized_chunks(paths: &DataPaths, events: HashMap<String, EconomicEvent>) -> anyhow::Result<()> {
let dir = paths.economic_events_dir();
fs::create_dir_all(dir).await?;

// Delete all old chunk files to prevent duplicates and overlaps
println!("Removing old chunks...");
logger::log_info("Economic Storage: Removing old chunk files...").await;

let mut entries = fs::read_dir(dir).await?;
let mut removed_count = 0;
while let Some(entry) = entries.next_entry().await? {
let path = entry.path();
if let Some(name) = path.file_name().and_then(|n| n.to_str()) {
if name.starts_with("chunk_") && path.extension().map(|e| e == "json").unwrap_or(false) {
fs::remove_file(&path).await?;
removed_count += 1;
}
}
}
logger::log_info(&format!("Economic Storage: Removed {} old chunk files", removed_count)).await;

let mut sorted: Vec<_> = events.into_values().collect();
sorted.sort_by_key(|e| e.date.clone());
@@ -77,6 +83,7 @@ pub async fn save_optimized_chunks(events: HashMap<String, EconomicEvent>) -> an
if !chunk.is_empty() {
save_chunk(&chunk, dir).await?;
}
logger::log_info(&format!("Economic Storage: Saved all event chunks to {:?}", dir)).await;
Ok(())
}

@@ -85,14 +92,20 @@ async fn save_chunk(events: &[EconomicEvent], dir: &std::path::Path) -> anyhow::
let end = events.iter().map(|e| &e.date).max().unwrap().clone();
let path = dir.join(format!("chunk_{}_{}.json", start, end));
fs::write(&path, serde_json::to_string_pretty(events)?).await?;
logger::log_info(&format!("Economic Storage: Saved chunk {} - {} ({} events)", start, end, events.len())).await;
Ok(())
}

pub async fn save_changes(changes: &[EventChange]) -> anyhow::Result<()> {
if changes.is_empty() { return Ok(()); }
let dir = std::path::Path::new("economic_event_changes");
pub async fn save_changes(paths: &DataPaths, changes: &[EventChange]) -> anyhow::Result<()> {
if changes.is_empty() {
logger::log_info("Economic Storage: No changes to save").await;
return Ok(());
}
let dir = paths.economic_changes_dir();
fs::create_dir_all(dir).await?;

logger::log_info(&format!("Economic Storage: Saving {} changes to {:?}", changes.len(), dir)).await;

let mut by_month: HashMap<String, Vec<EventChange>> = HashMap::new();
for c in changes {
if let Ok(d) = NaiveDate::parse_from_str(&c.date, "%Y-%m-%d") {
@@ -107,8 +120,10 @@ pub async fn save_changes(changes: &[EventChange]) -> anyhow::Result<()> {
let s = fs::read_to_string(&path).await?;
serde_json::from_str(&s).unwrap_or_default()
} else { vec![] };
all.extend(list);
all.extend(list.clone());
fs::write(&path, serde_json::to_string_pretty(&all)?).await?;
logger::log_info(&format!("Economic Storage: Saved {} changes for month {}", list.len(), month)).await;
}
logger::log_info("Economic Storage: All changes saved successfully").await;
Ok(())
}
@@ -1,7 +1,6 @@
|
||||
// src/economic/update.rs
|
||||
use super::{scraper::*, storage::*, helpers::*, types::*};
|
||||
use crate::{config::Config, scraper::webdriver::ScrapeTask};
|
||||
use crate::scraper::webdriver::ChromeDriverPool;
|
||||
use crate::{config::Config, scraper::webdriver::{ScrapeTask, ChromeDriverPool}, util::directories::DataPaths, util::logger};
|
||||
use chrono::{Local};
|
||||
use std::sync::Arc;
|
||||
|
||||
@@ -14,38 +13,69 @@ use std::sync::Arc;
|
||||
/// # Errors
|
||||
/// Returns an error if scraping, loading, or saving fails.
|
||||
pub async fn run_full_update(config: &Config, pool: &Arc<ChromeDriverPool>) -> anyhow::Result<()> {
|
||||
let paths = DataPaths::new(".")?;
|
||||
|
||||
logger::log_info("Economic Update: Initializing...").await;
|
||||
|
||||
let today_str = chrono::Local::now().date_naive().format("%Y-%m-%d").to_string();
|
||||
let end_date = config.target_end_date();
|
||||
|
||||
let chunks = scan_existing_chunks().await?;
|
||||
logger::log_info(&format!("Economic Update: Scanning existing chunks from {:?}", paths.economic_events_dir())).await;
|
||||
let chunks = scan_existing_chunks(&paths).await?;
|
||||
let mut events = load_existing_events(&chunks).await?;
|
||||
println!("Loaded {} events from {} chunks", events.len(), chunks.len());
|
||||
|
||||
let msg = format!("Economic Update: Loaded {} events from {} chunks", events.len(), chunks.len());
|
||||
println!("{}", msg);
|
||||
logger::log_info(&msg).await;
|
||||
|
||||
let start_date = if events.is_empty() {
|
||||
logger::log_warn("Economic Update: No existing events found, starting from config date").await;
|
||||
config.economic_start_date.clone()
|
||||
} else if events.values().any(|e| e.date >= today_str) {
|
||||
logger::log_info("Economic Update: Events exist for today, starting from today").await;
|
||||
today_str.clone()
|
||||
} else {
|
||||
events.values()
|
||||
let next = events.values()
|
||||
.filter_map(|e| chrono::NaiveDate::parse_from_str(&e.date, "%Y-%m-%d").ok())
|
||||
.max()
|
||||
.and_then(|d| d.succ_opt())
|
||||
.map(|d| d.format("%Y-%m-%d").to_string())
|
||||
.unwrap_or(today_str.clone())
|
||||
.unwrap_or(today_str.clone());
|
||||
logger::log_info(&format!("Economic Update: Resuming from: {}", next)).await;
|
||||
next
|
||||
};
|
||||
|
||||
println!("Scraping economic events: {} → {}", start_date, end_date);
|
||||
let msg = format!("Economic Update: Scraping events from {} → {}", start_date, end_date);
|
||||
println!("{}", msg);
|
||||
logger::log_info(&msg).await;
|
||||
|
||||
// Pass the pool to the scraping function
|
||||
let new_events_all = scrape_all_economic_events(&start_date, &end_date, pool).await?;
|
||||
|
||||
let msg = format!("Economic Update: Scraped {} new events", new_events_all.len());
|
||||
println!("{}", msg);
|
||||
logger::log_info(&msg).await;
|
||||
|
||||
// Process all at once or in batches
|
||||
let result = process_batch(&new_events_all, &mut events, &today_str);
|
||||
let total_changes = result.changes.len();
|
||||
save_changes(&result.changes).await?;
|
||||
|
||||
let msg = format!("Economic Update: Detected {} changes", total_changes);
|
||||
println!("{}", msg);
|
||||
logger::log_info(&msg).await;
|
||||
|
||||
if total_changes > 0 {
|
||||
logger::log_info(&format!("Economic Update: Saving {} changes to log", total_changes)).await;
|
||||
save_changes(&paths, &result.changes).await?;
|
||||
logger::log_info("Economic Update: Changes saved successfully").await;
|
||||
}
|
||||
|
||||
save_optimized_chunks(events).await?;
|
||||
println!("Economic update complete — {} changes detected", total_changes);
|
||||
logger::log_info(&format!("Economic Update: Saving {} total events to chunks", events.len())).await;
|
||||
save_optimized_chunks(&paths, events).await?;
|
||||
|
||||
let msg = format!("✓ Economic update complete — {} changes detected", total_changes);
|
||||
println!("{}", msg);
|
||||
logger::log_info(&msg).await;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
15
src/lib.rs
Normal file
@@ -0,0 +1,15 @@
// src/lib.rs
//! Event Backtest Engine - Core Library
//!
//! Exposes all public modules for use in examples and tests

pub mod config;
pub mod scraper;
pub mod util;

// Re-export commonly used types for convenience
pub use config::Config;
pub use scraper::webdriver::{ChromeDriverPool, ChromeInstance, ScrapeTask};
pub use util::directories::DataPaths;
pub use util::logger;
pub use util::opnv;
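With these re-exports in place, an example binary can pull everything it needs from the library crate. A minimal sketch, assuming the crate name `event_backtest_engine` (as used elsewhere in this changeset) and a local ChromeDriver install; the file name `examples/pool_smoke.rs` is hypothetical:

// examples/pool_smoke.rs (hypothetical): exercises only the re-exported types
use anyhow::Result;
use event_backtest_engine::{ChromeDriverPool, DataPaths, logger};

#[tokio::main]
async fn main() -> Result<()> {
    let paths = DataPaths::new(".")?;                       // creates data/, cache/, logs/ on first use
    logger::init_debug_logger(paths.logs_dir()).await.ok(); // falls back to stdout if this fails
    let pool = ChromeDriverPool::new(1).await?;             // one chromedriver instance, no proxy
    logger::log_info(&format!("pool ready with {} instance(s)", pool.get_number_of_instances())).await;
    pool.shutdown().await?;
    Ok(())
}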
168
src/main.rs
@@ -1,43 +1,169 @@
|
||||
// src/main.rs
|
||||
mod economic;
|
||||
mod corporate;
|
||||
|
||||
mod config;
|
||||
mod corporate;
|
||||
mod economic;
|
||||
mod util;
|
||||
mod scraper;
|
||||
|
||||
use anyhow::Result;
|
||||
use config::Config;
|
||||
use scraper::docker_vpn_proxy::{DockerVpnProxyPool, cleanup_all_proxy_containers};
|
||||
use scraper::webdriver::ChromeDriverPool;
|
||||
use util::directories::DataPaths;
|
||||
use util::{logger, opnv};
|
||||
use std::sync::Arc;
|
||||
|
||||
/// The entry point of the application.
|
||||
///
|
||||
/// This function loads the configuration, initializes a shared ChromeDriver pool,
|
||||
/// and sequentially runs the full updates for corporate and economic data.
|
||||
/// Sequential execution helps prevent resource exhaustion from concurrent
|
||||
/// chromedriver instances and avoids spamming the target websites with too many requests.
|
||||
///
|
||||
/// # Errors
|
||||
///
|
||||
/// Returns an error if configuration loading fails, pool initialization fails,
|
||||
/// or if either update function encounters an issue (e.g., network errors,
|
||||
/// scraping failures, or chromedriver spawn failures like "program not found").
|
||||
/// Application entry point
|
||||
// src/main.rs
|
||||
|
||||
// ... existing imports ...
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> Result<()> {
|
||||
cleanup_all_proxy_containers().await.ok();
|
||||
|
||||
// Load configuration from .env
|
||||
let config = Config::load().map_err(|err| {
|
||||
println!("Failed to load Config .env: {}", err);
|
||||
eprintln!("Failed to load config: {}", err);
|
||||
err
|
||||
})?;
|
||||
|
||||
// Initialize the shared ChromeDriver pool once
|
||||
let pool_size = config.max_parallel_tasks;
|
||||
let pool = Arc::new(ChromeDriverPool::new(pool_size).await?);
|
||||
// Initialize paths and logger
|
||||
let paths = DataPaths::new(".")?;
|
||||
logger::init_debug_logger(paths.logs_dir()).await.ok();
|
||||
logger::log_info("=== Event Backtest Engine Started ===").await;
|
||||
logger::log_info(&format!(
|
||||
"Config → parallel_instances: {}, task_limit: {} vpn_rotation: {}",
|
||||
config.max_parallel_instances,
|
||||
config.max_tasks_per_instance,
|
||||
config.enable_vpn_rotation
|
||||
)).await;
|
||||
|
||||
// Run economic update first, passing the shared pool
|
||||
// === Step 1: Fetch fresh VPNBook credentials and .ovpn files (if rotation enabled) ===
|
||||
let proxy_pool: Option<Arc<DockerVpnProxyPool>> = if config.enable_vpn_rotation {
|
||||
logger::log_info("VPN Rotation Enabled — Fetching latest VPNBook configs").await;
|
||||
|
||||
// We only need 1 Chrome instance to scrape vpnbook.com (no proxy yet)
|
||||
let temp_pool = Arc::new(ChromeDriverPool::new(1).await?);
|
||||
let (username, password, _files) = opnv::fetch_vpnbook_configs(&temp_pool, paths.cache_dir()).await?;
|
||||
|
||||
logger::log_info(&format!("VPNBook credentials → User: {}", username)).await;
|
||||
|
||||
// Count how many distinct servers (subfolders) we have in cache/openvpn/
|
||||
let server_count = std::fs::read_dir(paths.cache_openvpn_dir())?
|
||||
.filter(|e| e.as_ref().unwrap().path().is_dir())
|
||||
.count();
|
||||
|
||||
if server_count == 0 {
|
||||
logger::log_warn("No VPN servers found — continuing without VPN").await;
|
||||
None
|
||||
} else {
|
||||
logger::log_info(&format!("Found {} VPN servers — starting Docker proxy containers", server_count)).await;
|
||||
|
||||
let pp = Arc::new(
|
||||
DockerVpnProxyPool::new(paths.cache_openvpn_dir(), username, password).await?
|
||||
);
|
||||
|
||||
// Verify all proxies are working before proceeding
|
||||
logger::log_info("Verifying all proxy connections...").await;
|
||||
let mut all_working = true;
|
||||
for i in 0..pp.num_proxies() {
|
||||
match pp.test_proxy_connection(i).await {
|
||||
Ok(ip) => {
|
||||
logger::log_info(&format!(" Proxy {}: working with IP: {}", i + 1, ip)).await;
|
||||
}
|
||||
Err(e) => {
|
||||
logger::log_error(&format!(" Proxy {}: FAILED - {}", i + 1, e)).await;
|
||||
all_working = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if !all_working {
|
||||
logger::log_warn("Some proxies failed, but continuing with working ones...").await;
|
||||
} else {
|
||||
logger::log_info("All proxies verified and ready!").await;
|
||||
}
|
||||
|
||||
logger::log_info(&format!("All {} Docker proxy containers started and ready", pp.num_proxies())).await;
|
||||
Some(pp)
|
||||
}
|
||||
} else {
|
||||
logger::log_info("VPN rotation disabled — using direct connection").await;
|
||||
None
|
||||
};
|
||||
|
||||
// === Step 2: Initialize the main ChromeDriver pool (with proxy if enabled) ===
|
||||
let pool_size = config.max_parallel_instances;
|
||||
let task_limit = config.max_tasks_per_instance;
|
||||
|
||||
logger::log_info(&format!("Creating ChromeDriver pool with {} instances...", pool_size)).await;
|
||||
|
||||
let pool = Arc::new(
|
||||
if task_limit > 0 {
|
||||
ChromeDriverPool::new_with_proxy_and_task_limit(pool_size, proxy_pool.clone(), task_limit).await?
|
||||
} else {
|
||||
ChromeDriverPool::new_with_proxy(pool_size, proxy_pool.clone()).await?
|
||||
}
|
||||
);
|
||||
|
||||
logger::log_info(&format!("ChromeDriver pool ready with {} instances", pool_size)).await;
|
||||
|
||||
// === Step 3: Graceful Ctrl+C handler ===
|
||||
{
|
||||
let pool_clone = Arc::clone(&pool);
|
||||
let proxy_clone = proxy_pool.clone();
|
||||
|
||||
tokio::spawn(async move {
|
||||
tokio::signal::ctrl_c().await.ok();
|
||||
|
||||
logger::log_info("Ctrl+C received — shutting down gracefully...").await;
|
||||
|
||||
// Now works: &*pool_clone derefs Arc → &ChromeDriverPool
|
||||
if let Err(e) = (&*pool_clone).shutdown().await {
|
||||
logger::log_error(&format!("Error during pool shutdown: {}", e)).await;
|
||||
}
|
||||
|
||||
if let Some(pp) = proxy_clone {
|
||||
if let Err(e) = pp.shutdown().await {
|
||||
logger::log_warn(&format!("Failed to stop Docker containers: {}", e)).await;
|
||||
} else {
|
||||
logger::log_info("All Docker VPN containers stopped").await;
|
||||
}
|
||||
}
|
||||
|
||||
let _ = cleanup_all_proxy_containers().await;
|
||||
|
||||
std::process::exit(0);
|
||||
});
|
||||
}
|
||||
|
||||
// === Step 4: Run the actual scraping jobs ===
|
||||
logger::log_info("--- Starting ECONOMIC data update ---").await;
|
||||
economic::run_full_update(&config, &pool).await?;
|
||||
logger::log_info("Economic update completed").await;
|
||||
|
||||
// Then run corporate update, passing the shared pool
|
||||
logger::log_info("--- Starting CORPORATE data update ---").await;
|
||||
corporate::run_full_update(&config, &pool).await?;
|
||||
logger::log_info("Corporate update completed").await;
|
||||
|
||||
// === Step 5: Final cleanup ===
|
||||
logger::log_info("Shutting down ChromeDriver pool...").await;
|
||||
pool.shutdown().await?;
|
||||
|
||||
if let Some(pp) = proxy_pool {
|
||||
logger::log_info("Stopping Docker VPN proxy containers...").await;
|
||||
pp.shutdown().await?;
|
||||
// CLEANUP ANY LEFTOVER CONTAINERS FROM PREVIOUS RUNS
|
||||
cleanup_all_proxy_containers().await.ok();
|
||||
}
|
||||
|
||||
logger::log_info("=== Application finished successfully ===").await;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
memory allocation of 4294967296 bytes failed
|
||||
error: process didn't exit successfully: `target\debug\event_backtest_engine.exe` (exit code: 0xc0000409, STATUS_STACK_BUFFER_OVERRUN)
|
||||
*/
|
||||
407
src/scraper/docker_vpn_proxy.rs
Normal file
@@ -0,0 +1,407 @@
|
||||
use anyhow::{anyhow, Context, Result};
|
||||
use futures::future::join_all;
|
||||
use std::{path::{Path, PathBuf}, time::Duration};
|
||||
use tokio::{process::Command, time::{sleep}};
|
||||
use walkdir::WalkDir;
|
||||
|
||||
pub struct DockerVpnProxyPool {
|
||||
container_names: Vec<String>,
|
||||
proxy_ports: Vec<u16>, // e.g., [10801, 10802, ...]
|
||||
}
|
||||
|
||||
impl DockerVpnProxyPool {
|
||||
pub async fn new(ovpn_dir: &Path, username: String, password: String) -> Result<Self> {
|
||||
// Count hostnames (subdirs in ovpn_dir)
|
||||
let hostnames: Vec<_> = std::fs::read_dir(ovpn_dir)?
|
||||
.filter_map(Result::ok)
|
||||
.filter(|e| e.path().is_dir())
|
||||
.map(|e| e.file_name().into_string().unwrap())
|
||||
.collect();
|
||||
|
||||
let num_servers = hostnames.len();
|
||||
if num_servers == 0 {
|
||||
return Err(anyhow!("No VPN hostnames found in {:?}", ovpn_dir));
|
||||
}
|
||||
|
||||
crate::util::logger::log_info(&format!("Found {} VPN hostnames", num_servers)).await;
|
||||
|
||||
let mut container_names = Vec::with_capacity(num_servers);
|
||||
let mut proxy_ports = Vec::with_capacity(num_servers);
|
||||
let base_port: u16 = 10800;
|
||||
|
||||
// === STEP 1: Start ALL containers first ===
|
||||
for (i, hostname) in hostnames.iter().enumerate() {
|
||||
// Pick tcp443.ovpn if exists, else first .ovpn
|
||||
let hostname_dir = ovpn_dir.join(hostname);
|
||||
let mut ovpn_path: Option<PathBuf> = None;
|
||||
for entry in WalkDir::new(&hostname_dir).max_depth(1) {
|
||||
let entry = entry?;
|
||||
if entry.path().extension().map_or(false, |ext| ext == "ovpn") {
|
||||
if entry.file_name().to_str().unwrap_or("").contains("tcp443") {
|
||||
ovpn_path = Some(entry.path().to_path_buf());
|
||||
break;
|
||||
} else if ovpn_path.is_none() {
|
||||
ovpn_path = Some(entry.path().to_path_buf());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let ovpn_path = ovpn_path.ok_or_else(|| anyhow!("No .ovpn found for {}", hostname))?;
|
||||
|
||||
let name = format!("vpn-proxy-{}", i);
|
||||
let port = base_port + i as u16 + 1;
|
||||
|
||||
// Clean up any existing container with the same name
|
||||
let _ = Command::new("docker")
|
||||
.args(["rm", "-f", &name])
|
||||
.status()
|
||||
.await;
|
||||
|
||||
// Run Docker container
|
||||
let status = Command::new("docker")
|
||||
.args([
|
||||
"run", "-d",
|
||||
"--name", &name,
|
||||
"--cap-add=NET_ADMIN",
|
||||
"--device", "/dev/net/tun",
|
||||
"--sysctl", "net.ipv4.ip_forward=1",
|
||||
"-v", &format!("{}:/vpn/config.ovpn", ovpn_path.display()),
|
||||
"-e", &format!("VPN_USERNAME={}", username),
|
||||
"-e", &format!("VPN_PASSWORD={}", password),
|
||||
"-p", &format!("{}:1080", port),
|
||||
"rust-vpn-proxy",
|
||||
])
|
||||
.status()
|
||||
.await
|
||||
.context("Failed to run Docker")?;
|
||||
|
||||
if !status.success() {
|
||||
return Err(anyhow!("Docker run failed for {}", name));
|
||||
}
|
||||
|
||||
crate::util::logger::log_info(&format!("Started container {} on port {} (waiting for VPN...)", name, port)).await;
|
||||
|
||||
container_names.push(name);
|
||||
proxy_ports.push(port);
|
||||
}
|
||||
|
||||
// Brief pause to let containers start
|
||||
sleep(Duration::from_secs(8)).await;
|
||||
crate::util::logger::log_info(&format!("All {} containers started, beginning health checks...", container_names.len())).await;
|
||||
|
||||
// === STEP 2: Test ALL proxies in parallel with 10-second intervals ===
|
||||
let results = Self::test_all_proxies_parallel(&container_names, &proxy_ports).await;
|
||||
|
||||
// Filter out failed containers
|
||||
let mut working_containers = Vec::new();
|
||||
let mut working_ports = Vec::new();
|
||||
let mut failed_count = 0;
|
||||
|
||||
for (i, (container_name, port)) in container_names.into_iter().zip(proxy_ports.into_iter()).enumerate() {
|
||||
match &results[i] {
|
||||
Ok(Some(ip)) => {
|
||||
crate::util::logger::log_info(&format!("✓ Container {} on port {} ready with IP: {}",
|
||||
container_name, port, ip)).await;
|
||||
working_containers.push(container_name);
|
||||
working_ports.push(port);
|
||||
}
|
||||
Ok(None) => {
|
||||
crate::util::logger::log_warn(&format!("✓ Container {} on port {} ready but IP detection failed",
|
||||
container_name, port)).await;
|
||||
working_containers.push(container_name);
|
||||
working_ports.push(port);
|
||||
}
|
||||
Err(e) => {
|
||||
// Get container logs to debug
|
||||
let logs = Command::new("docker")
|
||||
.args(["logs", "--tail", "20", &container_name])
|
||||
.output()
|
||||
.await
|
||||
.ok()
|
||||
.and_then(|output| String::from_utf8_lossy(&output.stdout).to_string().into());
|
||||
|
||||
crate::util::logger::log_error(&format!("✗ Container {} on port {} failed: {}. Logs: {:?}",
|
||||
container_name, port, e, logs)).await;
|
||||
failed_count += 1;
|
||||
// Clean up failed container
|
||||
let _ = Self::cleanup_container(&container_name).await;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if working_containers.is_empty() {
|
||||
return Err(anyhow!("All {} VPN proxy containers failed to start", num_servers));
|
||||
}
|
||||
|
||||
crate::util::logger::log_info(&format!("Started {}/{} VPN proxy containers successfully",
|
||||
working_containers.len(), num_servers)).await;
|
||||
|
||||
if failed_count > 0 {
|
||||
crate::util::logger::log_warn(&format!("{} containers failed and were cleaned up", failed_count)).await;
|
||||
}
|
||||
|
||||
Ok(Self {
|
||||
container_names: working_containers,
|
||||
proxy_ports: working_ports,
|
||||
})
|
||||
}
|
||||
|
||||
/// Test all proxies in parallel with 10-second intervals between tests
|
||||
async fn test_all_proxies_parallel(container_names: &[String], proxy_ports: &[u16]) -> Vec<Result<Option<String>>> {
|
||||
let mut tasks = Vec::new();
|
||||
|
||||
for (i, (container_name, port)) in container_names.iter().zip(proxy_ports.iter()).enumerate() {
|
||||
let name = container_name.clone();
|
||||
let port = *port;
|
||||
|
||||
tasks.push(tokio::spawn(async move {
|
||||
// Try up to 6 times with 10-second intervals (total 60 seconds)
|
||||
for attempt in 1..=6 {
|
||||
crate::util::logger::log_info(&format!("Testing proxy {} (port {}) - Attempt {}/6",
|
||||
name, port, attempt)).await;
|
||||
|
||||
match Self::test_single_proxy(port).await {
|
||||
Ok(Some(ip)) => {
|
||||
return Ok(Some(ip));
|
||||
}
|
||||
Ok(None) => {
|
||||
// Connection works but IP detection failed
|
||||
return Ok(None);
|
||||
}
|
||||
Err(e) if attempt < 6 => {
|
||||
crate::util::logger::log_info(&format!("Attempt {}/6 for {}: {} - retrying in 10s",
|
||||
attempt, name, e)).await;
|
||||
sleep(Duration::from_secs(10)).await;
|
||||
}
|
||||
Err(e) => {
|
||||
return Err(anyhow!("Failed after 6 attempts: {}", e));
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(anyhow!("Unexpected exit from retry loop"))
|
||||
}));
|
||||
}
|
||||
|
||||
// Wait for all tasks to complete
|
||||
join_all(tasks)
|
||||
.await
|
||||
.into_iter()
|
||||
.map(|result| match result {
|
||||
Ok(inner) => inner,
|
||||
Err(e) => Err(anyhow!("Task panicked: {}", e)),
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Test a single proxy connection
|
||||
async fn test_single_proxy(port: u16) -> Result<Option<String>> {
|
||||
use std::io::{Read, Write};
|
||||
use std::net::TcpStream;
|
||||
use std::time::Duration as StdDuration;
|
||||
|
||||
// First, test SOCKS5 handshake directly
|
||||
crate::util::logger::log_info(&format!("Testing SOCKS5 handshake on port {}...", port)).await;
|
||||
|
||||
// Use spawn_blocking for synchronous I/O
|
||||
let test_result = tokio::task::spawn_blocking(move || {
|
||||
// Connect to SOCKS5 proxy
|
||||
let mut stream = match TcpStream::connect_timeout(
|
||||
&format!("127.0.0.1:{}", port).parse().unwrap(),
|
||||
StdDuration::from_secs(5)
|
||||
) {
|
||||
Ok(stream) => stream,
|
||||
Err(e) => return Err(anyhow!("Failed to connect: {}", e)),
|
||||
};
|
||||
|
||||
// Send SOCKS5 greeting: version 5, 1 method (no auth)
|
||||
let greeting: [u8; 3] = [0x05, 0x01, 0x00]; // SOCKS5, 1 method, no auth
|
||||
if let Err(e) = stream.write_all(&greeting) {
|
||||
return Err(anyhow!("Failed to send greeting: {}", e));
|
||||
}
|
||||
|
||||
// Read response
|
||||
let mut response = [0u8; 2];
|
||||
if let Err(e) = stream.read_exact(&mut response) {
|
||||
return Err(anyhow!("Failed to read response: {}", e));
|
||||
}
|
||||
|
||||
// Check response: should be [0x05, 0x00] for no auth required
|
||||
if response[0] != 0x05 || response[1] != 0x00 {
|
||||
return Err(anyhow!("Unexpected SOCKS5 response: {:?}", response));
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}).await;
|
||||
|
||||
match test_result {
|
||||
Ok(Ok(())) => {
|
||||
crate::util::logger::log_info(&format!("✓ SOCKS5 proxy on port {} accepts connections", port)).await;
|
||||
|
||||
// Try to get IP through proxy using curl (fallback method)
|
||||
let curl_result = tokio::process::Command::new("curl")
|
||||
.args([
|
||||
"-s",
|
||||
"--socks5", &format!("localhost:{}", port),
|
||||
"--max-time", "10",
|
||||
"https://checkip.amazonaws.com"
|
||||
])
|
||||
.output()
|
||||
.await;
|
||||
|
||||
match curl_result {
|
||||
Ok(output) if output.status.success() => {
|
||||
let ip = String::from_utf8_lossy(&output.stdout).trim().to_string();
|
||||
if Self::is_valid_ip(&ip) {
|
||||
crate::util::logger::log_info(&format!("✓ Got IP via proxy: {}", ip)).await;
|
||||
return Ok(Some(ip));
|
||||
} else {
|
||||
crate::util::logger::log_info(&format!("✓ Proxy works, invalid IP format: {}", ip)).await;
|
||||
return Ok(None);
|
||||
}
|
||||
}
|
||||
_ => {
|
||||
// Proxy accepts connections but curl failed - still acceptable
|
||||
crate::util::logger::log_info(&format!("✓ Proxy accepts connections (curl test failed)")).await;
|
||||
return Ok(None);
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(Err(e)) => {
|
||||
return Err(anyhow!("SOCKS5 test failed: {}", e));
|
||||
}
|
||||
Err(e) => {
|
||||
return Err(anyhow!("Task failed: {}", e));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Clean up a failed container
|
||||
async fn cleanup_container(container_name: &str) -> Result<()> {
|
||||
let _ = Command::new("docker")
|
||||
.args(["stop", container_name])
|
||||
.status()
|
||||
.await;
|
||||
|
||||
let _ = Command::new("docker")
|
||||
.args(["rm", container_name])
|
||||
.status()
|
||||
.await;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn is_valid_ip(ip: &str) -> bool {
|
||||
let parts: Vec<&str> = ip.split('.').collect();
|
||||
if parts.len() != 4 {
|
||||
return false;
|
||||
}
|
||||
|
||||
for part in parts {
|
||||
if let Ok(num) = part.parse::<u8>() {
|
||||
if part != num.to_string() {
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
true
|
||||
}
|
||||
|
||||
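The hand-rolled octet check above mirrors what the standard library's IPv4 parser already enforces (four dot-separated octets, no leading zeros), so a roughly equivalent check could lean on std instead. This sketch is not part of the diff:

// Roughly equivalent validation using std (sketch, not in the diff):
fn is_valid_ip_std(ip: &str) -> bool {
    ip.parse::<std::net::Ipv4Addr>().is_ok()
}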
/// Test if a specific proxy is working
|
||||
pub async fn test_proxy_connection(&self, index: usize) -> Result<String> {
|
||||
let port = self.proxy_ports[index];
|
||||
let proxy_url = format!("socks5://localhost:{}", port);
|
||||
|
||||
let client = reqwest::Client::builder()
|
||||
.proxy(reqwest::Proxy::all(&proxy_url)?)
|
||||
.timeout(Duration::from_secs(10))
|
||||
.build()?;
|
||||
|
||||
let response = client.get("http://checkip.amazonaws.com")
|
||||
.send()
|
||||
.await?
|
||||
.text()
|
||||
.await?;
|
||||
|
||||
Ok(response.trim().to_string())
|
||||
}
|
||||
|
||||
pub fn get_proxy_url(&self, index: usize) -> String {
|
||||
let port = self.proxy_ports[index % self.proxy_ports.len()];
|
||||
format!("socks5://localhost:{}", port)
|
||||
}
|
||||
|
||||
pub fn num_proxies(&self) -> usize {
|
||||
self.proxy_ports.len()
|
||||
}
|
||||
|
||||
pub async fn shutdown(&self) -> Result<()> {
|
||||
crate::util::logger::log_info(&format!("Shutting down {} Docker proxy containers...",
|
||||
self.container_names.len())).await;
|
||||
|
||||
for name in &self.container_names {
|
||||
let _ = Command::new("docker")
|
||||
.args(["stop", name])
|
||||
.status()
|
||||
.await;
|
||||
let _ = Command::new("docker")
|
||||
.args(["rm", name])
|
||||
.status()
|
||||
.await;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn cleanup_all_proxy_containers() -> Result<()> {
|
||||
// Step 1: List all container IDs that match our pattern
|
||||
let output = Command::new("docker")
|
||||
.args(["ps", "-a", "--format", "{{.ID}} {{.Names}} {{.Image}}"])
|
||||
.output()
|
||||
.await?;
|
||||
|
||||
let stdout = String::from_utf8_lossy(&output.stdout);
|
||||
|
||||
let mut containers_to_kill = Vec::new();
|
||||
|
||||
for line in stdout.lines() {
|
||||
let parts: Vec<&str> = line.split_whitespace().collect();
|
||||
if parts.len() >= 2 {
|
||||
let name_or_id = parts[0];
|
||||
let name = parts[1];
|
||||
let image = if parts.len() >= 3 { parts[2] } else { "" };
|
||||
|
||||
// Match by name prefix OR by image name
|
||||
if name.starts_with("vpn-proxy-") || image.contains("rust-vpn-proxy") {
|
||||
containers_to_kill.push(name_or_id.to_string());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if containers_to_kill.is_empty() {
|
||||
crate::util::logger::log_info("No old rust-vpn-proxy containers found").await;
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// Step 2: Kill and remove them all at once
|
||||
let status = Command::new("docker")
|
||||
.arg("rm")
|
||||
.arg("-f")
|
||||
.args(&containers_to_kill)
|
||||
.status()
|
||||
.await?;
|
||||
|
||||
if status.success() {
|
||||
crate::util::logger::log_info(&format!(
|
||||
"Successfully removed {} old rust-vpn-proxy container(s)",
|
||||
containers_to_kill.len()
|
||||
))
|
||||
.await;
|
||||
} else {
|
||||
crate::util::logger::log_warn("Some containers may still remain (non-critical)").await;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -1 +1,2 @@
pub mod webdriver;
pub mod docker_vpn_proxy;
@@ -3,212 +3,269 @@
|
||||
use anyhow::{anyhow, Context, Result};
|
||||
use fantoccini::{Client, ClientBuilder};
|
||||
use serde_json::{Map, Value};
|
||||
use std::pin::Pin;
|
||||
use std::process::Stdio;
|
||||
use std::sync::Arc;
|
||||
use tokio::io::{AsyncBufReadExt, BufReader};
|
||||
use tokio::process::{Child, Command};
|
||||
use tokio::task::JoinHandle;
|
||||
use tokio::sync::{Mutex, Semaphore};
|
||||
use tokio::time::{Duration, sleep, timeout};
|
||||
use std::pin::Pin;
|
||||
use tokio::time::{sleep, timeout, Duration};
|
||||
use crate::scraper::docker_vpn_proxy::{DockerVpnProxyPool};
|
||||
|
||||
/// Manages a pool of ChromeDriver instances for parallel scraping.
|
||||
///
|
||||
/// This struct maintains multiple ChromeDriver processes and allows controlled
|
||||
/// concurrent access via a semaphore. Instances are reused across tasks to avoid
|
||||
/// the overhead of spawning new processes.
|
||||
/// Manages a pool of ChromeDriver instances for parallel scraping with optional VPN binding.
|
||||
pub struct ChromeDriverPool {
|
||||
instances: Vec<Arc<Mutex<ChromeInstance>>>,
|
||||
semaphore: Arc<Semaphore>,
|
||||
/// Optional Docker-based proxy pool (one proxy per Chrome instance)
|
||||
proxy_pool: Option<Arc<DockerVpnProxyPool>>,
|
||||
}
|
||||
|
||||
impl ChromeDriverPool {
|
||||
/// Creates a new pool with the specified number of ChromeDriver instances.
|
||||
///
|
||||
/// # Arguments
|
||||
/// * `pool_size` - Number of concurrent ChromeDriver instances to maintain
|
||||
/// Creates a new pool without any proxy (direct connection).
|
||||
pub async fn new(pool_size: usize) -> Result<Self> {
|
||||
Self::new_with_proxy_and_task_limit(pool_size, None, 0).await
|
||||
}
|
||||
|
||||
/// Creates a new pool with task-per-instance limit but no proxy.
|
||||
pub async fn new_with_task_limit(pool_size: usize, max_tasks_per_instance: usize) -> Result<Self> {
|
||||
Self::new_with_proxy_and_task_limit(pool_size, None, max_tasks_per_instance).await
|
||||
}
|
||||
|
||||
/// Creates a new pool where each Chrome instance uses a different SOCKS5 proxy from the Docker pool.
|
||||
pub async fn new_with_proxy(
|
||||
pool_size: usize,
|
||||
proxy_pool: Option<Arc<DockerVpnProxyPool>>,
|
||||
) -> Result<Self> {
|
||||
Self::new_with_proxy_and_task_limit(pool_size, proxy_pool, 0).await
|
||||
}
|
||||
|
||||
/// Full constructor: supports proxy + task limiting.
|
||||
pub async fn new_with_proxy_and_task_limit(
|
||||
pool_size: usize,
|
||||
proxy_pool: Option<Arc<DockerVpnProxyPool>>,
|
||||
max_tasks_per_instance: usize,
|
||||
) -> Result<Self> {
|
||||
let mut instances = Vec::with_capacity(pool_size);
|
||||
|
||||
println!("Initializing ChromeDriver pool with {} instances...", pool_size);
|
||||
|
||||
|
||||
crate::util::logger::log_info(&format!(
|
||||
"Initializing ChromeDriver pool with {} instances{}...",
|
||||
pool_size,
|
||||
if proxy_pool.is_some() { " (each using a unique Docker SOCKS5 proxy)" } else { "" }
|
||||
))
|
||||
.await;
|
||||
|
||||
for i in 0..pool_size {
|
||||
match ChromeInstance::new().await {
|
||||
Ok(instance) => {
|
||||
println!(" ✓ Instance {} ready", i + 1);
|
||||
instances.push(Arc::new(Mutex::new(instance)));
|
||||
}
|
||||
Err(e) => {
|
||||
eprintln!(" ✗ Failed to create instance {}: {}", i + 1, e);
|
||||
// Clean up already created instances
|
||||
drop(instances);
|
||||
return Err(e);
|
||||
}
|
||||
}
|
||||
let proxy_url = proxy_pool
|
||||
.as_ref()
|
||||
.map(|pp| pp.get_proxy_url(i));
|
||||
|
||||
let instance = ChromeInstance::new(proxy_url, max_tasks_per_instance).await?;
|
||||
|
||||
crate::util::logger::log_info(&format!(" Instance {} ready", i + 1)).await;
|
||||
instances.push(Arc::new(Mutex::new(instance)));
|
||||
}
|
||||
|
||||
|
||||
Ok(Self {
|
||||
instances,
|
||||
semaphore: Arc::new(Semaphore::new(pool_size)),
|
||||
proxy_pool,
|
||||
})
|
||||
}
|
||||
|
||||
/// Executes a scrape task using an available instance from the pool.
|
||||
/// Execute a scraping task using an available instance from the pool.
|
||||
pub async fn execute<T, F, Fut>(&self, url: String, parse: F) -> Result<T>
|
||||
where
|
||||
T: Send + 'static,
|
||||
F: FnOnce(Client) -> Fut + Send + 'static,
|
||||
Fut: std::future::Future<Output = Result<T>> + Send + 'static,
|
||||
Fut: std::future::Future<Output = Result<T>> + Send,
|
||||
{
|
||||
// Acquire semaphore permit
|
||||
let _permit = self.semaphore.acquire().await
|
||||
.map_err(|_| anyhow!("Semaphore closed"))?;
|
||||
let _permit = self.semaphore.acquire().await.map_err(|_| anyhow!("Pool closed"))?;
|
||||
|
||||
// Find an available instance (round-robin or first available)
|
||||
let instance = self.instances[0].clone(); // Simple: use first, could be round-robin
|
||||
// Pick a random instance (not strict round-robin)
let index = rand::random_range(0..self.instances.len());
|
||||
let instance = self.instances[index].clone();
|
||||
let mut guard = instance.lock().await;
|
||||
|
||||
// Create a new session for this task
|
||||
let client = guard.new_session().await?;
|
||||
|
||||
// Release lock while we do the actual scraping
|
||||
drop(guard);
|
||||
guard.increment_task_count();
|
||||
|
||||
// Navigate and parse
|
||||
client.goto(&url).await.context("Failed to navigate")?;
|
||||
let result = timeout(Duration::from_secs(60), parse(client))
|
||||
if guard.max_tasks_per_instance > 0 {
|
||||
crate::util::logger::log_info(&format!(
|
||||
"Instance task count: {}/{}",
|
||||
guard.get_task_count(),
|
||||
guard.max_tasks_per_instance
|
||||
))
|
||||
.await;
|
||||
}
|
||||
|
||||
let client = guard.new_session().await?;
|
||||
|
||||
drop(guard); // release lock early
|
||||
|
||||
crate::util::logger::log_info(&format!("Scraping {} ...", url)).await;
|
||||
client.goto(&url).await.context("Navigation failed")?;
|
||||
|
||||
let result = timeout(Duration::from_secs(90), parse(client))
|
||||
.await
|
||||
.context("Parse function timed out after 60s")??;
|
||||
.context("Parse timeout")??;
|
||||
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
pub fn get_number_of_instances (&self) -> usize {
|
||||
self.instances.len()
|
||||
/// Gracefully shut down all ChromeDriver processes and Docker proxy containers.
|
||||
pub async fn shutdown(&self) -> Result<()> {
|
||||
for inst in &self.instances {
|
||||
let mut guard = inst.lock().await;
|
||||
guard.shutdown().await?;
|
||||
}
|
||||
|
||||
if let Some(pp) = &self.proxy_pool {
|
||||
pp.shutdown().await?;
|
||||
crate::util::logger::log_info("All Docker VPN proxy containers stopped").await;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn get_number_of_instances(&self) -> usize {
|
||||
self.instances.len()
|
||||
}
|
||||
}
|
||||
|
||||
/// Represents a single instance of chromedriver process.
|
||||
/// Represents a single instance of chromedriver process, optionally bound to a VPN.
|
||||
pub struct ChromeInstance {
|
||||
process: Child,
|
||||
base_url: String,
|
||||
process: Child,
|
||||
stderr_log: Option<JoinHandle<()>>,
|
||||
task_count: usize,
|
||||
max_tasks_per_instance: usize,
|
||||
proxy_url: Option<String>,
|
||||
}
|
||||
|
||||
impl ChromeInstance {
|
||||
/// Creates a new ChromeInstance by spawning chromedriver with random port.
|
||||
///
|
||||
/// This spawns `chromedriver --port=0` to avoid port conflicts, reads stdout to extract
|
||||
/// the listening address, and waits for the success message. If timeout occurs or
|
||||
/// spawning fails, returns an error with context.
|
||||
///
|
||||
/// # Errors
|
||||
///
|
||||
/// Returns an error if chromedriver fails to spawn (e.g., not in PATH, version mismatch),
|
||||
/// if the process exits early, or if the address/success message isn't found within 30s.
|
||||
pub async fn new() -> Result<Self> {
|
||||
let mut command = Command::new("chromedriver-win64/chromedriver.exe");
|
||||
command
|
||||
.arg("--port=0") // Use random available port to support pooling
|
||||
pub async fn new(proxy_url: Option<String>, max_tasks_per_instance: usize) -> Result<Self> {
|
||||
let (base_url, process, stderr_handle) = Self::spawn_chromedriver().await?;
|
||||
|
||||
Ok(Self {
|
||||
base_url,
|
||||
process,
|
||||
stderr_log: Some(stderr_handle),
|
||||
task_count: 0,
|
||||
max_tasks_per_instance,
|
||||
proxy_url,
|
||||
})
|
||||
}
|
||||
|
||||
pub async fn new_session(&self) -> Result<Client> {
|
||||
ClientBuilder::native()
|
||||
.capabilities(self.chrome_args())
|
||||
.connect(&self.base_url)
|
||||
.await
|
||||
.context("Failed to connect to ChromeDriver")
|
||||
}
|
||||
|
||||
pub fn increment_task_count(&mut self) {
|
||||
self.task_count += 1;
|
||||
}
|
||||
|
||||
pub fn get_task_count(&self) -> usize {
|
||||
self.task_count
|
||||
}
|
||||
|
||||
pub async fn shutdown(&mut self) -> Result<()> {
|
||||
if let Some(handle) = self.stderr_log.take() {
|
||||
handle.abort();
|
||||
let _ = handle.await;
|
||||
}
|
||||
|
||||
let _ = self.process.start_kill();
|
||||
let _ = self.process.wait().await;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Spawns the actual `chromedriver` binary and waits for it to become ready.
|
||||
async fn spawn_chromedriver() -> Result<(String, Child, JoinHandle<()>)> {
|
||||
let mut process = Command::new("chromedriver-win64/chromedriver.exe")
|
||||
.arg("--port=0") // let OS choose free port
|
||||
.stdout(Stdio::piped())
|
||||
.stderr(Stdio::piped());
|
||||
|
||||
let mut process = command
|
||||
.stderr(Stdio::piped())
|
||||
.spawn()
|
||||
.context("Failed to spawn chromedriver. Ensure it's installed and in PATH.")?;
|
||||
.context("Failed to start chromedriver. Is it in PATH?")?;
|
||||
|
||||
let mut stdout = BufReader::new(
|
||||
process.stdout.take().context("Failed to capture stdout")?
|
||||
).lines();
|
||||
let stdout = process.stdout.take().unwrap();
|
||||
let stderr = process.stderr.take().unwrap();
|
||||
|
||||
let mut stderr = BufReader::new(
|
||||
process.stderr.take().context("Failed to capture stderr")?
|
||||
).lines();
|
||||
let stdout_reader = BufReader::new(stdout);
|
||||
let mut stdout_lines = stdout_reader.lines();
|
||||
|
||||
let start_time = std::time::Instant::now();
|
||||
let mut address: Option<String> = None;
|
||||
let mut success = false;
|
||||
|
||||
// Log stderr in background for debugging
|
||||
tokio::spawn(async move {
|
||||
while let Ok(Some(line)) = stderr.next_line().await {
|
||||
eprintln!("ChromeDriver stderr: {}", line);
|
||||
let stderr_reader = BufReader::new(stderr);
|
||||
let stderr_handle = tokio::spawn(async move {
|
||||
let mut lines = stderr_reader.lines();
|
||||
while let Ok(Some(line)) = lines.next_line().await {
|
||||
let t = line.trim();
|
||||
if !t.is_empty() {
|
||||
let _ = crate::util::logger::log_info(&format!("ChromeDriver: {}", t)).await;
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
// Wait for address and success (up to 30s)
|
||||
while start_time.elapsed() < Duration::from_secs(30) {
|
||||
if let Ok(Ok(Some(line))) =
|
||||
timeout(Duration::from_secs(1), stdout.next_line()).await
|
||||
{
|
||||
let start = tokio::time::Instant::now();
|
||||
let mut address: Option<String> = None;
|
||||
|
||||
while start.elapsed() < Duration::from_secs(30) {
|
||||
if let Ok(Ok(Some(line))) = timeout(Duration::from_secs(1), stdout_lines.next_line()).await {
|
||||
if let Some(addr) = parse_chromedriver_address(&line) {
|
||||
address = Some(addr.to_string());
|
||||
address = Some(addr);
|
||||
}
|
||||
|
||||
if line.contains("ChromeDriver was started successfully") {
|
||||
success = true;
|
||||
}
|
||||
|
||||
if let (Some(addr), true) = (&address, success) {
|
||||
return Ok(Self {
|
||||
process,
|
||||
base_url: addr.clone(),
|
||||
});
|
||||
if line.contains("ChromeDriver was started successfully") && address.is_some() {
|
||||
return Ok((address.unwrap(), process, stderr_handle));
|
||||
}
|
||||
}
|
||||
|
||||
sleep(Duration::from_millis(100)).await;
|
||||
}
|
||||
|
||||
// Cleanup on failure
|
||||
let _ = process.kill().await;
|
||||
Err(anyhow!("Timeout: ChromeDriver did not start within 30 seconds. Check version match with Chrome browser and system resources."))
|
||||
stderr_handle.abort();
|
||||
Err(anyhow!("ChromeDriver failed to start within 30s"))
|
||||
}
|
||||
|
||||
/// Creates a new browser session (client) from this ChromeDriver instance.
|
||||
/// Each session is independent and can be closed without affecting the driver.
|
||||
pub async fn new_session(&self) -> Result<Client> {
|
||||
ClientBuilder::native()
|
||||
.capabilities(Self::chrome_args())
|
||||
.connect(&self.base_url)
|
||||
.await
|
||||
.context("Failed to create new session")
|
||||
}
|
||||
|
||||
fn chrome_args() -> Map<String, Value> {
|
||||
let args = serde_json::json!({
|
||||
fn chrome_args(&self) -> Map<String, Value> {
|
||||
let mut args = vec![
|
||||
"--headless=new".to_string(),
|
||||
"--disable-gpu".to_string(),
|
||||
"--no-sandbox".to_string(),
|
||||
"--disable-dev-shm-usage".to_string(),
|
||||
"--disable-infobars".to_string(),
|
||||
"--disable-extensions".to_string(),
|
||||
"--disable-popup-blocking".to_string(),
|
||||
"--disable-notifications".to_string(),
|
||||
"--disable-logging".to_string(),
|
||||
"--disable-autofill".to_string(),
|
||||
"--disable-sync".to_string(),
|
||||
"--disable-default-apps".to_string(),
|
||||
"--disable-translate".to_string(),
|
||||
"--window-size=1920,1080".to_string(),
|
||||
"--disable-blink-features=AutomationControlled".to_string(),
|
||||
"--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36".to_string()
|
||||
];
|
||||
if let Some(ref proxy) = self.proxy_url {
|
||||
let proxy = proxy.clone();
|
||||
let proxy_formatted = format!("--proxy-server={}", proxy);
|
||||
args.push(proxy_formatted);
|
||||
}
|
||||
let caps = serde_json::json!({
|
||||
"goog:chromeOptions": {
|
||||
"args": [
|
||||
"--headless=new",
|
||||
"--disable-gpu",
|
||||
"--no-sandbox",
|
||||
"--disable-dev-shm-usage",
|
||||
"--disable-infobars",
|
||||
"--disable-extensions",
|
||||
"--disable-popup-blocking",
|
||||
"--disable-notifications",
|
||||
"--disable-logging",
|
||||
"--disable-autofill",
|
||||
"--disable-features=TranslateUI,OptimizationGuideModelDownloading",
|
||||
"--window-size=1920,1080",
|
||||
"--disable-blink-features=AutomationControlled",
|
||||
"--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
|
||||
],
|
||||
"args": args,
|
||||
"excludeSwitches": ["enable-logging", "enable-automation"],
|
||||
"useAutomationExtension": false,
|
||||
"prefs": {
|
||||
"profile.default_content_setting_values.notifications": 2
|
||||
}
|
||||
}
|
||||
});
|
||||
args.as_object()
|
||||
.expect("Capabilities should be a JSON object")
|
||||
.clone()
|
||||
caps.as_object().cloned().unwrap()
|
||||
}
|
||||
}
|
||||
|
||||
/// Parses the ChromeDriver address from a log line.
|
||||
///
|
||||
/// Looks for the "Starting ChromeDriver ... on port XXXX" line and extracts the port.
|
||||
/// Returns `Some("http://localhost:XXXX")` if found, else `None`.
|
||||
fn parse_chromedriver_address(line: &str) -> Option<String> {
|
||||
if line.contains("Starting ChromeDriver") {
|
||||
if let Some(port_str) = line.split("on port ").nth(1) {
|
||||
@@ -219,7 +276,6 @@ fn parse_chromedriver_address(line: &str) -> Option<String> {
|
||||
}
|
||||
}
|
||||
}
|
||||
// Fallback for other formats (e.g., explicit port mentions)
|
||||
for word in line.split_whitespace() {
|
||||
if let Ok(port) = word.trim_matches(|c: char| !c.is_numeric()).parse::<u16>() {
|
||||
if port > 1024 && port < 65535 && line.to_lowercase().contains("port") {
|
||||
@@ -232,17 +288,18 @@ fn parse_chromedriver_address(line: &str) -> Option<String> {
|
||||
|
||||
impl Drop for ChromeInstance {
|
||||
fn drop(&mut self) {
|
||||
// Signal child to terminate. Do NOT block here; shutdown should be
|
||||
// performed with the async `shutdown()` method when possible.
|
||||
let _ = self.process.start_kill();
|
||||
std::thread::sleep(std::time::Duration::from_millis(100));
|
||||
}
|
||||
}
|
||||
|
||||
/// Simplified task execution - now uses the pool pattern.
|
||||
///
|
||||
/// For backwards compatibility with existing code.
|
||||
/// Simplified task execution - uses the pool pattern.
|
||||
pub struct ScrapeTask<T> {
|
||||
url: String,
|
||||
parse: Box<dyn FnOnce(Client) -> Pin<Box<dyn std::future::Future<Output = Result<T>> + Send>> + Send>,
|
||||
parse: Box<
|
||||
dyn FnOnce(Client) -> Pin<Box<dyn std::future::Future<Output = Result<T>> + Send>> + Send,
|
||||
>,
|
||||
}
|
||||
|
||||
impl<T: Send + 'static> ScrapeTask<T> {
|
||||
@@ -257,13 +314,11 @@ impl<T: Send + 'static> ScrapeTask<T> {
|
||||
}
|
||||
}
|
||||
|
||||
/// Executes using a provided pool (more efficient for multiple tasks).
|
||||
pub async fn execute_with_pool(self, pool: &ChromeDriverPool) -> Result<T> {
|
||||
let url = self.url;
|
||||
let parse = self.parse;
|
||||
|
||||
pool.execute(url, move |client| async move {
|
||||
(parse)(client).await
|
||||
}).await
|
||||
|
||||
pool.execute(url, move |client| async move { (parse)(client).await })
|
||||
.await
|
||||
}
|
||||
}
|
||||
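For callers that do not need the ScrapeTask wrapper, the pool can be driven directly with a closure. A minimal sketch, assuming `pool: &ChromeDriverPool` is already built and using fantoccini's `Client::source()` as a stand-in for a real parser (the pool navigates to the URL before invoking the closure):

// Sketch only: fetch raw page HTML through whichever instance the pool hands out.
let html: String = pool
    .execute("https://example.com/".to_string(), |client| async move {
        let body = client.source().await?; // full page source after navigation
        client.close().await.ok();         // end this WebDriver session
        Ok(body)
    })
    .await?;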
22
src/util.rs
@@ -1,22 +0,0 @@
// src/util.rs (or put it directly in main.rs if you prefer)
use tokio::fs;
use std::path::Path;

/// Create the required data folders if they do not exist yet.
pub async fn _ensure_data_dirs() -> anyhow::Result<()> {
let dirs = [
"economic_events",
"economic_event_changes",
"corporate_events",
"corporate_prices",
"data",
];
for dir in dirs {
let path = Path::new(dir);
if !path.exists() {
tokio::fs::create_dir_all(path).await?;
println!("Created directory: {dir}");
}
}
Ok(())
}
169
src/util/directories.rs
Normal file
@@ -0,0 +1,169 @@
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::fs;
|
||||
|
||||
/// Central configuration for all data paths
|
||||
pub struct DataPaths {
|
||||
base_dir: PathBuf,
|
||||
data_dir: PathBuf,
|
||||
cache_dir: PathBuf,
|
||||
logs_dir: PathBuf,
|
||||
// Cache data subdirectories
|
||||
cache_gleif_dir: PathBuf,
|
||||
cache_openfigi_dir: PathBuf,
|
||||
cache_gleif_openfigi_map_dir: PathBuf,
|
||||
cache_openvpn_dir: PathBuf,
|
||||
// Economic data subdirectories
|
||||
economic_events_dir: PathBuf,
|
||||
economic_changes_dir: PathBuf,
|
||||
// Corporate data subdirectories
|
||||
corporate_events_dir: PathBuf,
|
||||
corporate_changes_dir: PathBuf,
|
||||
corporate_prices_dir: PathBuf,
|
||||
}
|
||||
|
||||
impl DataPaths {
|
||||
/// Initialize paths from a base directory
|
||||
pub fn new(base_dir: impl AsRef<Path>) -> std::io::Result<Self> {
|
||||
let base_dir = base_dir.as_ref().to_path_buf();
|
||||
|
||||
let data_dir = base_dir.join("data");
|
||||
let cache_dir = base_dir.join("cache");
|
||||
let logs_dir = base_dir.join("logs");
|
||||
|
||||
// Cache subdirectories
|
||||
let cache_gleif_dir = cache_dir.join("gleif");
|
||||
let cache_openfigi_dir = cache_dir.join("openfigi");
|
||||
let cache_gleif_openfigi_map_dir = cache_dir.join("glei_openfigi");
|
||||
let cache_openvpn_dir = cache_dir.join("openvpn");
|
||||
|
||||
// Economic subdirectories
|
||||
let economic_events_dir = data_dir.join("economic").join("events");
|
||||
let economic_changes_dir = economic_events_dir.join("changes");
|
||||
|
||||
// Corporate subdirectories
|
||||
let corporate_dir = data_dir.join("corporate");
|
||||
let corporate_events_dir = corporate_dir.join("events");
|
||||
let corporate_changes_dir = corporate_events_dir.join("changes");
|
||||
let corporate_prices_dir = corporate_dir.join("prices");
|
||||
|
||||
// Create all directories if they don't exist
|
||||
fs::create_dir_all(&data_dir)?;
|
||||
fs::create_dir_all(&cache_dir)?;
|
||||
fs::create_dir_all(&logs_dir)?;
|
||||
fs::create_dir_all(&cache_gleif_dir)?;
|
||||
fs::create_dir_all(&cache_openfigi_dir)?;
|
||||
fs::create_dir_all(&cache_gleif_openfigi_map_dir)?;
|
||||
fs::create_dir_all(&cache_openvpn_dir)?;
|
||||
fs::create_dir_all(&economic_events_dir)?;
|
||||
fs::create_dir_all(&economic_changes_dir)?;
|
||||
fs::create_dir_all(&corporate_events_dir)?;
|
||||
fs::create_dir_all(&corporate_changes_dir)?;
|
||||
fs::create_dir_all(&corporate_prices_dir)?;
|
||||
|
||||
Ok(Self {
|
||||
base_dir,
|
||||
data_dir,
|
||||
cache_dir,
|
||||
logs_dir,
|
||||
cache_gleif_dir,
|
||||
cache_openfigi_dir,
|
||||
cache_gleif_openfigi_map_dir,
|
||||
cache_openvpn_dir,
|
||||
economic_events_dir,
|
||||
economic_changes_dir,
|
||||
corporate_events_dir,
|
||||
corporate_changes_dir,
|
||||
corporate_prices_dir,
|
||||
})
|
||||
}
|
||||
|
||||
pub fn base_dir(&self) -> &Path {
|
||||
&self.base_dir
|
||||
}
|
||||
|
||||
pub fn data_dir(&self) -> &Path {
|
||||
&self.data_dir
|
||||
}
|
||||
|
||||
pub fn cache_dir(&self) -> &Path {
|
||||
&self.cache_dir
|
||||
}
|
||||
|
||||
pub fn logs_dir(&self) -> &Path {
|
||||
&self.logs_dir
|
||||
}
|
||||
|
||||
pub fn cache_gleif_dir(&self) -> &Path {
|
||||
&self.cache_gleif_dir
|
||||
}
|
||||
|
||||
pub fn cache_openfigi_dir(&self) -> &Path {
|
||||
&self.cache_openfigi_dir
|
||||
}
|
||||
|
||||
pub fn cache_gleif_openfigi_map_dir(&self) -> &Path {
|
||||
&self.cache_gleif_openfigi_map_dir
|
||||
}
|
||||
|
||||
pub fn cache_openvpn_dir(&self) -> &Path {
|
||||
&self.cache_openvpn_dir
|
||||
}
|
||||
|
||||
/// Get the economic events directory
|
||||
pub fn economic_events_dir(&self) -> &Path {
|
||||
&self.economic_events_dir
|
||||
}
|
||||
|
||||
/// Get the economic changes directory
|
||||
pub fn economic_changes_dir(&self) -> &Path {
|
||||
&self.economic_changes_dir
|
||||
}
|
||||
|
||||
/// Get the corporate events directory
|
||||
pub fn corporate_events_dir(&self) -> &Path {
|
||||
&self.corporate_events_dir
|
||||
}
|
||||
|
||||
/// Get the corporate changes directory
|
||||
pub fn corporate_changes_dir(&self) -> &Path {
|
||||
&self.corporate_changes_dir
|
||||
}
|
||||
|
||||
/// Get the corporate prices directory
|
||||
pub fn corporate_prices_dir(&self) -> &Path {
|
||||
&self.corporate_prices_dir
|
||||
}
|
||||
|
||||
/// Get a specific file path within data directory
|
||||
pub fn data_file(&self, filename: &str) -> PathBuf {
|
||||
self.data_dir.join(filename)
|
||||
}
|
||||
|
||||
/// Get a specific file path within cache directory
|
||||
pub fn cache_file(&self, filename: &str) -> PathBuf {
|
||||
self.cache_dir.join(filename)
|
||||
}
|
||||
|
||||
/// Get a specific file path within logs directory
|
||||
pub fn log_file(&self, filename: &str) -> PathBuf {
|
||||
self.logs_dir.join(filename)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_paths_creation() {
|
||||
let paths = DataPaths::new("./test_base").unwrap();
|
||||
assert!(paths.data_dir().exists());
|
||||
assert!(paths.cache_dir().exists());
|
||||
assert!(paths.logs_dir().exists());
|
||||
assert!(paths.economic_events_dir().exists());
|
||||
assert!(paths.economic_changes_dir().exists());
|
||||
assert!(paths.corporate_events_dir().exists());
|
||||
assert!(paths.corporate_changes_dir().exists());
|
||||
assert!(paths.corporate_prices_dir().exists());
|
||||
}
|
||||
}
|
||||
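Beyond the accessors, the `*_file` helpers give call sites one place to build concrete paths instead of hard-coding strings. A small sketch of how a caller might combine them; the file names are hypothetical, but the chunk naming follows the `chunk_<start>_<end>.json` convention used by the economic storage module:

// Sketch: resolving concrete paths through DataPaths instead of string literals.
let paths = DataPaths::new(".")?;
let chunk = paths.economic_events_dir().join("chunk_2024-01-01_2024-03-31.json");
let log = paths.log_file("manual_run.log");
println!("chunk: {:?}, log: {:?}", chunk, log);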
78
src/util/logger.rs
Normal file
@@ -0,0 +1,78 @@
|
||||
// src/util/logger.rs
|
||||
use chrono::Local;
|
||||
use once_cell::sync::Lazy;
|
||||
use tokio::sync::Mutex;
|
||||
use std::fs::{self, OpenOptions};
|
||||
use std::io::Write;
|
||||
use std::path::PathBuf;
|
||||
|
||||
static LOGGER: Lazy<Mutex<Option<DebugLogger>>> = Lazy::new(|| Mutex::new(None));
|
||||
|
||||
pub struct DebugLogger {
|
||||
file: std::fs::File,
|
||||
log_path: PathBuf,
|
||||
}
|
||||
|
||||
impl DebugLogger {
|
||||
fn new(log_dir: &std::path::Path) -> std::io::Result<Self> {
|
||||
|
||||
fs::create_dir_all(log_dir)?;
|
||||
let filename = format!("backtest_{}.log", Local::now().format("%Y%m%d_%H%M%S"));
|
||||
let log_path = log_dir.join(&filename);
|
||||
let file = OpenOptions::new()
|
||||
.create(true)
|
||||
.append(true)
|
||||
.open(&log_path)?;
|
||||
Ok(Self { file, log_path })
|
||||
}
|
||||
|
||||
async fn log(&mut self, msg: &str) {
|
||||
let line = format!("[{}] {}\n", Local::now().format("%H:%M:%S"), msg);
|
||||
let _ = self.file.write_all(line.as_bytes());
|
||||
let _ = self.file.flush();
|
||||
println!("{}", line.trim_end());
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn init_debug_logger(log_dir: &std::path::Path) -> Result<(), String> {
|
||||
let mut logger = LOGGER.lock().await;
|
||||
match DebugLogger::new(log_dir) {
|
||||
Ok(l) => {
|
||||
let log_path = l.log_path.clone();
|
||||
*logger = Some(l);
|
||||
println!("✓ Logger initialized at: {:?}", log_path);
|
||||
Ok(())
|
||||
}
|
||||
Err(e) => {
|
||||
let err_msg = format!("Failed to initialize logger: {}", e);
|
||||
eprintln!("{}", err_msg);
|
||||
Err(err_msg)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn log_message(msg: &str) {
|
||||
let mut logger = LOGGER.lock().await;
|
||||
if let Some(l) = logger.as_mut() {
|
||||
l.log(msg).await;
|
||||
} else {
|
||||
println!("[LOG] {}", msg);
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn log_detailed(level: &str, msg: &str) {
|
||||
let formatted = format!("[{}] {}", level, msg);
|
||||
log_message(&formatted).await;
|
||||
}
|
||||
|
||||
pub async fn log_info(msg: &str) {
|
||||
log_detailed("INFO", msg).await;
|
||||
}
|
||||
|
||||
pub async fn log_warn(msg: &str) {
|
||||
log_detailed("WARN", msg).await;
|
||||
}
|
||||
|
||||
pub async fn log_error(msg: &str) {
|
||||
log_detailed("ERROR", msg).await;
|
||||
}
|
||||
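The free functions are the intended entry points; `init_debug_logger` is optional, and every `log_*` call falls back to plain stdout when no file logger has been installed. A minimal usage sketch inside an async context (the directory name is hypothetical):

// Sketch: typical logger lifecycle.
logger::init_debug_logger(std::path::Path::new("logs")).await.ok(); // file + stdout from here on
logger::log_info("starting run").await;
logger::log_warn("no VPN servers configured").await;
logger::log_error("scrape failed, will retry").await;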
4
src/util/mod.rs
Normal file
@@ -0,0 +1,4 @@
// src/util/mod.rs
pub mod logger;
pub mod directories;
pub mod opnv;
281
src/util/opnv.rs
Normal file
@@ -0,0 +1,281 @@
|
||||
// src/util/opnv.rs
|
||||
|
||||
//! Module for fetching, downloading, and extracting OpenVPN configurations from VPNBook.
//!
//! This module provides functionality to scrape the VPNBook free VPN page using
//! a headless browser, handle potential consent popups, extract current credentials,
//! collect download URLs for OpenVPN ZIP files, download them, and then extract
//! the .ovpn files into a structured directory: cache/openvpn/<hostname>/<ovpn_filename>.
//! It is designed to fetch the most recent data on every run, as credentials and
//! server configurations change periodically.

use anyhow::{anyhow, Context, Result};
use fantoccini::{Client, Locator};
use reqwest;
use std::io::Read;
use std::path::{Path, PathBuf};
use tokio::fs::File;
use tokio::io::AsyncWriteExt;
use url::Url;
use zip::ZipArchive;
use crate::scraper::webdriver::{ChromeDriverPool, ScrapeTask};
use crate::util::directories::DataPaths;

/// Fetches, downloads, and extracts the latest OpenVPN configurations from VPNBook.
///
/// This asynchronous function uses the provided `ChromeDriverPool` to scrape the
/// VPNBook free VPN page. It dismisses any consent popup if present, extracts the
/// current username and password, collects all OpenVPN ZIP download URLs, downloads
/// the ZIP files temporarily, extracts the .ovpn files into the specified directory
/// structure under `cache_dir`/openvpn/<hostname>/, and cleans up the ZIP files.
///
/// The directory structure is: cache/openvpn/<hostname>/<ovpn_filename>, where
/// <hostname> is derived from the ZIP filename (e.g., "ca149.vpnbook.com").
///
/// The function ensures fresh data is fetched each time it runs, making it suitable
/// for periodic updates where credentials may change.
///
/// # Arguments
///
/// * `pool` - A reference to the `ChromeDriverPool` for managing browser instances.
/// * `cache_dir` - The path to the base cache directory. The OpenVPN files will be saved
///   under `cache_dir`/openvpn/<hostname>/.
///
/// # Returns
///
/// A `Result` containing a tuple with:
/// - `String`: The scraped username.
/// - `String`: The scraped password.
/// - `Vec<PathBuf>`: Paths to the extracted .ovpn files.
///
/// # Errors
///
/// Returns an `anyhow::Error` if:
/// - Navigation to the page fails.
/// - The consent popup cannot be dismissed (if present).
/// - Credentials cannot be parsed from the page.
/// - Download URLs cannot be found or are invalid.
/// - HTTP downloads fail or file writing errors occur.
/// - ZIP extraction fails (e.g., invalid ZIP or I/O errors).
///
/// # Dependencies
///
/// This function requires the following crates (add to Cargo.toml if not present):
/// - `anyhow` for error handling.
/// - `fantoccini` for browser automation.
/// - `reqwest` (with `tokio` features) for HTTP downloads.
/// - `tokio` for asynchronous file operations.
/// - `url` for URL manipulation.
/// - `zip` for ZIP extraction.
///
/// # Examples
///
/// ```no_run
/// use anyhow::Result;
/// use event_backtest_engine::util::opnv::fetch_vpnbook_configs;
/// use event_backtest_engine::scraper::webdriver::ChromeDriverPool;
/// use std::path::Path;
///
/// #[tokio::main]
/// async fn main() -> Result<()> {
///     let pool = ChromeDriverPool::new(1).await?;
///     let (username, password, files) =
///         fetch_vpnbook_configs(&pool, Path::new("./cache")).await?;
///     println!("Username: {}, Password: {}", username, password);
///     for file in files {
///         println!("Extracted: {:?}", file);
///     }
///     Ok(())
/// }
/// ```
pub async fn fetch_vpnbook_configs(
    pool: &ChromeDriverPool,
    cache_dir: &Path,
) -> Result<(String, String, Vec<PathBuf>)> {
    // Prepare the openvpn directory
    let dir = DataPaths::new(".")?;
    let vpn_dir = dir.cache_openvpn_dir();
    tokio::fs::create_dir_all(&vpn_dir)
        .await
        .context("Failed to create openvpn directory")?;

    // Temporary directory for ZIP downloads (under cache for consistency)
    let temp_dir = cache_dir.join("temp_vpn_zips");
    tokio::fs::create_dir_all(&temp_dir)
        .await
        .context("Failed to create temp directory")?;

    let url = "https://www.vpnbook.com/freevpn".to_string();

    // Define the scraping task
    let task = ScrapeTask::new(url, |client: Client| async move {
        // Attempt to dismiss consent popup if present
        let consent_selector = r#"body > div.fc-consent-root > div.fc-dialog-container > div.fc-dialog.fc-choice-dialog > div.fc-footer-buttons-container > div.fc-footer-buttons > button.fc-button.fc-cta-do-not-consent.fc-secondary-button > p"#;
        if let Ok(consent_elem) = client.find(Locator::Css(consent_selector)).await {
            consent_elem
                .click()
                .await
                .context("Failed to click consent dismissal button")?;
            // Brief delay to allow popup to close
            tokio::time::sleep(std::time::Duration::from_secs(1)).await;
        }

        // Find all <code> elements
        let codes = client
            .find_all(Locator::Css("code"))
            .await
            .context("Failed to find code elements")?;

        if codes.len() < 2 {
            return Err(anyhow!("Insufficient code elements found for credentials"));
        }

        // The first <code> is username, second is password
        let username = codes[0]
            .text()
            .await
            .context("Failed to get username text")?;

        let password = codes[1]
            .text()
            .await
            .context("Failed to get password text")?;

        // Locate all download links for OpenVPN ZIP files
        let links = client
            .find_all(Locator::Css(r#"a[href^="/free-openvpn-account/"][download=""]"#))
            .await
            .context("Failed to find download links")?;

        // Collect relative hrefs
        let mut rel_urls = Vec::new();
        for link in links {
            if let Some(href) = link.attr("href").await.context("Failed to get href attribute")? {
                rel_urls.push(href);
            }
        }

        Ok((username, password, rel_urls))
    });

    // Execute the scraping task using the pool
    let (username, password, rel_urls) = task.execute_with_pool(pool).await?;

    // Base URL for resolving relative paths
    let base_url = Url::parse("https://www.vpnbook.com/")?;

    // Download each ZIP file to temp_dir
    let mut zip_paths = Vec::new();
    for rel in &rel_urls {
        let full_url = base_url.join(rel).context("Failed to join URL")?;
        let filename = rel
            .split('/')
            .last()
            .ok_or_else(|| anyhow!("Invalid filename in URL"))?
            .to_string();
        let out_path = temp_dir.join(&filename);

        // Perform HTTP GET request
        let resp = reqwest::get(full_url.clone())
            .await
            .with_context(|| format!("Failed to send download request for {}", full_url))?;

        if resp.status().is_success() {
            let bytes = resp
                .bytes()
                .await
                .context("Failed to read response bytes")?;

            // Write to file asynchronously
            let mut file = File::create(&out_path)
                .await
                .context("Failed to create output file")?;
            file.write_all(&bytes)
                .await
                .context("Failed to write to file")?;

            zip_paths.push(out_path);
        } else {
            return Err(anyhow!(
                "Download failed with status: {} for URL: {}",
                resp.status(),
                full_url
            ));
        }
    }

    // Now extract .ovpn files from each ZIP
    let mut extracted_paths = Vec::new();
    for zip_path in zip_paths {
        let hostname = get_hostname_from_zip_filename(
            zip_path.file_name().unwrap().to_str().unwrap(),
        );
        let hostname_dir = vpn_dir.join(&hostname);
        tokio::fs::create_dir_all(&hostname_dir)
            .await
            .context("Failed to create hostname directory")?;

        // Use spawn_blocking for sync ZIP operations
        let zip_path_clone = zip_path.clone();
        let hostname_dir_clone = hostname_dir.clone();
        let extract_result = tokio::task::spawn_blocking(move || {
            let file = std::fs::File::open(&zip_path_clone)
                .with_context(|| format!("Failed to open ZIP file: {:?}", zip_path_clone))?;
            let mut archive = ZipArchive::new(file)
                .with_context(|| format!("Failed to read ZIP archive: {:?}", zip_path_clone))?;

            let mut paths = Vec::new();
            for i in 0..archive.len() {
                let mut zip_file = archive.by_index(i)?;
                if zip_file.name().ends_with(".ovpn") {
                    // Get just the filename, stripping any path
                    let file_name = Path::new(zip_file.name()).file_name()
                        .ok_or_else(|| anyhow!("Invalid file name in ZIP: {}", zip_file.name()))?
                        .to_str()
                        .ok_or_else(|| anyhow!("Invalid UTF-8 in file name: {}", zip_file.name()))?
                        .to_string();
                    let target_path = hostname_dir_clone.join(file_name);
                    let mut content = Vec::new();
                    zip_file.read_to_end(&mut content)?;

                    std::fs::write(&target_path, &content)
                        .with_context(|| format!("Failed to write .ovpn file: {:?}", target_path))?;
                    paths.push(target_path);
                }
            }
            Ok::<Vec<PathBuf>, anyhow::Error>(paths)
        })
        .await
        .context("Spawn blocking failed")??;

        extracted_paths.extend(extract_result);

        // Clean up the ZIP file after extraction
        tokio::fs::remove_file(&zip_path)
            .await
            .context("Failed to remove temp ZIP file")?;
    }

    // Optional: Clean up temp_dir if empty
    let _ = tokio::fs::remove_dir(&temp_dir).await;

    Ok((username, password, extracted_paths))
}

/// Derives the hostname from the ZIP filename.
///
/// For example, "vpnbook-openvpn-ca149.zip" -> "ca149.vpnbook.com"
///
/// If the format doesn't match, returns "unknown.vpnbook.com".
fn get_hostname_from_zip_filename(filename: &str) -> String {
    if filename.starts_with("vpnbook-openvpn-") && filename.ends_with(".zip") {
        let code = filename
            .strip_prefix("vpnbook-openvpn-")
            .unwrap()
            .strip_suffix(".zip")
            .unwrap();
        format!("{}.vpnbook.com", code)
    } else {
        "unknown.vpnbook.com".to_string()
    }
}
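A minimal sketch of how a caller might group the returned .ovpn paths by server, relying only on the cache/openvpn/<hostname>/<file>.ovpn layout documented above; the helper name `group_by_hostname` is illustrative and not part of the crate, and it mirrors the parent-directory convention checked by test_hostname_extraction below:

use std::collections::HashMap;
use std::path::PathBuf;

// Bucket extracted .ovpn files by their parent directory name, which encodes
// the VPNBook hostname (e.g. "ca149.vpnbook.com").
fn group_by_hostname(files: &[PathBuf]) -> HashMap<String, Vec<PathBuf>> {
    let mut map: HashMap<String, Vec<PathBuf>> = HashMap::new();
    for f in files {
        let host = f
            .parent()
            .and_then(|p| p.file_name())
            .and_then(|n| n.to_str())
            .unwrap_or("unknown")
            .to_string();
        map.entry(host).or_default().push(f.clone());
    }
    map
}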
379
test/vpn_integration_tests.rs
Normal file
379
test/vpn_integration_tests.rs
Normal file
@@ -0,0 +1,379 @@
// tests/vpn_integration_tests.rs
//! Integration tests for VPN rotation system

#[cfg(test)]
mod vpn_tests {
    use event_backtest_engine::{
        scraper::{
            webdriver::ChromeDriverPool,
            vpn_manager::{VpnInstance, VpnPool},
        },
        util::{directories::DataPaths, opnv},
    };
    use std::path::PathBuf;
    use std::sync::Arc;

    /// Helper to create a test VPN instance without connecting
    fn create_test_vpn_instance() -> VpnInstance {
        VpnInstance::new(
            PathBuf::from("test.ovpn"),
            "testuser".to_string(),
            "testpass".to_string(),
        )
        .expect("Failed to create test VPN instance")
    }

    #[test]
    fn test_vpn_instance_creation() {
        let vpn = create_test_vpn_instance();
        assert_eq!(vpn.hostname(), "test");
        assert!(!vpn.is_healthy());
        assert!(vpn.external_ip().is_none());
    }

    #[test]
    fn test_vpn_task_counting() {
        let mut vpn = create_test_vpn_instance();

        // Should not rotate initially
        assert!(!vpn.increment_task_count(10));

        // Increment tasks
        for i in 1..10 {
            assert!(!vpn.increment_task_count(10), "Should not rotate at task {}", i);
        }

        // Should rotate at threshold
        assert!(vpn.increment_task_count(10), "Should rotate at task 10");

        // Reset and verify
        vpn.reset_task_count();
        assert!(!vpn.increment_task_count(10), "Should not rotate after reset");
    }

    #[test]
    fn test_vpn_task_counting_zero_threshold() {
        let mut vpn = create_test_vpn_instance();

        // With threshold=0, should never auto-rotate
        for _ in 0..100 {
            assert!(!vpn.increment_task_count(0));
        }
    }

    #[tokio::test]
    async fn test_chromedriver_pool_creation_no_vpn() {
        let result = ChromeDriverPool::new(2).await;

        match result {
            Ok(pool) => {
                assert_eq!(pool.get_number_of_instances(), 2);
                assert!(!pool.is_vpn_enabled());
            }
            Err(e) => {
                eprintln!("ChromeDriver pool creation failed (expected if chromedriver not installed): {}", e);
            }
        }
    }

    #[test]
    fn test_data_paths_creation() {
        let paths = DataPaths::new("./test_data").expect("Failed to create paths");

        assert!(paths.data_dir().exists());
        assert!(paths.cache_dir().exists());
        assert!(paths.logs_dir().exists());
        assert!(paths.cache_openvpn_dir().exists());

        // Cleanup
        let _ = std::fs::remove_dir_all("./test_data");
    }

    #[tokio::test]
    #[ignore] // This test requires actual network access and VPNBook availability
    async fn test_fetch_vpnbook_configs() {
        let paths = DataPaths::new(".").expect("Failed to create paths");

        // This test requires a ChromeDriver pool
        let pool_result = ChromeDriverPool::new(1).await;
        if pool_result.is_err() {
            eprintln!("Skipping VPNBook fetch test: ChromeDriver not available");
            return;
        }

        let pool = Arc::new(pool_result.unwrap());

        let result = opnv::fetch_vpnbook_configs(&pool, paths.cache_dir()).await;

        match result {
            Ok((username, password, files)) => {
                assert!(!username.is_empty(), "Username should not be empty");
                assert!(!password.is_empty(), "Password should not be empty");
                assert!(!files.is_empty(), "Should fetch at least one config file");

                println!("Fetched {} VPN configs", files.len());
                for file in &files {
                    assert!(file.exists(), "Config file should exist: {:?}", file);
                    assert_eq!(file.extension().and_then(|s| s.to_str()), Some("ovpn"));
                }
            }
            Err(e) => {
                eprintln!("VPNBook fetch failed (may be temporary): {}", e);
            }
        }
    }

    #[tokio::test]
    #[ignore] // Requires actual VPN configs and OpenVPN installation
    async fn test_vpn_pool_creation() {
        let paths = DataPaths::new(".").expect("Failed to create paths");

        // First fetch configs
        let pool_result = ChromeDriverPool::new(1).await;
        if pool_result.is_err() {
            eprintln!("Skipping VPN pool test: ChromeDriver not available");
            return;
        }

        let temp_pool = Arc::new(pool_result.unwrap());
        let fetch_result = opnv::fetch_vpnbook_configs(&temp_pool, paths.cache_dir()).await;

        if fetch_result.is_err() {
            eprintln!("Skipping VPN pool test: Could not fetch configs");
            return;
        }

        let (username, password, _) = fetch_result.unwrap();

        // Create VPN pool
        let vpn_pool_result = VpnPool::new(
            paths.cache_openvpn_dir(),
            username,
            password,
            false,
            0,
        ).await;

        match vpn_pool_result {
            Ok(vpn_pool) => {
                assert!(vpn_pool.len() > 0, "VPN pool should have at least one instance");
                println!("Created VPN pool with {} instances", vpn_pool.len());
            }
            Err(e) => {
                eprintln!("VPN pool creation failed: {}", e);
            }
        }
    }

    #[tokio::test]
    #[ignore] // Full integration test - requires all components
    async fn test_full_vpn_integration() {
        let paths = DataPaths::new(".").expect("Failed to create paths");

        // Step 1: Create temp ChromeDriver pool for fetching
        let temp_pool = match ChromeDriverPool::new(1).await {
            Ok(p) => Arc::new(p),
            Err(e) => {
                eprintln!("Skipping integration test: ChromeDriver not available - {}", e);
                return;
            }
        };

        // Step 2: Fetch VPNBook configs
        let (username, password, files) = match opnv::fetch_vpnbook_configs(
            &temp_pool,
            paths.cache_dir()
        ).await {
            Ok(result) => result,
            Err(e) => {
                eprintln!("Skipping integration test: Config fetch failed - {}", e);
                return;
            }
        };

        assert!(!files.is_empty(), "Should have fetched configs");

        // Step 3: Create VPN pool
        let vpn_pool = match VpnPool::new(
            paths.cache_openvpn_dir(),
            username,
            password,
            true,
            5,
        ).await {
            Ok(pool) => Arc::new(pool),
            Err(e) => {
                eprintln!("Skipping integration test: VPN pool creation failed - {}", e);
                return;
            }
        };

        // Step 4: Connect one VPN
        let vpn_instance = vpn_pool.acquire().await.expect("Failed to acquire VPN");
        let connect_result = {
            let mut vpn = vpn_instance.lock().await;
            vpn.connect().await
        };

        match connect_result {
            Ok(_) => {
                let vpn = vpn_instance.lock().await;
                println!("✓ VPN connected: {} ({})",
                    vpn.hostname(),
                    vpn.external_ip().unwrap_or("unknown")
                );
                assert!(vpn.is_healthy());
                assert!(vpn.external_ip().is_some());
            }
            Err(e) => {
                eprintln!("VPN connection failed: {}", e);
            }
        }

        // Step 5: Create ChromeDriver pool with VPN
        let driver_pool_result = ChromeDriverPool::new_with_vpn(
            1,
            Some(vpn_pool.clone())
        ).await;

        match driver_pool_result {
            Ok(driver_pool) => {
                assert!(driver_pool.is_vpn_enabled());
                println!("✓ ChromeDriver pool created with VPN binding");
            }
            Err(e) => {
                eprintln!("ChromeDriver pool creation failed: {}", e);
            }
        }

        // Step 6: Cleanup
        vpn_pool.disconnect_all().await.expect("Failed to disconnect VPNs");
        println!("✓ Integration test complete");
    }

    #[test]
    fn test_hostname_extraction() {
        // Test the hostname extraction logic
        let test_cases = vec![
            ("test/ca149.vpnbook.com/config.ovpn", "ca149.vpnbook.com"),
            ("test/us1.vpnbook.com/config.ovpn", "us1.vpnbook.com"),
            ("test/de4.vpnbook.com/config.ovpn", "de4.vpnbook.com"),
        ];

        for (path, expected_hostname) in test_cases {
            let pb = PathBuf::from(path);
            let hostname = pb.parent()
                .and_then(|p| p.file_name())
                .and_then(|n| n.to_str())
                .unwrap_or("unknown");

            assert_eq!(hostname, expected_hostname);
        }
    }

    #[cfg(target_os = "windows")]
    #[test]
    fn test_forcebindip_manager_creation() {
        use event_backtest_engine::ForceBindIpManager;

        match ForceBindIpManager::new() {
            Ok(manager) => {
                println!("✓ ForceBindIP found at: {:?}", manager.path());
                assert!(manager.path().exists());
            }
            Err(e) => {
                eprintln!("ForceBindIP not found (expected in dev): {}", e);
            }
        }
    }

    #[cfg(target_os = "windows")]
    #[test]
    fn test_forcebindip_command_creation() {
        use event_backtest_engine::ForceBindIpManager;
        use std::path::Path;

        if let Ok(manager) = ForceBindIpManager::new() {
            let cmd = manager.create_bound_command(
                "192.168.1.100",
                Path::new("test.exe"),
                &["--arg1", "value1"],
            );

            let cmd_str = format!("{:?}", cmd);
            assert!(cmd_str.contains("192.168.1.100"));
            assert!(cmd_str.contains("test.exe"));
            println!("✓ ForceBindIP command created successfully");
        }
    }

    #[test]
    fn test_config_defaults() {
        use event_backtest_engine::Config;

        let config = Config::default();
        assert_eq!(config.economic_start_date, "2007-02-13");
        assert_eq!(config.corporate_start_date, "2010-01-01");
        assert_eq!(config.economic_lookahead_months, 3);
        assert_eq!(config.max_parallel_instances, 10);
        assert!(!config.enable_vpn_rotation);
        assert_eq!(config.tasks_per_vpn_session, 0);
    }
}

#[cfg(test)]
mod benchmark_tests {
    use super::*;

    #[tokio::test]
    #[ignore] // Performance test
    async fn benchmark_vpn_rotation_overhead() {
        use std::time::Instant;

        // This test measures the overhead of VPN rotation
        let start = Instant::now();

        // Simulate rotation cycle
        // 1. Disconnect (instant)
        // 2. Wait 2 seconds
        // 3. Connect (5-10 seconds)
        // 4. Verify IP (1-2 seconds)

        tokio::time::sleep(tokio::time::Duration::from_secs(2)).await;

        let elapsed = start.elapsed();
        println!("Rotation cycle took: {:?}", elapsed);

        // Typical rotation should complete in under 15 seconds
        assert!(elapsed.as_secs() < 15);
    }

    #[tokio::test]
    #[ignore] // Performance test
    async fn benchmark_parallel_scraping() {
        // This test measures throughput with different parallelism levels
        // Results help tune MAX_PARALLEL_INSTANCES

        let configs = vec![1, 2, 3, 5, 10];

        for &pool_size in &configs {
            println!("Testing with {} parallel instances...", pool_size);

            // Would need actual scraping implementation here
            // For now, just verify pool creation time
            let start = std::time::Instant::now();

            let pool_result = event_backtest_engine::ChromeDriverPool::new(pool_size).await;

            if let Ok(_pool) = pool_result {
                let elapsed = start.elapsed();
                println!(" Pool initialization: {:?}", elapsed);

                // Pool creation should be fast (< 5 seconds per instance)
                assert!(elapsed.as_secs() < pool_size as u64 * 5);
            } else {
                eprintln!(" Skipped - ChromeDriver not available");
            }
        }
    }
}
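Most of these tests are marked #[ignore] because they need ChromeDriver, OpenVPN, or live network access; they can be opted into with `cargo test -- --ignored`. Since the file sits in test/ rather than the tests/ directory that Cargo auto-discovers, it likely also needs a [[test]] target entry in Cargo.toml to be picked up as an integration test.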