Compare commits
2 Commits
c9da56e8e9
...
1bda78897b
| Author | SHA1 | Date | |
|---|---|---|---|
| 1bda78897b | |||
| 470f0922ed |
60
Cargo.lock
generated
60
Cargo.lock
generated
@@ -680,10 +680,10 @@ dependencies = [
|
|||||||
"serde",
|
"serde",
|
||||||
"serde_json",
|
"serde_json",
|
||||||
"tokio",
|
"tokio",
|
||||||
"toml",
|
|
||||||
"tracing",
|
"tracing",
|
||||||
"tracing-subscriber",
|
"tracing-subscriber",
|
||||||
"url",
|
"url",
|
||||||
|
"walkdir",
|
||||||
"yfinance-rs",
|
"yfinance-rs",
|
||||||
"zip",
|
"zip",
|
||||||
]
|
]
|
||||||
@@ -2530,6 +2530,15 @@ version = "1.0.20"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f"
|
checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "same-file"
|
||||||
|
version = "1.0.6"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502"
|
||||||
|
dependencies = [
|
||||||
|
"winapi-util",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "schannel"
|
name = "schannel"
|
||||||
version = "0.1.28"
|
version = "0.1.28"
|
||||||
@@ -2675,15 +2684,6 @@ dependencies = [
|
|||||||
"serde_core",
|
"serde_core",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "serde_spanned"
|
|
||||||
version = "1.0.3"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "e24345aa0fe688594e73770a5f6d1b216508b4f93484c0026d521acd30134392"
|
|
||||||
dependencies = [
|
|
||||||
"serde_core",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "serde_urlencoded"
|
name = "serde_urlencoded"
|
||||||
version = "0.7.1"
|
version = "0.7.1"
|
||||||
@@ -3129,21 +3129,6 @@ dependencies = [
|
|||||||
"tokio",
|
"tokio",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "toml"
|
|
||||||
version = "0.9.8"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "f0dc8b1fb61449e27716ec0e1bdf0f6b8f3e8f6b05391e8497b8b6d7804ea6d8"
|
|
||||||
dependencies = [
|
|
||||||
"indexmap",
|
|
||||||
"serde_core",
|
|
||||||
"serde_spanned",
|
|
||||||
"toml_datetime",
|
|
||||||
"toml_parser",
|
|
||||||
"toml_writer",
|
|
||||||
"winnow",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "toml_datetime"
|
name = "toml_datetime"
|
||||||
version = "0.7.3"
|
version = "0.7.3"
|
||||||
@@ -3174,12 +3159,6 @@ dependencies = [
|
|||||||
"winnow",
|
"winnow",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "toml_writer"
|
|
||||||
version = "1.0.4"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "df8b2b54733674ad286d16267dcfc7a71ed5c776e4ac7aa3c3e2561f7c637bf2"
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "tower"
|
name = "tower"
|
||||||
version = "0.5.2"
|
version = "0.5.2"
|
||||||
@@ -3393,6 +3372,16 @@ version = "0.9.5"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a"
|
checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "walkdir"
|
||||||
|
version = "2.5.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b"
|
||||||
|
dependencies = [
|
||||||
|
"same-file",
|
||||||
|
"winapi-util",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "want"
|
name = "want"
|
||||||
version = "0.3.1"
|
version = "0.3.1"
|
||||||
@@ -3524,6 +3513,15 @@ dependencies = [
|
|||||||
"rustls-pki-types",
|
"rustls-pki-types",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "winapi-util"
|
||||||
|
version = "0.1.11"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22"
|
||||||
|
dependencies = [
|
||||||
|
"windows-sys 0.61.2",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "windows-core"
|
name = "windows-core"
|
||||||
version = "0.62.2"
|
version = "0.62.2"
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
[package]
|
[package]
|
||||||
name = "event_backtest_engine"
|
name = "event_backtest_engine"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
edition = "2021"
|
edition = "2024"
|
||||||
authors = ["Your Name <you@example.com>"]
|
authors = ["Your Name <you@example.com>"]
|
||||||
description = "High-impact economic & corporate earnings data collector for short-event backtesting (overnight/weekend gaps)"
|
description = "High-impact economic & corporate earnings data collector for short-event backtesting (overnight/weekend gaps)"
|
||||||
license = "MIT OR Apache-2.0"
|
license = "MIT OR Apache-2.0"
|
||||||
@@ -30,15 +30,15 @@ csv = "1.3"
|
|||||||
zip = "6.0.0"
|
zip = "6.0.0"
|
||||||
flate2 = "1.1.5"
|
flate2 = "1.1.5"
|
||||||
|
|
||||||
#
|
# Formatting
|
||||||
regex = "1.12.2"
|
regex = "1.12.2"
|
||||||
|
walkdir = "2"
|
||||||
|
|
||||||
# Generating
|
# Generating
|
||||||
rand = "0.9.2"
|
rand = "0.9.2"
|
||||||
|
|
||||||
# Environment handling
|
# Environment handling
|
||||||
dotenvy = "0.15"
|
dotenvy = "0.15"
|
||||||
toml = "0.9.8"
|
|
||||||
|
|
||||||
# Date & time
|
# Date & time
|
||||||
chrono = { version = "0.4", features = ["serde"] }
|
chrono = { version = "0.4", features = ["serde"] }
|
||||||
|
|||||||
BIN
event_backtest_engine.exe
Normal file
BIN
event_backtest_engine.exe
Normal file
Binary file not shown.
@@ -22,12 +22,6 @@ pub struct Config {
|
|||||||
#[serde(default)]
|
#[serde(default)]
|
||||||
pub enable_vpn_rotation: bool,
|
pub enable_vpn_rotation: bool,
|
||||||
|
|
||||||
/// Comma-separated list of VPN servers/country codes to rotate through.
|
|
||||||
/// Example: "US-Free#1,UK-Free#1,JP-Free#1" or "US,JP,DE"
|
|
||||||
/// If empty, VPN rotation is disabled.
|
|
||||||
#[serde(default)]
|
|
||||||
pub vpn_servers: String,
|
|
||||||
|
|
||||||
/// Number of tasks per session before rotating VPN
|
/// Number of tasks per session before rotating VPN
|
||||||
/// If set to 0, rotates VPN between economic and corporate phases
|
/// If set to 0, rotates VPN between economic and corporate phases
|
||||||
#[serde(default = "default_tasks_per_session")]
|
#[serde(default = "default_tasks_per_session")]
|
||||||
@@ -51,7 +45,6 @@ impl Default for Config {
|
|||||||
max_parallel_instances: default_max_parallel_instances(),
|
max_parallel_instances: default_max_parallel_instances(),
|
||||||
max_tasks_per_instance: 0,
|
max_tasks_per_instance: 0,
|
||||||
enable_vpn_rotation: false,
|
enable_vpn_rotation: false,
|
||||||
vpn_servers: String::new(),
|
|
||||||
tasks_per_vpn_session: default_tasks_per_session(),
|
tasks_per_vpn_session: default_tasks_per_session(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -100,9 +93,6 @@ impl Config {
|
|||||||
.parse::<bool>()
|
.parse::<bool>()
|
||||||
.context("Failed to parse ENABLE_VPN_ROTATION as bool")?;
|
.context("Failed to parse ENABLE_VPN_ROTATION as bool")?;
|
||||||
|
|
||||||
let vpn_servers = dotenvy::var("VPN_SERVERS")
|
|
||||||
.unwrap_or_else(|_| String::new());
|
|
||||||
|
|
||||||
let tasks_per_vpn_session: usize = dotenvy::var("TASKS_PER_VPN_SESSION")
|
let tasks_per_vpn_session: usize = dotenvy::var("TASKS_PER_VPN_SESSION")
|
||||||
.unwrap_or_else(|_| "0".to_string())
|
.unwrap_or_else(|_| "0".to_string())
|
||||||
.parse()
|
.parse()
|
||||||
@@ -115,7 +105,6 @@ impl Config {
|
|||||||
max_parallel_instances,
|
max_parallel_instances,
|
||||||
max_tasks_per_instance,
|
max_tasks_per_instance,
|
||||||
enable_vpn_rotation,
|
enable_vpn_rotation,
|
||||||
vpn_servers,
|
|
||||||
tasks_per_vpn_session,
|
tasks_per_vpn_session,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -5,4 +5,11 @@
|
|||||||
|
|
||||||
pub mod config;
|
pub mod config;
|
||||||
pub mod scraper;
|
pub mod scraper;
|
||||||
pub mod util;
|
pub mod util;
|
||||||
|
|
||||||
|
// Re-export commonly used types for convenience
|
||||||
|
pub use config::Config;
|
||||||
|
pub use scraper::webdriver::{ChromeDriverPool, ChromeInstance, ScrapeTask};
|
||||||
|
pub use util::directories::DataPaths;
|
||||||
|
pub use util::logger;
|
||||||
|
pub use util::opnv;
|
||||||
184
src/main.rs
184
src/main.rs
@@ -1,4 +1,5 @@
|
|||||||
// src/main.rs
|
// src/main.rs
|
||||||
|
|
||||||
mod config;
|
mod config;
|
||||||
mod corporate;
|
mod corporate;
|
||||||
mod economic;
|
mod economic;
|
||||||
@@ -7,73 +8,162 @@ mod scraper;
|
|||||||
|
|
||||||
use anyhow::Result;
|
use anyhow::Result;
|
||||||
use config::Config;
|
use config::Config;
|
||||||
|
use scraper::docker_vpn_proxy::{DockerVpnProxyPool, cleanup_all_proxy_containers};
|
||||||
use scraper::webdriver::ChromeDriverPool;
|
use scraper::webdriver::ChromeDriverPool;
|
||||||
use util::directories::DataPaths;
|
use util::directories::DataPaths;
|
||||||
use util::{logger, opnv};
|
use util::{logger, opnv};
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
|
|
||||||
/// The entry point of the application.
|
/// Application entry point
|
||||||
///
|
// src/main.rs
|
||||||
/// This function loads the configuration, initializes a shared ChromeDriver pool,
|
|
||||||
/// fetches the latest VPNBook OpenVPN configurations if VPN rotation is enabled,
|
// ... existing imports ...
|
||||||
/// and sequentially runs the full updates for corporate and economic data.
|
|
||||||
/// Sequential execution helps prevent resource exhaustion from concurrent
|
|
||||||
/// chromedriver instances and avoids spamming the target websites with too many requests.
|
|
||||||
///
|
|
||||||
/// # Errors
|
|
||||||
///
|
|
||||||
/// Returns an error if configuration loading fails, pool initialization fails,
|
|
||||||
/// VPN fetching fails (if enabled), or if either update function encounters an issue
|
|
||||||
/// (e.g., network errors, scraping failures, or chromedriver spawn failures like "program not found").
|
|
||||||
#[tokio::main]
|
#[tokio::main]
|
||||||
async fn main() -> Result<()> {
|
async fn main() -> Result<()> {
|
||||||
|
cleanup_all_proxy_containers().await.ok();
|
||||||
|
|
||||||
|
// Load configuration from .env
|
||||||
let config = Config::load().map_err(|err| {
|
let config = Config::load().map_err(|err| {
|
||||||
println!("Failed to load Config .env: {}", err);
|
eprintln!("Failed to load config: {}", err);
|
||||||
err
|
err
|
||||||
})?;
|
})?;
|
||||||
|
|
||||||
// Initialize paths
|
// Initialize paths and logger
|
||||||
let paths = DataPaths::new(".")?;
|
let paths = DataPaths::new(".")?;
|
||||||
|
logger::init_debug_logger(paths.logs_dir()).await.ok();
|
||||||
// Initialize logger
|
logger::log_info("=== Event Backtest Engine Started ===").await;
|
||||||
logger::init_debug_logger(paths.logs_dir()).await.map_err(|e| {
|
logger::log_info(&format!(
|
||||||
anyhow::anyhow!("Logger initialization failed: {}", e)
|
"Config → parallel_instances: {}, task_limit: {} vpn_rotation: {}",
|
||||||
})?;
|
config.max_parallel_instances,
|
||||||
|
config.max_tasks_per_instance,
|
||||||
|
config.enable_vpn_rotation
|
||||||
|
)).await;
|
||||||
|
|
||||||
logger::log_info("=== Application started ===").await;
|
// === Step 1: Fetch fresh VPNBook credentials and .ovpn files (if rotation enabled) ===
|
||||||
logger::log_info(&format!("Config: economic_start_date={}, corporate_start_date={}, lookahead_months={}, max_parallel_instances={}, enable_vpn_rotation={}",
|
let proxy_pool: Option<Arc<DockerVpnProxyPool>> = if config.enable_vpn_rotation {
|
||||||
config.economic_start_date, config.corporate_start_date, config.economic_lookahead_months, config.max_parallel_instances, config.enable_vpn_rotation)).await;
|
logger::log_info("VPN Rotation Enabled — Fetching latest VPNBook configs").await;
|
||||||
|
|
||||||
// Initialize the shared ChromeDriver pool once
|
// We only need 1 Chrome instance to scrape vpnbook.com (no proxy yet)
|
||||||
let pool_size = config.max_parallel_instances;
|
let temp_pool = Arc::new(ChromeDriverPool::new(1).await?);
|
||||||
logger::log_info(&format!("Initializing ChromeDriver pool with size: {}", pool_size)).await;
|
let (username, password, _files) = opnv::fetch_vpnbook_configs(&temp_pool, paths.cache_dir()).await?;
|
||||||
|
|
||||||
let pool = Arc::new(ChromeDriverPool::new(pool_size).await?);
|
|
||||||
logger::log_info("✓ ChromeDriver pool initialized successfully").await;
|
|
||||||
|
|
||||||
// Fetch VPNBook configs if VPN rotation is enabled
|
logger::log_info(&format!("VPNBook credentials → User: {}", username)).await;
|
||||||
if config.enable_vpn_rotation {
|
|
||||||
logger::log_info("--- Fetching latest VPNBook OpenVPN configurations ---").await;
|
// Count how many distinct servers (subfolders) we have in cache/openvpn/
|
||||||
let (username, password, files) =
|
let server_count = std::fs::read_dir(paths.cache_openvpn_dir())?
|
||||||
util::opnv::fetch_vpnbook_configs(&pool, paths.cache_dir()).await?;
|
.filter(|e| e.as_ref().unwrap().path().is_dir())
|
||||||
logger::log_info(&format!("Fetched VPN username: {}, password: {}", username, password)).await;
|
.count();
|
||||||
for file in &files {
|
|
||||||
logger::log_info(&format!("Extracted OVPN: {:?}", file)).await;
|
if server_count == 0 {
|
||||||
|
logger::log_warn("No VPN servers found — continuing without VPN").await;
|
||||||
|
None
|
||||||
|
} else {
|
||||||
|
logger::log_info(&format!("Found {} VPN servers — starting Docker proxy containers", server_count)).await;
|
||||||
|
|
||||||
|
let pp = Arc::new(
|
||||||
|
DockerVpnProxyPool::new(paths.cache_openvpn_dir(), username, password).await?
|
||||||
|
);
|
||||||
|
|
||||||
|
// Verify all proxies are working before proceeding
|
||||||
|
logger::log_info("Verifying all proxy connections...").await;
|
||||||
|
let mut all_working = true;
|
||||||
|
for i in 0..pp.num_proxies() {
|
||||||
|
match pp.test_proxy_connection(i).await {
|
||||||
|
Ok(ip) => {
|
||||||
|
logger::log_info(&format!(" Proxy {}: working with IP: {}", i + 1, ip)).await;
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
logger::log_error(&format!(" Proxy {}: FAILED - {}", i + 1, e)).await;
|
||||||
|
all_working = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if !all_working {
|
||||||
|
logger::log_warn("Some proxies failed, but continuing with working ones...").await;
|
||||||
|
} else {
|
||||||
|
logger::log_info("All proxies verified and ready!").await;
|
||||||
|
}
|
||||||
|
|
||||||
|
logger::log_info(&format!("All {} Docker proxy containers started and ready", pp.num_proxies())).await;
|
||||||
|
Some(pp)
|
||||||
}
|
}
|
||||||
// Optionally, store username/password for rotation use (e.g., in a file or global state)
|
} else {
|
||||||
// For now, just log them; extend as needed for rotation integration
|
logger::log_info("VPN rotation disabled — using direct connection").await;
|
||||||
|
None
|
||||||
|
};
|
||||||
|
|
||||||
|
// === Step 2: Initialize the main ChromeDriver pool (with proxy if enabled) ===
|
||||||
|
let pool_size = config.max_parallel_instances;
|
||||||
|
let task_limit = config.max_tasks_per_instance;
|
||||||
|
|
||||||
|
logger::log_info(&format!("Creating ChromeDriver pool with {} instances...", pool_size)).await;
|
||||||
|
|
||||||
|
let pool = Arc::new(
|
||||||
|
if task_limit > 0 {
|
||||||
|
ChromeDriverPool::new_with_proxy_and_task_limit(pool_size, proxy_pool.clone(), task_limit).await?
|
||||||
|
} else {
|
||||||
|
ChromeDriverPool::new_with_proxy(pool_size, proxy_pool.clone()).await?
|
||||||
|
}
|
||||||
|
);
|
||||||
|
|
||||||
|
logger::log_info(&format!("ChromeDriver pool ready with {} instances", pool_size)).await;
|
||||||
|
|
||||||
|
// === Step 3: Graceful Ctrl+C handler ===
|
||||||
|
{
|
||||||
|
let pool_clone = Arc::clone(&pool);
|
||||||
|
let proxy_clone = proxy_pool.clone();
|
||||||
|
|
||||||
|
tokio::spawn(async move {
|
||||||
|
tokio::signal::ctrl_c().await.ok();
|
||||||
|
|
||||||
|
logger::log_info("Ctrl+C received — shutting down gracefully...").await;
|
||||||
|
|
||||||
|
// Now works: &*pool_clone derefs Arc → &ChromeDriverPool
|
||||||
|
if let Err(e) = (&*pool_clone).shutdown().await {
|
||||||
|
logger::log_error(&format!("Error during pool shutdown: {}", e)).await;
|
||||||
|
}
|
||||||
|
|
||||||
|
if let Some(pp) = proxy_clone {
|
||||||
|
if let Err(e) = pp.shutdown().await {
|
||||||
|
logger::log_warn(&format!("Failed to stop Docker containers: {}", e)).await;
|
||||||
|
} else {
|
||||||
|
logger::log_info("All Docker VPN containers stopped").await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let _ = cleanup_all_proxy_containers().await;
|
||||||
|
|
||||||
|
std::process::exit(0);
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
// Run economic update first, passing the shared pool
|
// === Step 4: Run the actual scraping jobs ===
|
||||||
logger::log_info("--- Starting economic data update ---").await;
|
logger::log_info("--- Starting ECONOMIC data update ---").await;
|
||||||
economic::run_full_update(&config, &pool).await?;
|
economic::run_full_update(&config, &pool).await?;
|
||||||
logger::log_info("✓ Economic data update completed").await;
|
logger::log_info("Economic update completed").await;
|
||||||
|
|
||||||
// Then run corporate update, passing the shared pool
|
logger::log_info("--- Starting CORPORATE data update ---").await;
|
||||||
logger::log_info("--- Starting corporate data update ---").await;
|
|
||||||
corporate::run_full_update(&config, &pool).await?;
|
corporate::run_full_update(&config, &pool).await?;
|
||||||
logger::log_info("✓ Corporate data update completed").await;
|
logger::log_info("Corporate update completed").await;
|
||||||
|
|
||||||
logger::log_info("=== Application completed successfully ===").await;
|
// === Step 5: Final cleanup ===
|
||||||
|
logger::log_info("Shutting down ChromeDriver pool...").await;
|
||||||
|
pool.shutdown().await?;
|
||||||
|
|
||||||
|
if let Some(pp) = proxy_pool {
|
||||||
|
logger::log_info("Stopping Docker VPN proxy containers...").await;
|
||||||
|
pp.shutdown().await?;
|
||||||
|
// CLEANUP ANY LEFTOVER CONTAINERS FROM PREVIOUS RUNS
|
||||||
|
cleanup_all_proxy_containers().await.ok();
|
||||||
|
}
|
||||||
|
|
||||||
|
logger::log_info("=== Application finished successfully ===").await;
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
memory allocation of 4294967296 bytes failed
|
||||||
|
error: process didn't exit successfully: `target\debug\event_backtest_engine.exe` (exit code: 0xc0000409, STATUS_STACK_BUFFER_OVERRUN)
|
||||||
|
*/
|
||||||
407
src/scraper/docker_vpn_proxy.rs
Normal file
407
src/scraper/docker_vpn_proxy.rs
Normal file
@@ -0,0 +1,407 @@
|
|||||||
|
use anyhow::{anyhow, Context, Result};
|
||||||
|
use futures::future::join_all;
|
||||||
|
use std::{path::{Path, PathBuf}, time::Duration};
|
||||||
|
use tokio::{process::Command, time::{sleep}};
|
||||||
|
use walkdir::WalkDir;
|
||||||
|
|
||||||
|
pub struct DockerVpnProxyPool {
|
||||||
|
container_names: Vec<String>,
|
||||||
|
proxy_ports: Vec<u16>, // e.g., [10801, 10802, ...]
|
||||||
|
}
|
||||||
|
|
||||||
|
impl DockerVpnProxyPool {
|
||||||
|
pub async fn new(ovpn_dir: &Path, username: String, password: String) -> Result<Self> {
|
||||||
|
// Count hostnames (subdirs in ovpn_dir)
|
||||||
|
let hostnames: Vec<_> = std::fs::read_dir(ovpn_dir)?
|
||||||
|
.filter_map(Result::ok)
|
||||||
|
.filter(|e| e.path().is_dir())
|
||||||
|
.map(|e| e.file_name().into_string().unwrap())
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
let num_servers = hostnames.len();
|
||||||
|
if num_servers == 0 {
|
||||||
|
return Err(anyhow!("No VPN hostnames found in {:?}", ovpn_dir));
|
||||||
|
}
|
||||||
|
|
||||||
|
crate::util::logger::log_info(&format!("Found {} VPN hostnames", num_servers)).await;
|
||||||
|
|
||||||
|
let mut container_names = Vec::with_capacity(num_servers);
|
||||||
|
let mut proxy_ports = Vec::with_capacity(num_servers);
|
||||||
|
let base_port: u16 = 10800;
|
||||||
|
|
||||||
|
// === STEP 1: Start ALL containers first ===
|
||||||
|
for (i, hostname) in hostnames.iter().enumerate() {
|
||||||
|
// Pick tcp443.ovpn if exists, else first .ovpn
|
||||||
|
let hostname_dir = ovpn_dir.join(hostname);
|
||||||
|
let mut ovpn_path: Option<PathBuf> = None;
|
||||||
|
for entry in WalkDir::new(&hostname_dir).max_depth(1) {
|
||||||
|
let entry = entry?;
|
||||||
|
if entry.path().extension().map_or(false, |ext| ext == "ovpn") {
|
||||||
|
if entry.file_name().to_str().unwrap_or("").contains("tcp443") {
|
||||||
|
ovpn_path = Some(entry.path().to_path_buf());
|
||||||
|
break;
|
||||||
|
} else if ovpn_path.is_none() {
|
||||||
|
ovpn_path = Some(entry.path().to_path_buf());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let ovpn_path = ovpn_path.ok_or_else(|| anyhow!("No .ovpn found for {}", hostname))?;
|
||||||
|
|
||||||
|
let name = format!("vpn-proxy-{}", i);
|
||||||
|
let port = base_port + i as u16 + 1;
|
||||||
|
|
||||||
|
// Clean up any existing container with the same name
|
||||||
|
let _ = Command::new("docker")
|
||||||
|
.args(["rm", "-f", &name])
|
||||||
|
.status()
|
||||||
|
.await;
|
||||||
|
|
||||||
|
// Run Docker container
|
||||||
|
let status = Command::new("docker")
|
||||||
|
.args([
|
||||||
|
"run", "-d",
|
||||||
|
"--name", &name,
|
||||||
|
"--cap-add=NET_ADMIN",
|
||||||
|
"--device", "/dev/net/tun",
|
||||||
|
"--sysctl", "net.ipv4.ip_forward=1",
|
||||||
|
"-v", &format!("{}:/vpn/config.ovpn", ovpn_path.display()),
|
||||||
|
"-e", &format!("VPN_USERNAME={}", username),
|
||||||
|
"-e", &format!("VPN_PASSWORD={}", password),
|
||||||
|
"-p", &format!("{}:1080", port),
|
||||||
|
"rust-vpn-proxy",
|
||||||
|
])
|
||||||
|
.status()
|
||||||
|
.await
|
||||||
|
.context("Failed to run Docker")?;
|
||||||
|
|
||||||
|
if !status.success() {
|
||||||
|
return Err(anyhow!("Docker run failed for {}", name));
|
||||||
|
}
|
||||||
|
|
||||||
|
crate::util::logger::log_info(&format!("Started container {} on port {} (waiting for VPN...)", name, port)).await;
|
||||||
|
|
||||||
|
container_names.push(name);
|
||||||
|
proxy_ports.push(port);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Brief pause to let containers start
|
||||||
|
sleep(Duration::from_secs(8)).await;
|
||||||
|
crate::util::logger::log_info(&format!("All {} containers started, beginning health checks...", container_names.len())).await;
|
||||||
|
|
||||||
|
// === STEP 2: Test ALL proxies in parallel with 10-second intervals ===
|
||||||
|
let results = Self::test_all_proxies_parallel(&container_names, &proxy_ports).await;
|
||||||
|
|
||||||
|
// Filter out failed containers
|
||||||
|
let mut working_containers = Vec::new();
|
||||||
|
let mut working_ports = Vec::new();
|
||||||
|
let mut failed_count = 0;
|
||||||
|
|
||||||
|
for (i, (container_name, port)) in container_names.into_iter().zip(proxy_ports.into_iter()).enumerate() {
|
||||||
|
match &results[i] {
|
||||||
|
Ok(Some(ip)) => {
|
||||||
|
crate::util::logger::log_info(&format!("✓ Container {} on port {} ready with IP: {}",
|
||||||
|
container_name, port, ip)).await;
|
||||||
|
working_containers.push(container_name);
|
||||||
|
working_ports.push(port);
|
||||||
|
}
|
||||||
|
Ok(None) => {
|
||||||
|
crate::util::logger::log_warn(&format!("✓ Container {} on port {} ready but IP detection failed",
|
||||||
|
container_name, port)).await;
|
||||||
|
working_containers.push(container_name);
|
||||||
|
working_ports.push(port);
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
// Get container logs to debug
|
||||||
|
let logs = Command::new("docker")
|
||||||
|
.args(["logs", "--tail", "20", &container_name])
|
||||||
|
.output()
|
||||||
|
.await
|
||||||
|
.ok()
|
||||||
|
.and_then(|output| String::from_utf8_lossy(&output.stdout).to_string().into());
|
||||||
|
|
||||||
|
crate::util::logger::log_error(&format!("✗ Container {} on port {} failed: {}. Logs: {:?}",
|
||||||
|
container_name, port, e, logs)).await;
|
||||||
|
failed_count += 1;
|
||||||
|
// Clean up failed container
|
||||||
|
let _ = Self::cleanup_container(&container_name).await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if working_containers.is_empty() {
|
||||||
|
return Err(anyhow!("All {} VPN proxy containers failed to start", num_servers));
|
||||||
|
}
|
||||||
|
|
||||||
|
crate::util::logger::log_info(&format!("Started {}/{} VPN proxy containers successfully",
|
||||||
|
working_containers.len(), num_servers)).await;
|
||||||
|
|
||||||
|
if failed_count > 0 {
|
||||||
|
crate::util::logger::log_warn(&format!("{} containers failed and were cleaned up", failed_count)).await;
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(Self {
|
||||||
|
container_names: working_containers,
|
||||||
|
proxy_ports: working_ports,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Test all proxies in parallel with 10-second intervals between tests
|
||||||
|
async fn test_all_proxies_parallel(container_names: &[String], proxy_ports: &[u16]) -> Vec<Result<Option<String>>> {
|
||||||
|
let mut tasks = Vec::new();
|
||||||
|
|
||||||
|
for (i, (container_name, port)) in container_names.iter().zip(proxy_ports.iter()).enumerate() {
|
||||||
|
let name = container_name.clone();
|
||||||
|
let port = *port;
|
||||||
|
|
||||||
|
tasks.push(tokio::spawn(async move {
|
||||||
|
// Try up to 6 times with 10-second intervals (total 60 seconds)
|
||||||
|
for attempt in 1..=6 {
|
||||||
|
crate::util::logger::log_info(&format!("Testing proxy {} (port {}) - Attempt {}/6",
|
||||||
|
name, port, attempt)).await;
|
||||||
|
|
||||||
|
match Self::test_single_proxy(port).await {
|
||||||
|
Ok(Some(ip)) => {
|
||||||
|
return Ok(Some(ip));
|
||||||
|
}
|
||||||
|
Ok(None) => {
|
||||||
|
// Connection works but IP detection failed
|
||||||
|
return Ok(None);
|
||||||
|
}
|
||||||
|
Err(e) if attempt < 6 => {
|
||||||
|
crate::util::logger::log_info(&format!("Attempt {}/6 for {}: {} - retrying in 10s",
|
||||||
|
attempt, name, e)).await;
|
||||||
|
sleep(Duration::from_secs(10)).await;
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
return Err(anyhow!("Failed after 6 attempts: {}", e));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Err(anyhow!("Unexpected exit from retry loop"))
|
||||||
|
}));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Wait for all tasks to complete
|
||||||
|
join_all(tasks)
|
||||||
|
.await
|
||||||
|
.into_iter()
|
||||||
|
.map(|result| match result {
|
||||||
|
Ok(inner) => inner,
|
||||||
|
Err(e) => Err(anyhow!("Task panicked: {}", e)),
|
||||||
|
})
|
||||||
|
.collect()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Test a single proxy connection
|
||||||
|
async fn test_single_proxy(port: u16) -> Result<Option<String>> {
|
||||||
|
use std::io::{Read, Write};
|
||||||
|
use std::net::TcpStream;
|
||||||
|
use std::time::Duration as StdDuration;
|
||||||
|
|
||||||
|
// First, test SOCKS5 handshake directly
|
||||||
|
crate::util::logger::log_info(&format!("Testing SOCKS5 handshake on port {}...", port)).await;
|
||||||
|
|
||||||
|
// Use spawn_blocking for synchronous I/O
|
||||||
|
let test_result = tokio::task::spawn_blocking(move || {
|
||||||
|
// Connect to SOCKS5 proxy
|
||||||
|
let mut stream = match TcpStream::connect_timeout(
|
||||||
|
&format!("127.0.0.1:{}", port).parse().unwrap(),
|
||||||
|
StdDuration::from_secs(5)
|
||||||
|
) {
|
||||||
|
Ok(stream) => stream,
|
||||||
|
Err(e) => return Err(anyhow!("Failed to connect: {}", e)),
|
||||||
|
};
|
||||||
|
|
||||||
|
// Send SOCKS5 greeting: version 5, 1 method (no auth)
|
||||||
|
let greeting: [u8; 3] = [0x05, 0x01, 0x00]; // SOCKS5, 1 method, no auth
|
||||||
|
if let Err(e) = stream.write_all(&greeting) {
|
||||||
|
return Err(anyhow!("Failed to send greeting: {}", e));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Read response
|
||||||
|
let mut response = [0u8; 2];
|
||||||
|
if let Err(e) = stream.read_exact(&mut response) {
|
||||||
|
return Err(anyhow!("Failed to read response: {}", e));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check response: should be [0x05, 0x00] for no auth required
|
||||||
|
if response[0] != 0x05 || response[1] != 0x00 {
|
||||||
|
return Err(anyhow!("Unexpected SOCKS5 response: {:?}", response));
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}).await;
|
||||||
|
|
||||||
|
match test_result {
|
||||||
|
Ok(Ok(())) => {
|
||||||
|
crate::util::logger::log_info(&format!("✓ SOCKS5 proxy on port {} accepts connections", port)).await;
|
||||||
|
|
||||||
|
// Try to get IP through proxy using curl (fallback method)
|
||||||
|
let curl_result = tokio::process::Command::new("curl")
|
||||||
|
.args([
|
||||||
|
"-s",
|
||||||
|
"--socks5", &format!("localhost:{}", port),
|
||||||
|
"--max-time", "10",
|
||||||
|
"https://checkip.amazonaws.com"
|
||||||
|
])
|
||||||
|
.output()
|
||||||
|
.await;
|
||||||
|
|
||||||
|
match curl_result {
|
||||||
|
Ok(output) if output.status.success() => {
|
||||||
|
let ip = String::from_utf8_lossy(&output.stdout).trim().to_string();
|
||||||
|
if Self::is_valid_ip(&ip) {
|
||||||
|
crate::util::logger::log_info(&format!("✓ Got IP via proxy: {}", ip)).await;
|
||||||
|
return Ok(Some(ip));
|
||||||
|
} else {
|
||||||
|
crate::util::logger::log_info(&format!("✓ Proxy works, invalid IP format: {}", ip)).await;
|
||||||
|
return Ok(None);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
_ => {
|
||||||
|
// Proxy accepts connections but curl failed - still acceptable
|
||||||
|
crate::util::logger::log_info(&format!("✓ Proxy accepts connections (curl test failed)")).await;
|
||||||
|
return Ok(None);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(Err(e)) => {
|
||||||
|
return Err(anyhow!("SOCKS5 test failed: {}", e));
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
return Err(anyhow!("Task failed: {}", e));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Clean up a failed container
|
||||||
|
async fn cleanup_container(container_name: &str) -> Result<()> {
|
||||||
|
let _ = Command::new("docker")
|
||||||
|
.args(["stop", container_name])
|
||||||
|
.status()
|
||||||
|
.await;
|
||||||
|
|
||||||
|
let _ = Command::new("docker")
|
||||||
|
.args(["rm", container_name])
|
||||||
|
.status()
|
||||||
|
.await;
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn is_valid_ip(ip: &str) -> bool {
|
||||||
|
let parts: Vec<&str> = ip.split('.').collect();
|
||||||
|
if parts.len() != 4 {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
for part in parts {
|
||||||
|
if let Ok(num) = part.parse::<u8>() {
|
||||||
|
if part != num.to_string() {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
true
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Test if a specific proxy is working
|
||||||
|
pub async fn test_proxy_connection(&self, index: usize) -> Result<String> {
|
||||||
|
let port = self.proxy_ports[index];
|
||||||
|
let proxy_url = format!("socks5://localhost:{}", port);
|
||||||
|
|
||||||
|
let client = reqwest::Client::builder()
|
||||||
|
.proxy(reqwest::Proxy::all(&proxy_url)?)
|
||||||
|
.timeout(Duration::from_secs(10))
|
||||||
|
.build()?;
|
||||||
|
|
||||||
|
let response = client.get("http://checkip.amazonaws.com")
|
||||||
|
.send()
|
||||||
|
.await?
|
||||||
|
.text()
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
Ok(response.trim().to_string())
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn get_proxy_url(&self, index: usize) -> String {
|
||||||
|
let port = self.proxy_ports[index % self.proxy_ports.len()];
|
||||||
|
format!("socks5://localhost:{}", port)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn num_proxies(&self) -> usize {
|
||||||
|
self.proxy_ports.len()
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn shutdown(&self) -> Result<()> {
|
||||||
|
crate::util::logger::log_info(&format!("Shutting down {} Docker proxy containers...",
|
||||||
|
self.container_names.len())).await;
|
||||||
|
|
||||||
|
for name in &self.container_names {
|
||||||
|
let _ = Command::new("docker")
|
||||||
|
.args(["stop", name])
|
||||||
|
.status()
|
||||||
|
.await;
|
||||||
|
let _ = Command::new("docker")
|
||||||
|
.args(["rm", name])
|
||||||
|
.status()
|
||||||
|
.await;
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn cleanup_all_proxy_containers() -> Result<()> {
|
||||||
|
// Step 1: List all container IDs that match our pattern
|
||||||
|
let output = Command::new("docker")
|
||||||
|
.args(["ps", "-a", "--format", "{{.ID}} {{.Names}} {{.Image}}"])
|
||||||
|
.output()
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
let stdout = String::from_utf8_lossy(&output.stdout);
|
||||||
|
|
||||||
|
let mut containers_to_kill = Vec::new();
|
||||||
|
|
||||||
|
for line in stdout.lines() {
|
||||||
|
let parts: Vec<&str> = line.split_whitespace().collect();
|
||||||
|
if parts.len() >= 2 {
|
||||||
|
let name_or_id = parts[0];
|
||||||
|
let name = parts[1];
|
||||||
|
let image = if parts.len() >= 3 { parts[2] } else { "" };
|
||||||
|
|
||||||
|
// Match by name prefix OR by image name
|
||||||
|
if name.starts_with("vpn-proxy-") || image.contains("rust-vpn-proxy") {
|
||||||
|
containers_to_kill.push(name_or_id.to_string());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if containers_to_kill.is_empty() {
|
||||||
|
crate::util::logger::log_info("No old rust-vpn-proxy containers found").await;
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
|
||||||
|
// Step 2: Kill and remove them all at once
|
||||||
|
let status = Command::new("docker")
|
||||||
|
.arg("rm")
|
||||||
|
.arg("-f")
|
||||||
|
.args(&containers_to_kill)
|
||||||
|
.status()
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
if status.success() {
|
||||||
|
crate::util::logger::log_info(&format!(
|
||||||
|
"Successfully removed {} old rust-vpn-proxy container(s)",
|
||||||
|
containers_to_kill.len()
|
||||||
|
))
|
||||||
|
.await;
|
||||||
|
} else {
|
||||||
|
crate::util::logger::log_warn("Some containers may still remain (non-critical)").await;
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
@@ -1 +1,2 @@
|
|||||||
pub mod webdriver;
|
pub mod webdriver;
|
||||||
|
pub mod docker_vpn_proxy;
|
||||||
@@ -8,211 +8,264 @@ use std::process::Stdio;
|
|||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
use tokio::io::{AsyncBufReadExt, BufReader};
|
use tokio::io::{AsyncBufReadExt, BufReader};
|
||||||
use tokio::process::{Child, Command};
|
use tokio::process::{Child, Command};
|
||||||
|
use tokio::task::JoinHandle;
|
||||||
use tokio::sync::{Mutex, Semaphore};
|
use tokio::sync::{Mutex, Semaphore};
|
||||||
use tokio::time::{sleep, timeout, Duration};
|
use tokio::time::{sleep, timeout, Duration};
|
||||||
|
use crate::scraper::docker_vpn_proxy::{DockerVpnProxyPool};
|
||||||
|
|
||||||
/// Manages a pool of ChromeDriver instances for parallel scraping.
|
/// Manages a pool of ChromeDriver instances for parallel scraping with optional VPN binding.
|
||||||
///
|
|
||||||
/// This struct maintains multiple ChromeDriver processes and allows controlled
|
|
||||||
/// concurrent access via a semaphore. Instances are reused across tasks to avoid
|
|
||||||
/// the overhead of spawning new processes.
|
|
||||||
pub struct ChromeDriverPool {
|
pub struct ChromeDriverPool {
|
||||||
instances: Vec<Arc<Mutex<ChromeInstance>>>,
|
instances: Vec<Arc<Mutex<ChromeInstance>>>,
|
||||||
semaphore: Arc<Semaphore>,
|
semaphore: Arc<Semaphore>,
|
||||||
tasks_per_instance: usize,
|
/// Optional Docker-based proxy pool (one proxy per Chrome instance)
|
||||||
|
proxy_pool: Option<Arc<DockerVpnProxyPool>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl ChromeDriverPool {
|
impl ChromeDriverPool {
|
||||||
/// Creates a new pool with the specified number of ChromeDriver instances.
|
/// Creates a new pool without any proxy (direct connection).
|
||||||
///
|
|
||||||
/// # Arguments
|
|
||||||
/// * `pool_size` - Number of concurrent ChromeDriver instances to maintain
|
|
||||||
pub async fn new(pool_size: usize) -> Result<Self> {
|
pub async fn new(pool_size: usize) -> Result<Self> {
|
||||||
|
Self::new_with_proxy_and_task_limit(pool_size, None, 0).await
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Creates a new pool with task-per-instance limit but no proxy.
|
||||||
|
pub async fn new_with_task_limit(pool_size: usize, max_tasks_per_instance: usize) -> Result<Self> {
|
||||||
|
Self::new_with_proxy_and_task_limit(pool_size, None, max_tasks_per_instance).await
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Creates a new pool where each Chrome instance uses a different SOCKS5 proxy from the Docker pool.
|
||||||
|
pub async fn new_with_proxy(
|
||||||
|
pool_size: usize,
|
||||||
|
proxy_pool: Option<Arc<DockerVpnProxyPool>>,
|
||||||
|
) -> Result<Self> {
|
||||||
|
Self::new_with_proxy_and_task_limit(pool_size, proxy_pool, 0).await
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Full constructor: supports proxy + task limiting.
|
||||||
|
pub async fn new_with_proxy_and_task_limit(
|
||||||
|
pool_size: usize,
|
||||||
|
proxy_pool: Option<Arc<DockerVpnProxyPool>>,
|
||||||
|
max_tasks_per_instance: usize,
|
||||||
|
) -> Result<Self> {
|
||||||
let mut instances = Vec::with_capacity(pool_size);
|
let mut instances = Vec::with_capacity(pool_size);
|
||||||
|
|
||||||
println!(
|
crate::util::logger::log_info(&format!(
|
||||||
"Initializing ChromeDriver pool with {} instances...",
|
"Initializing ChromeDriver pool with {} instances{}...",
|
||||||
pool_size
|
pool_size,
|
||||||
);
|
if proxy_pool.is_some() { " (each using a unique Docker SOCKS5 proxy)" } else { "" }
|
||||||
|
))
|
||||||
|
.await;
|
||||||
|
|
||||||
for i in 0..pool_size {
|
for i in 0..pool_size {
|
||||||
match ChromeInstance::new().await {
|
let proxy_url = proxy_pool
|
||||||
Ok(instance) => {
|
.as_ref()
|
||||||
println!(" ✓ Instance {} ready", i + 1);
|
.map(|pp| pp.get_proxy_url(i));
|
||||||
instances.push(Arc::new(Mutex::new(instance)));
|
|
||||||
}
|
let instance = ChromeInstance::new(proxy_url, max_tasks_per_instance).await?;
|
||||||
Err(e) => {
|
|
||||||
eprintln!(" ✗ Failed to create instance {}: {}", i + 1, e);
|
crate::util::logger::log_info(&format!(" Instance {} ready", i + 1)).await;
|
||||||
// Clean up already created instances
|
instances.push(Arc::new(Mutex::new(instance)));
|
||||||
drop(instances);
|
|
||||||
return Err(e);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(Self {
|
Ok(Self {
|
||||||
instances,
|
instances,
|
||||||
semaphore: Arc::new(Semaphore::new(pool_size)),
|
semaphore: Arc::new(Semaphore::new(pool_size)),
|
||||||
tasks_per_instance: 0,
|
proxy_pool,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Executes a scrape task using an available instance from the pool.
|
/// Execute a scraping task using an available instance from the pool.
|
||||||
pub async fn execute<T, F, Fut>(&self, url: String, parse: F) -> Result<T>
|
pub async fn execute<T, F, Fut>(&self, url: String, parse: F) -> Result<T>
|
||||||
where
|
where
|
||||||
T: Send + 'static,
|
T: Send + 'static,
|
||||||
F: FnOnce(Client) -> Fut + Send + 'static,
|
F: FnOnce(Client) -> Fut + Send + 'static,
|
||||||
Fut: std::future::Future<Output = Result<T>> + Send + 'static,
|
Fut: std::future::Future<Output = Result<T>> + Send,
|
||||||
{
|
{
|
||||||
// Acquire semaphore permit
|
let _permit = self.semaphore.acquire().await.map_err(|_| anyhow!("Pool closed"))?;
|
||||||
let _permit = self
|
|
||||||
.semaphore
|
|
||||||
.acquire()
|
|
||||||
.await
|
|
||||||
.map_err(|_| anyhow!("Semaphore closed"))?;
|
|
||||||
|
|
||||||
// Find an available instance (round-robin or first available)
|
// Round-robin selection
|
||||||
let instance = self.instances[0].clone(); // Simple: use first, could be round-robin
|
let index = rand::random_range(..self.instances.len());
|
||||||
|
let instance = self.instances[index].clone();
|
||||||
let mut guard = instance.lock().await;
|
let mut guard = instance.lock().await;
|
||||||
|
|
||||||
// Create a new session for this task
|
guard.increment_task_count();
|
||||||
|
|
||||||
|
if guard.max_tasks_per_instance > 0 {
|
||||||
|
crate::util::logger::log_info(&format!(
|
||||||
|
"Instance task count: {}/{}",
|
||||||
|
guard.get_task_count(),
|
||||||
|
guard.max_tasks_per_instance
|
||||||
|
))
|
||||||
|
.await;
|
||||||
|
}
|
||||||
|
|
||||||
let client = guard.new_session().await?;
|
let client = guard.new_session().await?;
|
||||||
|
|
||||||
// Release lock while we do the actual scraping
|
drop(guard); // release lock early
|
||||||
drop(guard);
|
|
||||||
|
|
||||||
// Navigate and parse
|
crate::util::logger::log_info(&format!("Scraping {} ...", url)).await;
|
||||||
client.goto(&url).await.context("Failed to navigate")?;
|
client.goto(&url).await.context("Navigation failed")?;
|
||||||
let result = timeout(Duration::from_secs(60), parse(client))
|
|
||||||
|
let result = timeout(Duration::from_secs(90), parse(client))
|
||||||
.await
|
.await
|
||||||
.context("Parse function timed out after 60s")??;
|
.context("Parse timeout")??;
|
||||||
|
|
||||||
Ok(result)
|
Ok(result)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Gracefully shut down all ChromeDriver processes and Docker proxy containers.
|
||||||
|
pub async fn shutdown(&self) -> Result<()> {
|
||||||
|
for inst in &self.instances {
|
||||||
|
let mut guard = inst.lock().await;
|
||||||
|
guard.shutdown().await?;
|
||||||
|
}
|
||||||
|
|
||||||
|
if let Some(pp) = &self.proxy_pool {
|
||||||
|
pp.shutdown().await?;
|
||||||
|
crate::util::logger::log_info("All Docker VPN proxy containers stopped").await;
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
pub fn get_number_of_instances(&self) -> usize {
|
pub fn get_number_of_instances(&self) -> usize {
|
||||||
self.instances.len()
|
self.instances.len()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Represents a single instance of chromedriver process.
|
/// Represents a single instance of chromedriver process, optionally bound to a VPN.
|
||||||
pub struct ChromeInstance {
|
pub struct ChromeInstance {
|
||||||
process: Child,
|
|
||||||
base_url: String,
|
base_url: String,
|
||||||
|
process: Child,
|
||||||
|
stderr_log: Option<JoinHandle<()>>,
|
||||||
|
task_count: usize,
|
||||||
|
max_tasks_per_instance: usize,
|
||||||
|
proxy_url: Option<String>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl ChromeInstance {
|
impl ChromeInstance {
|
||||||
/// Creates a new ChromeInstance by spawning chromedriver with random port.
|
pub async fn new(proxy_url: Option<String>, max_tasks_per_instance: usize) -> Result<Self> {
|
||||||
///
|
let (base_url, process, stderr_handle) = Self::spawn_chromedriver().await?;
|
||||||
/// This spawns `chromedriver --port=0` to avoid port conflicts, reads stdout to extract
|
|
||||||
/// the listening address, and waits for the success message. If timeout occurs or
|
Ok(Self {
|
||||||
/// spawning fails, returns an error with context.
|
base_url,
|
||||||
///
|
process,
|
||||||
/// # Errors
|
stderr_log: Some(stderr_handle),
|
||||||
///
|
task_count: 0,
|
||||||
/// Returns an error if chromedriver fails to spawn (e.g., not in PATH, version mismatch),
|
max_tasks_per_instance,
|
||||||
/// if the process exits early, or if the address/success message isn't found within 30s.
|
proxy_url,
|
||||||
pub async fn new() -> Result<Self> {
|
})
|
||||||
let mut command = Command::new("chromedriver-win64/chromedriver.exe");
|
}
|
||||||
command
|
|
||||||
.arg("--port=0") // Use random available port to support pooling
|
pub async fn new_session(&self) -> Result<Client> {
|
||||||
|
ClientBuilder::native()
|
||||||
|
.capabilities(self.chrome_args())
|
||||||
|
.connect(&self.base_url)
|
||||||
|
.await
|
||||||
|
.context("Failed to connect to ChromeDriver")
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn increment_task_count(&mut self) {
|
||||||
|
self.task_count += 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn get_task_count(&self) -> usize {
|
||||||
|
self.task_count
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn shutdown(&mut self) -> Result<()> {
|
||||||
|
if let Some(handle) = self.stderr_log.take() {
|
||||||
|
handle.abort();
|
||||||
|
let _ = handle.await;
|
||||||
|
}
|
||||||
|
|
||||||
|
let _ = self.process.start_kill();
|
||||||
|
let _ = self.process.wait().await;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Spawns the actual `chromedriver` binary and waits for it to become ready.
|
||||||
|
async fn spawn_chromedriver() -> Result<(String, Child, JoinHandle<()>)> {
|
||||||
|
let mut process = Command::new("chromedriver-win64/chromedriver.exe")
|
||||||
|
.arg("--port=0") // let OS choose free port
|
||||||
.stdout(Stdio::piped())
|
.stdout(Stdio::piped())
|
||||||
.stderr(Stdio::piped());
|
.stderr(Stdio::piped())
|
||||||
|
|
||||||
let mut process = command
|
|
||||||
.spawn()
|
.spawn()
|
||||||
.context("Failed to spawn chromedriver. Ensure it's installed and in PATH.")?;
|
.context("Failed to start chromedriver. Is it in PATH?")?;
|
||||||
|
|
||||||
let mut stdout =
|
let stdout = process.stdout.take().unwrap();
|
||||||
BufReader::new(process.stdout.take().context("Failed to capture stdout")?).lines();
|
let stderr = process.stderr.take().unwrap();
|
||||||
|
|
||||||
let mut stderr =
|
let stdout_reader = BufReader::new(stdout);
|
||||||
BufReader::new(process.stderr.take().context("Failed to capture stderr")?).lines();
|
let mut stdout_lines = stdout_reader.lines();
|
||||||
|
|
||||||
let start_time = std::time::Instant::now();
|
let stderr_reader = BufReader::new(stderr);
|
||||||
let mut address: Option<String> = None;
|
let stderr_handle = tokio::spawn(async move {
|
||||||
let mut success = false;
|
let mut lines = stderr_reader.lines();
|
||||||
|
while let Ok(Some(line)) = lines.next_line().await {
|
||||||
// Log stderr in background for debugging
|
let t = line.trim();
|
||||||
tokio::spawn(async move {
|
if !t.is_empty() {
|
||||||
while let Ok(Some(line)) = stderr.next_line().await {
|
let _ = crate::util::logger::log_info(&format!("ChromeDriver: {}", t)).await;
|
||||||
eprintln!("ChromeDriver stderr: {}", line);
|
}
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
// Wait for address and success (up to 30s)
|
let start = tokio::time::Instant::now();
|
||||||
while start_time.elapsed() < Duration::from_secs(30) {
|
let mut address: Option<String> = None;
|
||||||
if let Ok(Ok(Some(line))) = timeout(Duration::from_secs(1), stdout.next_line()).await {
|
|
||||||
|
while start.elapsed() < Duration::from_secs(30) {
|
||||||
|
if let Ok(Ok(Some(line))) = timeout(Duration::from_secs(1), stdout_lines.next_line()).await {
|
||||||
if let Some(addr) = parse_chromedriver_address(&line) {
|
if let Some(addr) = parse_chromedriver_address(&line) {
|
||||||
address = Some(addr.to_string());
|
address = Some(addr);
|
||||||
}
|
}
|
||||||
|
if line.contains("ChromeDriver was started successfully") && address.is_some() {
|
||||||
if line.contains("ChromeDriver was started successfully") {
|
return Ok((address.unwrap(), process, stderr_handle));
|
||||||
success = true;
|
|
||||||
}
|
|
||||||
|
|
||||||
if let (Some(addr), true) = (&address, success) {
|
|
||||||
return Ok(Self {
|
|
||||||
process,
|
|
||||||
base_url: addr.clone(),
|
|
||||||
});
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
sleep(Duration::from_millis(100)).await;
|
sleep(Duration::from_millis(100)).await;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Cleanup on failure
|
|
||||||
let _ = process.kill().await;
|
let _ = process.kill().await;
|
||||||
Err(anyhow!("Timeout: ChromeDriver did not start within 30 seconds. Check version match with Chrome browser and system resources."))
|
stderr_handle.abort();
|
||||||
|
Err(anyhow!("ChromeDriver failed to start within 30s"))
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Creates a new browser session (client) from this ChromeDriver instance.
|
fn chrome_args(&self) -> Map<String, Value> {
|
||||||
/// Each session is independent and can be closed without affecting the driver.
|
let mut args = vec![
|
||||||
pub async fn new_session(&self) -> Result<Client> {
|
"--headless=new".to_string(),
|
||||||
ClientBuilder::native()
|
"--disable-gpu".to_string(),
|
||||||
.capabilities(Self::chrome_args())
|
"--no-sandbox".to_string(),
|
||||||
.connect(&self.base_url)
|
"--disable-dev-shm-usage".to_string(),
|
||||||
.await
|
"--disable-infobars".to_string(),
|
||||||
.context("Failed to create new session")
|
"--disable-extensions".to_string(),
|
||||||
}
|
"--disable-popup-blocking".to_string(),
|
||||||
|
"--disable-notifications".to_string(),
|
||||||
fn chrome_args() -> Map<String, Value> {
|
"--disable-logging".to_string(),
|
||||||
let args = serde_json::json!({
|
"--disable-autofill".to_string(),
|
||||||
|
"--disable-sync".to_string(),
|
||||||
|
"--disable-default-apps".to_string(),
|
||||||
|
"--disable-translate".to_string(),
|
||||||
|
"--window-size=1920,1080".to_string(),
|
||||||
|
"--disable-blink-features=AutomationControlled".to_string(),
|
||||||
|
"--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36".to_string()
|
||||||
|
];
|
||||||
|
if let Some(ref proxy) = self.proxy_url {
|
||||||
|
let proxy = proxy.clone();
|
||||||
|
let proxy_formatted = format!("--proxy-server={}", proxy);
|
||||||
|
args.push(proxy_formatted);
|
||||||
|
}
|
||||||
|
let caps = serde_json::json!({
|
||||||
"goog:chromeOptions": {
|
"goog:chromeOptions": {
|
||||||
"args": [
|
"args": args,
|
||||||
"--headless=new",
|
|
||||||
"--disable-gpu",
|
|
||||||
"--no-sandbox",
|
|
||||||
"--disable-dev-shm-usage",
|
|
||||||
"--disable-infobars",
|
|
||||||
"--disable-extensions",
|
|
||||||
"--disable-popup-blocking",
|
|
||||||
"--disable-notifications",
|
|
||||||
"--disable-logging",
|
|
||||||
"--disable-autofill",
|
|
||||||
"--disable-features=TranslateUI,OptimizationGuideModelDownloading",
|
|
||||||
"--window-size=1920,1080",
|
|
||||||
"--disable-blink-features=AutomationControlled",
|
|
||||||
"--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
|
|
||||||
],
|
|
||||||
"excludeSwitches": ["enable-logging", "enable-automation"],
|
"excludeSwitches": ["enable-logging", "enable-automation"],
|
||||||
"useAutomationExtension": false,
|
|
||||||
"prefs": {
|
"prefs": {
|
||||||
"profile.default_content_setting_values.notifications": 2
|
"profile.default_content_setting_values.notifications": 2
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
args.as_object()
|
caps.as_object().cloned().unwrap()
|
||||||
.expect("Capabilities should be a JSON object")
|
|
||||||
.clone()
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Parses the ChromeDriver address from a log line.
|
|
||||||
///
|
|
||||||
/// Looks for the "Starting ChromeDriver ... on port XXXX" line and extracts the port.
|
|
||||||
/// Returns `Some("http://localhost:XXXX")` if found, else `None`.
|
|
||||||
fn parse_chromedriver_address(line: &str) -> Option<String> {
|
fn parse_chromedriver_address(line: &str) -> Option<String> {
|
||||||
if line.contains("Starting ChromeDriver") {
|
if line.contains("Starting ChromeDriver") {
|
||||||
if let Some(port_str) = line.split("on port ").nth(1) {
|
if let Some(port_str) = line.split("on port ").nth(1) {
|
||||||
@@ -223,7 +276,6 @@ fn parse_chromedriver_address(line: &str) -> Option<String> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// Fallback for other formats (e.g., explicit port mentions)
|
|
||||||
for word in line.split_whitespace() {
|
for word in line.split_whitespace() {
|
||||||
if let Ok(port) = word.trim_matches(|c: char| !c.is_numeric()).parse::<u16>() {
|
if let Ok(port) = word.trim_matches(|c: char| !c.is_numeric()).parse::<u16>() {
|
||||||
if port > 1024 && port < 65535 && line.to_lowercase().contains("port") {
|
if port > 1024 && port < 65535 && line.to_lowercase().contains("port") {
|
||||||
@@ -236,14 +288,13 @@ fn parse_chromedriver_address(line: &str) -> Option<String> {
|
|||||||
|
|
||||||
impl Drop for ChromeInstance {
|
impl Drop for ChromeInstance {
|
||||||
fn drop(&mut self) {
|
fn drop(&mut self) {
|
||||||
|
// Signal child to terminate. Do NOT block here; shutdown should be
|
||||||
|
// performed with the async `shutdown()` method when possible.
|
||||||
let _ = self.process.start_kill();
|
let _ = self.process.start_kill();
|
||||||
std::thread::sleep(std::time::Duration::from_millis(100));
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Simplified task execution - now uses the pool pattern.
|
/// Simplified task execution - uses the pool pattern.
|
||||||
///
|
|
||||||
/// For backwards compatibility with existing code.
|
|
||||||
pub struct ScrapeTask<T> {
|
pub struct ScrapeTask<T> {
|
||||||
url: String,
|
url: String,
|
||||||
parse: Box<
|
parse: Box<
|
||||||
@@ -263,7 +314,6 @@ impl<T: Send + 'static> ScrapeTask<T> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Executes using a provided pool (more efficient for multiple tasks).
|
|
||||||
pub async fn execute_with_pool(self, pool: &ChromeDriverPool) -> Result<T> {
|
pub async fn execute_with_pool(self, pool: &ChromeDriverPool) -> Result<T> {
|
||||||
let url = self.url;
|
let url = self.url;
|
||||||
let parse = self.parse;
|
let parse = self.parse;
|
||||||
@@ -271,4 +321,4 @@ impl<T: Send + 'static> ScrapeTask<T> {
|
|||||||
pool.execute(url, move |client| async move { (parse)(client).await })
|
pool.execute(url, move |client| async move { (parse)(client).await })
|
||||||
.await
|
.await
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1,8 +1,6 @@
|
|||||||
use std::path::{Path, PathBuf};
|
use std::path::{Path, PathBuf};
|
||||||
use std::fs;
|
use std::fs;
|
||||||
|
|
||||||
use crate::util::opnv;
|
|
||||||
|
|
||||||
/// Central configuration for all data paths
|
/// Central configuration for all data paths
|
||||||
pub struct DataPaths {
|
pub struct DataPaths {
|
||||||
base_dir: PathBuf,
|
base_dir: PathBuf,
|
||||||
|
|||||||
379
test/vpn_integration_tests.rs
Normal file
379
test/vpn_integration_tests.rs
Normal file
@@ -0,0 +1,379 @@
|
|||||||
|
// tests/vpn_integration_tests.rs
|
||||||
|
//! Integration tests for VPN rotation system
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
mod vpn_tests {
|
||||||
|
use event_backtest_engine::{
|
||||||
|
scraper::{
|
||||||
|
webdriver::ChromeDriverPool,
|
||||||
|
vpn_manager::{VpnInstance, VpnPool},
|
||||||
|
},
|
||||||
|
util::{directories::DataPaths, opnv},
|
||||||
|
};
|
||||||
|
use std::path::PathBuf;
|
||||||
|
use std::sync::Arc;
|
||||||
|
|
||||||
|
/// Helper to create a test VPN instance without connecting
|
||||||
|
fn create_test_vpn_instance() -> VpnInstance {
|
||||||
|
VpnInstance::new(
|
||||||
|
PathBuf::from("test.ovpn"),
|
||||||
|
"testuser".to_string(),
|
||||||
|
"testpass".to_string(),
|
||||||
|
)
|
||||||
|
.expect("Failed to create test VPN instance")
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_vpn_instance_creation() {
|
||||||
|
let vpn = create_test_vpn_instance();
|
||||||
|
assert_eq!(vpn.hostname(), "test");
|
||||||
|
assert!(!vpn.is_healthy());
|
||||||
|
assert!(vpn.external_ip().is_none());
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_vpn_task_counting() {
|
||||||
|
let mut vpn = create_test_vpn_instance();
|
||||||
|
|
||||||
|
// Should not rotate initially
|
||||||
|
assert!(!vpn.increment_task_count(10));
|
||||||
|
|
||||||
|
// Increment tasks
|
||||||
|
for i in 1..10 {
|
||||||
|
assert!(!vpn.increment_task_count(10), "Should not rotate at task {}", i);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Should rotate at threshold
|
||||||
|
assert!(vpn.increment_task_count(10), "Should rotate at task 10");
|
||||||
|
|
||||||
|
// Reset and verify
|
||||||
|
vpn.reset_task_count();
|
||||||
|
assert!(!vpn.increment_task_count(10), "Should not rotate after reset");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_vpn_task_counting_zero_threshold() {
|
||||||
|
let mut vpn = create_test_vpn_instance();
|
||||||
|
|
||||||
|
// With threshold=0, should never auto-rotate
|
||||||
|
for _ in 0..100 {
|
||||||
|
assert!(!vpn.increment_task_count(0));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn test_chromedriver_pool_creation_no_vpn() {
|
||||||
|
let result = ChromeDriverPool::new(2).await;
|
||||||
|
|
||||||
|
match result {
|
||||||
|
Ok(pool) => {
|
||||||
|
assert_eq!(pool.get_number_of_instances(), 2);
|
||||||
|
assert!(!pool.is_vpn_enabled());
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
eprintln!("ChromeDriver pool creation failed (expected if chromedriver not installed): {}", e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_data_paths_creation() {
|
||||||
|
let paths = DataPaths::new("./test_data").expect("Failed to create paths");
|
||||||
|
|
||||||
|
assert!(paths.data_dir().exists());
|
||||||
|
assert!(paths.cache_dir().exists());
|
||||||
|
assert!(paths.logs_dir().exists());
|
||||||
|
assert!(paths.cache_openvpn_dir().exists());
|
||||||
|
|
||||||
|
// Cleanup
|
||||||
|
let _ = std::fs::remove_dir_all("./test_data");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
#[ignore] // This test requires actual network access and VPNBook availability
|
||||||
|
async fn test_fetch_vpnbook_configs() {
|
||||||
|
let paths = DataPaths::new(".").expect("Failed to create paths");
|
||||||
|
|
||||||
|
// This test requires a ChromeDriver pool
|
||||||
|
let pool_result = ChromeDriverPool::new(1).await;
|
||||||
|
if pool_result.is_err() {
|
||||||
|
eprintln!("Skipping VPNBook fetch test: ChromeDriver not available");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
let pool = Arc::new(pool_result.unwrap());
|
||||||
|
|
||||||
|
let result = opnv::fetch_vpnbook_configs(&pool, paths.cache_dir()).await;
|
||||||
|
|
||||||
|
match result {
|
||||||
|
Ok((username, password, files)) => {
|
||||||
|
assert!(!username.is_empty(), "Username should not be empty");
|
||||||
|
assert!(!password.is_empty(), "Password should not be empty");
|
||||||
|
assert!(!files.is_empty(), "Should fetch at least one config file");
|
||||||
|
|
||||||
|
println!("Fetched {} VPN configs", files.len());
|
||||||
|
for file in &files {
|
||||||
|
assert!(file.exists(), "Config file should exist: {:?}", file);
|
||||||
|
assert_eq!(file.extension().and_then(|s| s.to_str()), Some("ovpn"));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
eprintln!("VPNBook fetch failed (may be temporary): {}", e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
#[ignore] // Requires actual VPN configs and OpenVPN installation
|
||||||
|
async fn test_vpn_pool_creation() {
|
||||||
|
let paths = DataPaths::new(".").expect("Failed to create paths");
|
||||||
|
|
||||||
|
// First fetch configs
|
||||||
|
let pool_result = ChromeDriverPool::new(1).await;
|
||||||
|
if pool_result.is_err() {
|
||||||
|
eprintln!("Skipping VPN pool test: ChromeDriver not available");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
let temp_pool = Arc::new(pool_result.unwrap());
|
||||||
|
let fetch_result = opnv::fetch_vpnbook_configs(&temp_pool, paths.cache_dir()).await;
|
||||||
|
|
||||||
|
if fetch_result.is_err() {
|
||||||
|
eprintln!("Skipping VPN pool test: Could not fetch configs");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
let (username, password, _) = fetch_result.unwrap();
|
||||||
|
|
||||||
|
// Create VPN pool
|
||||||
|
let vpn_pool_result = VpnPool::new(
|
||||||
|
paths.cache_openvpn_dir(),
|
||||||
|
username,
|
||||||
|
password,
|
||||||
|
false,
|
||||||
|
0,
|
||||||
|
).await;
|
||||||
|
|
||||||
|
match vpn_pool_result {
|
||||||
|
Ok(vpn_pool) => {
|
||||||
|
assert!(vpn_pool.len() > 0, "VPN pool should have at least one instance");
|
||||||
|
println!("Created VPN pool with {} instances", vpn_pool.len());
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
eprintln!("VPN pool creation failed: {}", e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
#[ignore] // Full integration test - requires all components
|
||||||
|
async fn test_full_vpn_integration() {
|
||||||
|
let paths = DataPaths::new(".").expect("Failed to create paths");
|
||||||
|
|
||||||
|
// Step 1: Create temp ChromeDriver pool for fetching
|
||||||
|
let temp_pool = match ChromeDriverPool::new(1).await {
|
||||||
|
Ok(p) => Arc::new(p),
|
||||||
|
Err(e) => {
|
||||||
|
eprintln!("Skipping integration test: ChromeDriver not available - {}", e);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// Step 2: Fetch VPNBook configs
|
||||||
|
let (username, password, files) = match opnv::fetch_vpnbook_configs(
|
||||||
|
&temp_pool,
|
||||||
|
paths.cache_dir()
|
||||||
|
).await {
|
||||||
|
Ok(result) => result,
|
||||||
|
Err(e) => {
|
||||||
|
eprintln!("Skipping integration test: Config fetch failed - {}", e);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
assert!(!files.is_empty(), "Should have fetched configs");
|
||||||
|
|
||||||
|
// Step 3: Create VPN pool
|
||||||
|
let vpn_pool = match VpnPool::new(
|
||||||
|
paths.cache_openvpn_dir(),
|
||||||
|
username,
|
||||||
|
password,
|
||||||
|
true,
|
||||||
|
5,
|
||||||
|
).await {
|
||||||
|
Ok(pool) => Arc::new(pool),
|
||||||
|
Err(e) => {
|
||||||
|
eprintln!("Skipping integration test: VPN pool creation failed - {}", e);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// Step 4: Connect one VPN
|
||||||
|
let vpn_instance = vpn_pool.acquire().await.expect("Failed to acquire VPN");
|
||||||
|
let connect_result = {
|
||||||
|
let mut vpn = vpn_instance.lock().await;
|
||||||
|
vpn.connect().await
|
||||||
|
};
|
||||||
|
|
||||||
|
match connect_result {
|
||||||
|
Ok(_) => {
|
||||||
|
let vpn = vpn_instance.lock().await;
|
||||||
|
println!("✓ VPN connected: {} ({})",
|
||||||
|
vpn.hostname(),
|
||||||
|
vpn.external_ip().unwrap_or("unknown")
|
||||||
|
);
|
||||||
|
assert!(vpn.is_healthy());
|
||||||
|
assert!(vpn.external_ip().is_some());
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
eprintln!("VPN connection failed: {}", e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Step 5: Create ChromeDriver pool with VPN
|
||||||
|
let driver_pool_result = ChromeDriverPool::new_with_vpn(
|
||||||
|
1,
|
||||||
|
Some(vpn_pool.clone())
|
||||||
|
).await;
|
||||||
|
|
||||||
|
match driver_pool_result {
|
||||||
|
Ok(driver_pool) => {
|
||||||
|
assert!(driver_pool.is_vpn_enabled());
|
||||||
|
println!("✓ ChromeDriver pool created with VPN binding");
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
eprintln!("ChromeDriver pool creation failed: {}", e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Step 6: Cleanup
|
||||||
|
vpn_pool.disconnect_all().await.expect("Failed to disconnect VPNs");
|
||||||
|
println!("✓ Integration test complete");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_hostname_extraction() {
|
||||||
|
// Test the hostname extraction logic
|
||||||
|
let test_cases = vec![
|
||||||
|
("test/ca149.vpnbook.com/config.ovpn", "ca149.vpnbook.com"),
|
||||||
|
("test/us1.vpnbook.com/config.ovpn", "us1.vpnbook.com"),
|
||||||
|
("test/de4.vpnbook.com/config.ovpn", "de4.vpnbook.com"),
|
||||||
|
];
|
||||||
|
|
||||||
|
for (path, expected_hostname) in test_cases {
|
||||||
|
let pb = PathBuf::from(path);
|
||||||
|
let hostname = pb.parent()
|
||||||
|
.and_then(|p| p.file_name())
|
||||||
|
.and_then(|n| n.to_str())
|
||||||
|
.unwrap_or("unknown");
|
||||||
|
|
||||||
|
assert_eq!(hostname, expected_hostname);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(target_os = "windows")]
|
||||||
|
#[test]
|
||||||
|
fn test_forcebindip_manager_creation() {
|
||||||
|
use event_backtest_engine::ForceBindIpManager;
|
||||||
|
|
||||||
|
match ForceBindIpManager::new() {
|
||||||
|
Ok(manager) => {
|
||||||
|
println!("✓ ForceBindIP found at: {:?}", manager.path());
|
||||||
|
assert!(manager.path().exists());
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
eprintln!("ForceBindIP not found (expected in dev): {}", e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(target_os = "windows")]
|
||||||
|
#[test]
|
||||||
|
fn test_forcebindip_command_creation() {
|
||||||
|
use event_backtest_engine::ForceBindIpManager;
|
||||||
|
use std::path::Path;
|
||||||
|
|
||||||
|
if let Ok(manager) = ForceBindIpManager::new() {
|
||||||
|
let cmd = manager.create_bound_command(
|
||||||
|
"192.168.1.100",
|
||||||
|
Path::new("test.exe"),
|
||||||
|
&["--arg1", "value1"],
|
||||||
|
);
|
||||||
|
|
||||||
|
let cmd_str = format!("{:?}", cmd);
|
||||||
|
assert!(cmd_str.contains("192.168.1.100"));
|
||||||
|
assert!(cmd_str.contains("test.exe"));
|
||||||
|
println!("✓ ForceBindIP command created successfully");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_config_defaults() {
|
||||||
|
use event_backtest_engine::Config;
|
||||||
|
|
||||||
|
let config = Config::default();
|
||||||
|
assert_eq!(config.economic_start_date, "2007-02-13");
|
||||||
|
assert_eq!(config.corporate_start_date, "2010-01-01");
|
||||||
|
assert_eq!(config.economic_lookahead_months, 3);
|
||||||
|
assert_eq!(config.max_parallel_instances, 10);
|
||||||
|
assert!(!config.enable_vpn_rotation);
|
||||||
|
assert_eq!(config.tasks_per_vpn_session, 0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
mod benchmark_tests {
|
||||||
|
use super::*;
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
#[ignore] // Performance test
|
||||||
|
async fn benchmark_vpn_rotation_overhead() {
|
||||||
|
use std::time::Instant;
|
||||||
|
|
||||||
|
// This test measures the overhead of VPN rotation
|
||||||
|
let start = Instant::now();
|
||||||
|
|
||||||
|
// Simulate rotation cycle
|
||||||
|
// 1. Disconnect (instant)
|
||||||
|
// 2. Wait 2 seconds
|
||||||
|
// 3. Connect (5-10 seconds)
|
||||||
|
// 4. Verify IP (1-2 seconds)
|
||||||
|
|
||||||
|
tokio::time::sleep(tokio::time::Duration::from_secs(2)).await;
|
||||||
|
|
||||||
|
let elapsed = start.elapsed();
|
||||||
|
println!("Rotation cycle took: {:?}", elapsed);
|
||||||
|
|
||||||
|
// Typical rotation should complete in under 15 seconds
|
||||||
|
assert!(elapsed.as_secs() < 15);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
#[ignore] // Performance test
|
||||||
|
async fn benchmark_parallel_scraping() {
|
||||||
|
// This test measures throughput with different parallelism levels
|
||||||
|
// Results help tune MAX_PARALLEL_INSTANCES
|
||||||
|
|
||||||
|
let configs = vec![1, 2, 3, 5, 10];
|
||||||
|
|
||||||
|
for &pool_size in &configs {
|
||||||
|
println!("Testing with {} parallel instances...", pool_size);
|
||||||
|
|
||||||
|
// Would need actual scraping implementation here
|
||||||
|
// For now, just verify pool creation time
|
||||||
|
let start = std::time::Instant::now();
|
||||||
|
|
||||||
|
let pool_result = event_backtest_engine::ChromeDriverPool::new(pool_size).await;
|
||||||
|
|
||||||
|
if let Ok(_pool) = pool_result {
|
||||||
|
let elapsed = start.elapsed();
|
||||||
|
println!(" Pool initialization: {:?}", elapsed);
|
||||||
|
|
||||||
|
// Pool creation should be fast (< 5 seconds per instance)
|
||||||
|
assert!(elapsed.as_secs() < pool_size as u64 * 5);
|
||||||
|
} else {
|
||||||
|
eprintln!(" Skipped - ChromeDriver not available");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user