added getting opnv setup files

This commit is contained in:
2025-12-09 19:23:54 +01:00
parent f95e9e2427
commit c2408d9a56
13 changed files with 478 additions and 53 deletions

View File

@@ -12,21 +12,51 @@ pub struct Config {
pub economic_lookahead_months: u32, // default: 3
/// Maximum number of parallel scraping tasks (default: 10).
/// This limits concurrency to protect system load and prevent website spamming.
#[serde(default = "default_max_parallel")]
pub max_parallel_tasks: usize,
#[serde(default = "default_max_parallel_instances")]
pub max_parallel_instances: usize,
pub max_tasks_per_instance: usize,
/// VPN rotation configuration
/// If set to "true", enables automatic VPN rotation between sessions
#[serde(default)]
pub enable_vpn_rotation: bool,
/// Comma-separated list of VPN servers/country codes to rotate through.
/// Example: "US-Free#1,UK-Free#1,JP-Free#1" or "US,JP,DE"
/// If empty, VPN rotation is disabled.
#[serde(default)]
pub vpn_servers: String,
/// Number of tasks per session before rotating VPN
/// If set to 0, rotates VPN between economic and corporate phases
#[serde(default = "default_tasks_per_session")]
pub tasks_per_vpn_session: usize,
}
fn default_max_parallel() -> usize {
fn default_max_parallel_instances() -> usize {
10
}
/// Serde default for `tasks_per_vpn_session`.
///
/// The value `0` is the sentinel meaning "rotate the VPN between the economic
/// and corporate phases" rather than after a fixed number of tasks.
fn default_tasks_per_session() -> usize {
    const ROTATE_BETWEEN_PHASES: usize = 0;
    ROTATE_BETWEEN_PHASES
}
/// Returns the built-in default for the ProtonVPN extension id.
///
/// NOTE(review): no field in the visible part of `Config` references this
/// default yet — confirm where it is meant to be wired in.
fn default_protonvpn_extension_id() -> String {
    String::from("ghmbeldphafepmbegfdlkpapadhbakde")
}
impl Default for Config {
fn default() -> Self {
Self {
economic_start_date: "2007-02-13".to_string(),
corporate_start_date: "2010-01-01".to_string(),
economic_lookahead_months: 3,
max_parallel_tasks: default_max_parallel(),
max_parallel_instances: default_max_parallel_instances(),
max_tasks_per_instance: 0,
enable_vpn_rotation: false,
vpn_servers: String::new(),
tasks_per_vpn_session: default_tasks_per_session(),
}
}
}
@@ -59,19 +89,54 @@ impl Config {
.parse()
.context("Failed to parse ECONOMIC_LOOKAHEAD_MONTHS as u32")?;
let max_parallel_tasks: usize = dotenvy::var("MAX_PARALLEL_TASKS")
let max_parallel_instances: usize = dotenvy::var("MAX_PARALLEL_INSTANCES")
.unwrap_or_else(|_| "10".to_string())
.parse()
.context("Failed to parse MAX_PARALLEL_TASKS as usize")?;
.context("Failed to parse MAX_PARALLEL_INSTANCES as usize")?;
let max_tasks_per_instance: usize = dotenvy::var("MAX_TASKS_PER_INSTANCE")
.unwrap_or_else(|_| "0".to_string())
.parse()
.context("Failed to parse MAX_TASKS_PER_INSTANCE as usize")?;
let enable_vpn_rotation = dotenvy::var("ENABLE_VPN_ROTATION")
.unwrap_or_else(|_| "false".to_string())
.parse::<bool>()
.context("Failed to parse ENABLE_VPN_ROTATION as bool")?;
let vpn_servers = dotenvy::var("VPN_SERVERS")
.unwrap_or_else(|_| String::new());
let tasks_per_vpn_session: usize = dotenvy::var("TASKS_PER_VPN_SESSION")
.unwrap_or_else(|_| "0".to_string())
.parse()
.context("Failed to parse TASKS_PER_VPN_SESSION as usize")?;
Ok(Self {
economic_start_date,
corporate_start_date,
economic_lookahead_months,
max_parallel_tasks,
max_parallel_instances,
max_tasks_per_instance,
enable_vpn_rotation,
vpn_servers,
tasks_per_vpn_session,
})
}
/// Splits the comma-separated `vpn_servers` setting into individual entries.
///
/// Every entry is trimmed of surrounding whitespace and blank entries are
/// dropped, so an empty (or all-blank) setting yields an empty list.
pub fn get_vpn_servers(&self) -> Vec<String> {
    let mut servers = Vec::new();
    for raw in self.vpn_servers.split(',') {
        let name = raw.trim();
        if !name.is_empty() {
            servers.push(name.to_string());
        }
    }
    servers
}
pub fn target_end_date(&self) -> String {
let now = chrono::Local::now().naive_local().date();
let future = now + chrono::Duration::days(30 * self.economic_lookahead_months as i64);

View File

@@ -1,7 +1,7 @@
// src/corporate/scraper.rs
use super::{types::*, helpers::*, openfigi::*};
//use crate::corporate::openfigi::OpenFigiClient;
use crate::{webdriver::webdriver::*, util::directories::DataPaths, util::logger};
use crate::{scraper::webdriver::*, util::directories::DataPaths, util::logger};
use fantoccini::{Client, Locator};
use scraper::{Html, Selector};
use chrono::{DateTime, Duration, NaiveDate, Utc};

View File

@@ -3,7 +3,7 @@ use super::{scraper::*, storage::*, helpers::*, types::*, aggregation::*, openfi
use crate::config::Config;
use crate::util::directories::DataPaths;
use crate::util::logger;
use crate::webdriver::webdriver::ChromeDriverPool;
use crate::scraper::webdriver::ChromeDriverPool;
use chrono::Local;
use std::collections::{HashMap};

View File

@@ -1,6 +1,6 @@
// src/economic/update.rs
use super::{scraper::*, storage::*, helpers::*, types::*};
use crate::{config::Config, webdriver::webdriver::{ScrapeTask, ChromeDriverPool}, util::directories::DataPaths, util::logger};
use crate::{config::Config, scraper::webdriver::{ScrapeTask, ChromeDriverPool}, util::directories::DataPaths, util::logger};
use chrono::{Local};
use std::sync::Arc;

8
src/lib.rs Normal file
View File

@@ -0,0 +1,8 @@
// src/lib.rs
//! Event Backtest Engine - Core Library
//!
//! Exposes all public modules for use in examples and tests
pub mod config;
pub mod scraper;
pub mod util;

View File

@@ -1,20 +1,21 @@
// src/main.rs
mod economic;
mod corporate;
mod config;
mod webdriver;
mod corporate;
mod economic;
mod util;
mod scraper;
use anyhow::Result;
use config::Config;
use webdriver::webdriver::ChromeDriverPool;
use scraper::webdriver::ChromeDriverPool;
use util::directories::DataPaths;
use util::logger;
use util::{logger, opnv};
use std::sync::Arc;
/// The entry point of the application.
///
/// This function loads the configuration, initializes a shared ChromeDriver pool,
/// fetches the latest VPNBook OpenVPN configurations if VPN rotation is enabled,
/// and sequentially runs the full updates for corporate and economic data.
/// Sequential execution helps prevent resource exhaustion from concurrent
/// chromedriver instances and avoids spamming the target websites with too many requests.
@@ -22,8 +23,8 @@ use std::sync::Arc;
/// # Errors
///
/// Returns an error if configuration loading fails, pool initialization fails,
/// or if either update function encounters an issue (e.g., network errors,
/// scraping failures, or chromedriver spawn failures like "program not found").
/// VPN fetching fails (if enabled), or if either update function encounters an issue
/// (e.g., network errors, scraping failures, or chromedriver spawn failures like "program not found").
#[tokio::main]
async fn main() -> Result<()> {
let config = Config::load().map_err(|err| {
@@ -40,16 +41,29 @@ async fn main() -> Result<()> {
})?;
logger::log_info("=== Application started ===").await;
logger::log_info(&format!("Config: economic_start_date={}, corporate_start_date={}, lookahead_months={}, max_parallel_tasks={}",
config.economic_start_date, config.corporate_start_date, config.economic_lookahead_months, config.max_parallel_tasks)).await;
logger::log_info(&format!("Config: economic_start_date={}, corporate_start_date={}, lookahead_months={}, max_parallel_instances={}, enable_vpn_rotation={}",
config.economic_start_date, config.corporate_start_date, config.economic_lookahead_months, config.max_parallel_instances, config.enable_vpn_rotation)).await;
// Initialize the shared ChromeDriver pool once
let pool_size = config.max_parallel_tasks;
let pool_size = config.max_parallel_instances;
logger::log_info(&format!("Initializing ChromeDriver pool with size: {}", pool_size)).await;
let pool = Arc::new(ChromeDriverPool::new(pool_size).await?);
logger::log_info("✓ ChromeDriver pool initialized successfully").await;
// Fetch VPNBook configs if VPN rotation is enabled
if config.enable_vpn_rotation {
logger::log_info("--- Fetching latest VPNBook OpenVPN configurations ---").await;
let (username, password, files) =
util::opnv::fetch_vpnbook_configs(&pool, paths.cache_dir()).await?;
logger::log_info(&format!("Fetched VPN username: {}, password: {}", username, password)).await;
for file in &files {
logger::log_info(&format!("Extracted OVPN: {:?}", file)).await;
}
// Optionally, store username/password for rotation use (e.g., in a file or global state)
// For now, just log them; extend as needed for rotation integration
}
// Run economic update first, passing the shared pool
logger::log_info("--- Starting economic data update ---").await;
economic::run_full_update(&config, &pool).await?;

View File

@@ -3,34 +3,38 @@
use anyhow::{anyhow, Context, Result};
use fantoccini::{Client, ClientBuilder};
use serde_json::{Map, Value};
use std::pin::Pin;
use std::process::Stdio;
use std::sync::Arc;
use tokio::io::{AsyncBufReadExt, BufReader};
use tokio::process::{Child, Command};
use tokio::sync::{Mutex, Semaphore};
use tokio::time::{Duration, sleep, timeout};
use std::pin::Pin;
use tokio::time::{sleep, timeout, Duration};
/// Manages a pool of ChromeDriver instances for parallel scraping.
///
///
/// This struct maintains multiple ChromeDriver processes and allows controlled
/// concurrent access via a semaphore. Instances are reused across tasks to avoid
/// the overhead of spawning new processes.
pub struct ChromeDriverPool {
instances: Vec<Arc<Mutex<ChromeInstance>>>,
semaphore: Arc<Semaphore>,
tasks_per_instance: usize,
}
impl ChromeDriverPool {
/// Creates a new pool with the specified number of ChromeDriver instances.
///
///
/// # Arguments
/// * `pool_size` - Number of concurrent ChromeDriver instances to maintain
pub async fn new(pool_size: usize) -> Result<Self> {
let mut instances = Vec::with_capacity(pool_size);
println!("Initializing ChromeDriver pool with {} instances...", pool_size);
println!(
"Initializing ChromeDriver pool with {} instances...",
pool_size
);
for i in 0..pool_size {
match ChromeInstance::new().await {
Ok(instance) => {
@@ -45,10 +49,11 @@ impl ChromeDriverPool {
}
}
}
Ok(Self {
instances,
semaphore: Arc::new(Semaphore::new(pool_size)),
tasks_per_instance: 0,
})
}
@@ -60,7 +65,10 @@ impl ChromeDriverPool {
Fut: std::future::Future<Output = Result<T>> + Send + 'static,
{
// Acquire semaphore permit
let _permit = self.semaphore.acquire().await
let _permit = self
.semaphore
.acquire()
.await
.map_err(|_| anyhow!("Semaphore closed"))?;
// Find an available instance (round-robin or first available)
@@ -69,7 +77,7 @@ impl ChromeDriverPool {
// Create a new session for this task
let client = guard.new_session().await?;
// Release lock while we do the actual scraping
drop(guard);
@@ -82,8 +90,8 @@ impl ChromeDriverPool {
Ok(result)
}
pub fn get_number_of_instances (&self) -> usize {
self.instances.len()
pub fn get_number_of_instances(&self) -> usize {
self.instances.len()
}
}
@@ -94,7 +102,7 @@ pub struct ChromeInstance {
}
impl ChromeInstance {
/// Creates a new ChromeInstance by spawning chromedriver with random port.
/// Creates a new ChromeInstance by spawning chromedriver with random port.
///
/// This spawns `chromedriver --port=0` to avoid port conflicts, reads stdout to extract
/// the listening address, and waits for the success message. If timeout occurs or
@@ -107,7 +115,7 @@ impl ChromeInstance {
pub async fn new() -> Result<Self> {
let mut command = Command::new("chromedriver-win64/chromedriver.exe");
command
.arg("--port=0") // Use random available port to support pooling
.arg("--port=0") // Use random available port to support pooling
.stdout(Stdio::piped())
.stderr(Stdio::piped());
@@ -115,13 +123,11 @@ impl ChromeInstance {
.spawn()
.context("Failed to spawn chromedriver. Ensure it's installed and in PATH.")?;
let mut stdout = BufReader::new(
process.stdout.take().context("Failed to capture stdout")?
).lines();
let mut stdout =
BufReader::new(process.stdout.take().context("Failed to capture stdout")?).lines();
let mut stderr = BufReader::new(
process.stderr.take().context("Failed to capture stderr")?
).lines();
let mut stderr =
BufReader::new(process.stderr.take().context("Failed to capture stderr")?).lines();
let start_time = std::time::Instant::now();
let mut address: Option<String> = None;
@@ -136,9 +142,7 @@ impl ChromeInstance {
// Wait for address and success (up to 30s)
while start_time.elapsed() < Duration::from_secs(30) {
if let Ok(Ok(Some(line))) =
timeout(Duration::from_secs(1), stdout.next_line()).await
{
if let Ok(Ok(Some(line))) = timeout(Duration::from_secs(1), stdout.next_line()).await {
if let Some(addr) = parse_chromedriver_address(&line) {
address = Some(addr.to_string());
}
@@ -200,8 +204,8 @@ impl ChromeInstance {
}
});
args.as_object()
.expect("Capabilities should be a JSON object")
.clone()
.expect("Capabilities should be a JSON object")
.clone()
}
}
@@ -238,11 +242,13 @@ impl Drop for ChromeInstance {
}
/// Simplified task execution - now uses the pool pattern.
///
///
/// For backwards compatibility with existing code.
pub struct ScrapeTask<T> {
url: String,
parse: Box<dyn FnOnce(Client) -> Pin<Box<dyn std::future::Future<Output = Result<T>> + Send>> + Send>,
parse: Box<
dyn FnOnce(Client) -> Pin<Box<dyn std::future::Future<Output = Result<T>> + Send>> + Send,
>,
}
impl<T: Send + 'static> ScrapeTask<T> {
@@ -261,9 +267,8 @@ impl<T: Send + 'static> ScrapeTask<T> {
pub async fn execute_with_pool(self, pool: &ChromeDriverPool) -> Result<T> {
let url = self.url;
let parse = self.parse;
pool.execute(url, move |client| async move {
(parse)(client).await
}).await
pool.execute(url, move |client| async move { (parse)(client).await })
.await
}
}
}

View File

@@ -1,3 +1,4 @@
// src/util/mod.rs
pub mod logger;
pub mod directories;
pub mod directories;
pub mod opnv;

278
src/util/opnv.rs Normal file
View File

@@ -0,0 +1,278 @@
// src/util/opnv.rs
//! Module for fetching, downloading, and extracting OpenVPN configurations from VPNBook.
//!
//! This module provides functionality to scrape the VPNBook free VPN page using
//! a headless browser, handle potential consent popups, extract current credentials,
//! collect download URLs for OpenVPN ZIP files, download them, and then extract
//! the .ovpn files into a structured directory: cache/openvpn/<hostname>/<ovpn_filename>.
//! It is designed to fetch the most recent data on every run, as credentials and
//! server configurations change periodically.
use anyhow::{anyhow, Context, Result};
use fantoccini::{Client, Locator};
use regex::Regex;
use reqwest;
use std::io::{Read};
use std::path::{Path, PathBuf};
use tokio::fs::File;
use tokio::io::AsyncWriteExt;
use url::Url;
use zip::ZipArchive;
use crate::scraper::webdriver::{ChromeDriverPool, ScrapeTask};
/// Fetches, downloads, and extracts the latest OpenVPN configurations from VPNBook.
///
/// This asynchronous function uses the provided `ChromeDriverPool` to scrape the
/// VPNBook free VPN page. It dismisses any consent popup if present, extracts the
/// current username and password, collects all OpenVPN ZIP download URLs, downloads
/// the ZIP files temporarily, extracts the .ovpn files into the specified directory
/// structure under `cache_dir`/openvpn/<hostname>/, and cleans up the ZIP files.
///
/// The directory structure is: cache/openvpn/<hostname>/<ovpn_filename>, where
/// <hostname> is derived from the ZIP filename (e.g., "ca149.vpnbook.com").
///
/// The function ensures fresh data is fetched each time it runs, making it suitable
/// for periodic updates where credentials may change.
///
/// # Arguments
///
/// * `pool` - A reference to the `ChromeDriverPool` for managing browser instances.
/// * `cache_dir` - The path to the base cache directory. The OpenVPN files will be saved
/// under `cache_dir`/openvpn/<hostname>/.
///
/// # Returns
///
/// A `Result` containing a tuple with:
/// - `String`: The scraped username.
/// - `String`: The scraped password.
/// - `Vec<PathBuf>`: Paths to the extracted .ovpn files.
///
/// # Errors
///
/// Returns an `anyhow::Error` if:
/// - Navigation to the page fails.
/// - The consent popup cannot be dismissed (if present).
/// - Credentials cannot be parsed from the page.
/// - Download URLs cannot be found or are invalid.
/// - HTTP downloads fail or file writing errors occur.
/// - ZIP extraction fails (e.g., invalid ZIP or I/O errors).
///
/// # Dependencies
///
/// This function requires the following crates (add to Cargo.toml if not present):
/// - `anyhow` for error handling.
/// - `fantoccini` for browser automation.
/// - `regex` for parsing credentials from HTML.
/// - `reqwest` (with `tokio` features) for HTTP downloads.
/// - `tokio` for asynchronous file operations.
/// - `url` for URL manipulation.
/// - `zip` for ZIP extraction.
///
/// # Examples
///
/// ```no_run
/// use anyhow::Result;
/// use event_backtest_engine::scraper::opnv::fetch_vpnbook_configs;
/// use event_backtest_engine::scraper::webdriver::ChromeDriverPool;
/// use std::path::Path;
///
/// #[tokio::main]
/// async fn main() -> Result<()> {
/// let pool = ChromeDriverPool::new(1).await?;
/// let (username, password, files) =
/// fetch_vpnbook_configs(&pool, Path::new("./cache")).await?;
/// println!("Username: {}, Password: {}", username, password);
/// for file in files {
/// println!("Extracted: {:?}", file);
/// }
/// Ok(())
/// }
/// ```
pub async fn fetch_vpnbook_configs(
pool: &ChromeDriverPool,
cache_dir: &Path,
) -> Result<(String, String, Vec<PathBuf>)> {
// Prepare the openvpn directory
let vpn_dir = cache_dir.join("openvpn");
tokio::fs::create_dir_all(&vpn_dir)
.await
.context("Failed to create openvpn directory")?;
// Temporary directory for ZIP downloads (under cache for consistency)
let temp_dir = cache_dir.join("temp_vpn_zips");
tokio::fs::create_dir_all(&temp_dir)
.await
.context("Failed to create temp directory")?;
let url = "https://www.vpnbook.com/freevpn".to_string();
// Define the scraping task
let task = ScrapeTask::new(url, |client: Client| async move {
// Attempt to dismiss consent popup if present
let consent_selector = r#"body > div.fc-consent-root > div.fc-dialog-container > div.fc-dialog.fc-choice-dialog > div.fc-footer-buttons-container > div.fc-footer-buttons > button.fc-button.fc-cta-do-not-consent.fc-secondary-button > p"#;
if let Ok(consent_elem) = client.find(Locator::Css(consent_selector)).await {
consent_elem
.click()
.await
.context("Failed to click consent dismissal button")?;
// Brief delay to allow popup to close
tokio::time::sleep(std::time::Duration::from_secs(1)).await;
}
// Get the full page source for parsing
let page_source = client
.source()
.await
.context("Failed to retrieve page source")?;
// Parse username and password using regex (assuming HTML structure like <strong>Username:</strong> value)
let user_re =
Regex::new(r"Username:\s*</strong>\s*(\w+)").context("Invalid regex for username")?;
let pass_re =
Regex::new(r"Password:\s*</strong>\s*(\w+)").context("Invalid regex for password")?;
let username = user_re
.captures(&page_source)
.and_then(|c| c.get(1))
.map(|m| m.as_str().to_string())
.ok_or_else(|| anyhow!("Username not found in page source"))?;
let password = pass_re
.captures(&page_source)
.and_then(|c| c.get(1))
.map(|m| m.as_str().to_string())
.ok_or_else(|| anyhow!("Password not found in page source"))?;
// Locate all download links for OpenVPN ZIP files
let links = client
.find_all(Locator::Css(r#"a[href^="/free-openvpn-account/"][download=""]"#))
.await
.context("Failed to find download links")?;
// Collect relative hrefs
let mut rel_urls = Vec::new();
for link in links {
if let Some(href) = link.attr("href").await.context("Failed to get href attribute")? {
rel_urls.push(href);
}
}
Ok::<(String, String, Vec<String>), anyhow::Error>((username, password, rel_urls))
});
// Execute the scraping task using the pool
let (username, password, rel_urls) = task.execute_with_pool(pool).await?;
// Base URL for resolving relative paths
let base_url = Url::parse("https://www.vpnbook.com/")?;
// Download each ZIP file to temp_dir
let mut zip_paths = Vec::new();
for rel in &rel_urls {
let full_url = base_url.join(rel).context("Failed to join URL")?;
let filename = rel
.split('/')
.last()
.ok_or_else(|| anyhow!("Invalid filename in URL"))?
.to_string();
let out_path = temp_dir.join(&filename);
// Perform HTTP GET request
let resp = reqwest::get(full_url.clone())
.await
.with_context(|| format!("Failed to send download request for {}", full_url))?;
if resp.status().is_success() {
let bytes = resp
.bytes()
.await
.context("Failed to read response bytes")?;
// Write to file asynchronously
let mut file = File::create(&out_path)
.await
.context("Failed to create output file")?;
file.write_all(&bytes)
.await
.context("Failed to write to file")?;
zip_paths.push(out_path);
} else {
return Err(anyhow!(
"Download failed with status: {} for URL: {}",
resp.status(),
full_url
));
}
}
// Now extract .ovpn files from each ZIP
let mut extracted_paths = Vec::new();
for zip_path in zip_paths {
let hostname = get_hostname_from_zip_filename(
zip_path.file_name().unwrap().to_str().unwrap(),
);
let hostname_dir = vpn_dir.join(&hostname);
tokio::fs::create_dir_all(&hostname_dir)
.await
.context("Failed to create hostname directory")?;
// Use spawn_blocking for sync ZIP operations
let zip_path_clone = zip_path.clone();
let hostname_dir_clone = hostname_dir.clone();
let extract_result = tokio::task::spawn_blocking(move || {
let file = std::fs::File::open(&zip_path_clone)
.with_context(|| format!("Failed to open ZIP file: {:?}", zip_path_clone))?;
let mut archive = ZipArchive::new(file)
.with_context(|| format!("Failed to read ZIP archive: {:?}", zip_path_clone))?;
let mut paths = Vec::new();
for i in 0..archive.len() {
let mut zip_file = archive.by_index(i)?;
if zip_file.name().ends_with(".ovpn") {
let target_path = hostname_dir_clone.join(zip_file.name());
let mut content = Vec::new();
zip_file.read_to_end(&mut content)?;
std::fs::write(&target_path, &content)
.with_context(|| format!("Failed to write .ovpn file: {:?}", target_path))?;
paths.push(target_path);
}
}
Ok::<Vec<PathBuf>, anyhow::Error>(paths)
})
.await
.context("Spawn blocking failed")??;
extracted_paths.extend(extract_result);
// Clean up the ZIP file after extraction
tokio::fs::remove_file(&zip_path)
.await
.context("Failed to remove temp ZIP file")?;
}
// Optional: Clean up temp_dir if empty
let _ = tokio::fs::remove_dir(&temp_dir).await;
Ok((username, password, extracted_paths))
}
/// Maps a VPNBook ZIP filename to the hostname used as its extraction directory.
///
/// A name of the form `vpnbook-openvpn-<code>.zip` becomes `<code>.vpnbook.com`
/// (e.g. `vpnbook-openvpn-ca149.zip` -> `ca149.vpnbook.com`). Any other name
/// yields the placeholder `unknown.vpnbook.com`.
fn get_hostname_from_zip_filename(filename: &str) -> String {
    let code = filename
        .strip_prefix("vpnbook-openvpn-")
        .and_then(|rest| rest.strip_suffix(".zip"));

    match code {
        Some(code) => format!("{}.vpnbook.com", code),
        None => String::from("unknown.vpnbook.com"),
    }
}