Files
WebScraper/src/util/opnv.rs
2025-12-09 23:27:14 +01:00

281 lines
10 KiB
Rust

// src/util/opnv.rs
//! Module for fetching, downloading, and extracting OpenVPN configurations from VPNBook.
//!
//! This module provides functionality to scrape the VPNBook free VPN page using
//! a headless browser, handle potential consent popups, extract current credentials,
//! collect download URLs for OpenVPN ZIP files, download them, and then extract
//! the .ovpn files into a structured directory: cache/openvpn/<hostname>/<ovpn_filename>.
//! It is designed to fetch the most recent data on every run, as credentials and
//! server configurations change periodically.
use anyhow::{anyhow, Context, Result};
use fantoccini::{Client, Locator};
use reqwest;
use std::io::{Read};
use std::path::{Path, PathBuf};
use tokio::fs::File;
use tokio::io::AsyncWriteExt;
use url::Url;
use zip::ZipArchive;
use crate::scraper::webdriver::{ChromeDriverPool, ScrapeTask};
use crate::util::{directories::DataPaths};
/// Fetches, downloads, and extracts the latest OpenVPN configurations from VPNBook.
///
/// This asynchronous function uses the provided `ChromeDriverPool` to scrape the
/// VPNBook free VPN page. It dismisses any consent popup if present, extracts the
/// current username and password, collects all OpenVPN ZIP download URLs, downloads
/// the ZIP files temporarily, extracts the .ovpn files into the specified directory
/// structure under `cache_dir`/openvpn/<hostname>/, and cleans up the ZIP files.
///
/// The directory structure is: cache/openvpn/<hostname>/<ovpn_filename>, where
/// <hostname> is derived from the ZIP filename (e.g., "ca149.vpnbook.com").
///
/// The function ensures fresh data is fetched each time it runs, making it suitable
/// for periodic updates where credentials may change.
///
/// # Arguments
///
/// * `pool` - A reference to the `ChromeDriverPool` for managing browser instances.
/// * `cache_dir` - The path to the base cache directory. The OpenVPN files will be saved
/// under `cache_dir`/openvpn/<hostname>/.
///
/// # Returns
///
/// A `Result` containing a tuple with:
/// - `String`: The scraped username.
/// - `String`: The scraped password.
/// - `Vec<PathBuf>`: Paths to the extracted .ovpn files.
///
/// # Errors
///
/// Returns an `anyhow::Error` if:
/// - Navigation to the page fails.
/// - The consent popup cannot be dismissed (if present).
/// - Credentials cannot be parsed from the page.
/// - Download URLs cannot be found or are invalid.
/// - HTTP downloads fail or file writing errors occur.
/// - ZIP extraction fails (e.g., invalid ZIP or I/O errors).
///
/// # Dependencies
///
/// This function requires the following crates (add to Cargo.toml if not present):
/// - `anyhow` for error handling.
/// - `fantoccini` for browser automation.
/// - `reqwest` (with `tokio` features) for HTTP downloads.
/// - `tokio` for asynchronous file operations.
/// - `url` for URL manipulation.
/// - `zip` for ZIP extraction.
///
/// # Examples
///
/// ```no_run
/// use anyhow::Result;
/// use event_backtest_engine::scraper::opnv::fetch_vpnbook_configs;
/// use event_backtest_engine::scraper::webdriver::ChromeDriverPool;
/// use std::path::Path;
///
/// #[tokio::main]
/// async fn main() -> Result<()> {
/// let pool = ChromeDriverPool::new(1).await?;
/// let (username, password, files) =
/// fetch_vpnbook_configs(&pool, Path::new("./cache")).await?;
/// println!("Username: {}, Password: {}", username, password);
/// for file in files {
/// println!("Extracted: {:?}", file);
/// }
/// Ok(())
/// }
/// ```
pub async fn fetch_vpnbook_configs(
pool: &ChromeDriverPool,
cache_dir: &Path,
) -> Result<(String, String, Vec<PathBuf>)> {
// Prepare the openvpn directory
let dir = DataPaths::new(".")?;
let vpn_dir = dir.cache_openvpn_dir();
tokio::fs::create_dir_all(&vpn_dir)
.await
.context("Failed to create openvpn directory")?;
// Temporary directory for ZIP downloads (under cache for consistency)
let temp_dir = cache_dir.join("temp_vpn_zips");
tokio::fs::create_dir_all(&temp_dir)
.await
.context("Failed to create temp directory")?;
let url = "https://www.vpnbook.com/freevpn".to_string();
// Define the scraping task
let task = ScrapeTask::new(url, |client: Client| async move {
// Attempt to dismiss consent popup if present
let consent_selector = r#"body > div.fc-consent-root > div.fc-dialog-container > div.fc-dialog.fc-choice-dialog > div.fc-footer-buttons-container > div.fc-footer-buttons > button.fc-button.fc-cta-do-not-consent.fc-secondary-button > p"#;
if let Ok(consent_elem) = client.find(Locator::Css(consent_selector)).await {
consent_elem
.click()
.await
.context("Failed to click consent dismissal button")?;
// Brief delay to allow popup to close
tokio::time::sleep(std::time::Duration::from_secs(1)).await;
}
// Find all <code> elements
let codes = client
.find_all(Locator::Css("code"))
.await
.context("Failed to find code elements")?;
if codes.len() < 2 {
return Err(anyhow!("Insufficient code elements found for credentials"));
}
// The first <code> is username, second is password
let username = codes[0]
.text()
.await
.context("Failed to get username text")?;
let password = codes[1]
.text()
.await
.context("Failed to get password text")?;
// Locate all download links for OpenVPN ZIP files
let links = client
.find_all(Locator::Css(r#"a[href^="/free-openvpn-account/"][download=""]"#))
.await
.context("Failed to find download links")?;
// Collect relative hrefs
let mut rel_urls = Vec::new();
for link in links {
if let Some(href) = link.attr("href").await.context("Failed to get href attribute")? {
rel_urls.push(href);
}
}
Ok((username, password, rel_urls))
});
// Execute the scraping task using the pool
let (username, password, rel_urls) = task.execute_with_pool(pool).await?;
// Base URL for resolving relative paths
let base_url = Url::parse("https://www.vpnbook.com/")?;
// Download each ZIP file to temp_dir
let mut zip_paths = Vec::new();
for rel in &rel_urls {
let full_url = base_url.join(rel).context("Failed to join URL")?;
let filename = rel
.split('/')
.last()
.ok_or_else(|| anyhow!("Invalid filename in URL"))?
.to_string();
let out_path = temp_dir.join(&filename);
// Perform HTTP GET request
let resp = reqwest::get(full_url.clone())
.await
.with_context(|| format!("Failed to send download request for {}", full_url))?;
if resp.status().is_success() {
let bytes = resp
.bytes()
.await
.context("Failed to read response bytes")?;
// Write to file asynchronously
let mut file = File::create(&out_path)
.await
.context("Failed to create output file")?;
file.write_all(&bytes)
.await
.context("Failed to write to file")?;
zip_paths.push(out_path);
} else {
return Err(anyhow!(
"Download failed with status: {} for URL: {}",
resp.status(),
full_url
));
}
}
// Now extract .ovpn files from each ZIP
let mut extracted_paths = Vec::new();
for zip_path in zip_paths {
let hostname = get_hostname_from_zip_filename(
zip_path.file_name().unwrap().to_str().unwrap(),
);
let hostname_dir = vpn_dir.join(&hostname);
tokio::fs::create_dir_all(&hostname_dir)
.await
.context("Failed to create hostname directory")?;
// Use spawn_blocking for sync ZIP operations
let zip_path_clone = zip_path.clone();
let hostname_dir_clone = hostname_dir.clone();
let extract_result = tokio::task::spawn_blocking(move || {
let file = std::fs::File::open(&zip_path_clone)
.with_context(|| format!("Failed to open ZIP file: {:?}", zip_path_clone))?;
let mut archive = ZipArchive::new(file)
.with_context(|| format!("Failed to read ZIP archive: {:?}", zip_path_clone))?;
let mut paths = Vec::new();
for i in 0..archive.len() {
let mut zip_file = archive.by_index(i)?;
if zip_file.name().ends_with(".ovpn") {
// Get just the filename, stripping any path
let file_name = Path::new(zip_file.name()).file_name()
.ok_or_else(|| anyhow!("Invalid file name in ZIP: {}", zip_file.name()))?
.to_str()
.ok_or_else(|| anyhow!("Invalid UTF-8 in file name: {}", zip_file.name()))?
.to_string();
let target_path = hostname_dir_clone.join(file_name);
let mut content = Vec::new();
zip_file.read_to_end(&mut content)?;
std::fs::write(&target_path, &content)
.with_context(|| format!("Failed to write .ovpn file: {:?}", target_path))?;
paths.push(target_path);
}
}
Ok::<Vec<PathBuf>, anyhow::Error>(paths)
})
.await
.context("Spawn blocking failed")??;
extracted_paths.extend(extract_result);
// Clean up the ZIP file after extraction
tokio::fs::remove_file(&zip_path)
.await
.context("Failed to remove temp ZIP file")?;
}
// Optional: Clean up temp_dir if empty
let _ = tokio::fs::remove_dir(&temp_dir).await;
Ok((username, password, extracted_paths))
}
/// Maps a VPNBook ZIP file name to the hostname of the server it describes.
///
/// A name of the form `vpnbook-openvpn-<code>.zip` yields `<code>.vpnbook.com`,
/// e.g. "vpnbook-openvpn-ca149.zip" -> "ca149.vpnbook.com".
///
/// Any name lacking either the `vpnbook-openvpn-` prefix or the `.zip` suffix
/// yields the placeholder "unknown.vpnbook.com".
fn get_hostname_from_zip_filename(filename: &str) -> String {
    // Peel off the fixed prefix and suffix; `None` means the name has another shape.
    let server_code = filename
        .strip_prefix("vpnbook-openvpn-")
        .and_then(|rest| rest.strip_suffix(".zip"));

    match server_code {
        Some(code) => format!("{}.vpnbook.com", code),
        None => String::from("unknown.vpnbook.com"),
    }
}