WebScraper/src/util/opnv.rs

// src/scraper/opnv.rs

//! Module for fetching, downloading, and extracting OpenVPN configurations from VPNBook.
//!
//! This module provides functionality to scrape the VPNBook free VPN page using
//! a headless browser, handle potential consent popups, extract current credentials,
//! collect download URLs for OpenVPN ZIP files, download them, and then extract
//! the .ovpn files into a structured directory: cache/openvpn/<hostname>/<ovpn_filename>.
//! It is designed to fetch the most recent data on every run, as credentials and
//! server configurations change periodically.

use anyhow::{anyhow, Context, Result};
use fantoccini::{Client, Locator};
use reqwest;
use std::io::{Read};
use std::path::{Path, PathBuf};
use tokio::fs::File;
use tokio::io::AsyncWriteExt;
use url::Url;
use zip::ZipArchive;
use crate::scraper::webdriver::{ChromeDriverPool, ScrapeTask};
use crate::util::{directories::DataPaths};

/// Fetches, downloads, and extracts the latest OpenVPN configurations from VPNBook.
///
/// Every call performs the full workflow (nothing is reused from earlier runs):
///
/// 1. Loads `https://www.vpnbook.com/freevpn` through a browser taken from `pool`
///    and clicks the "do not consent" button of the consent popup if it is found.
/// 2. Reads the username and password from the first two `<code>` elements.
/// 3. Collects the `href` of every OpenVPN ZIP download link on the page.
/// 4. Downloads each ZIP into `cache_dir`/temp_vpn_zips.
/// 5. Extracts every `.ovpn` entry into `<openvpn dir>/<hostname>/<ovpn_filename>`
///    and deletes the ZIP afterwards.
///
/// `<hostname>` is derived from the ZIP filename
/// (e.g., "vpnbook-openvpn-ca149.zip" -> "ca149.vpnbook.com").
///
/// # Arguments
///
/// * `pool` - A reference to the `ChromeDriverPool` for managing browser instances.
/// * `cache_dir` - Base directory for the temporary ZIP downloads
///   (`cache_dir`/temp_vpn_zips). Note that the extraction target `<openvpn dir>`
///   is obtained from `DataPaths::new(".")?.cache_openvpn_dir()` and does not
///   depend on this argument.
///
/// # Returns
///
/// A `Result` containing a tuple with:
/// - `String`: The scraped username.
/// - `String`: The scraped password.
/// - `Vec<PathBuf>`: Paths to the extracted .ovpn files.
///
/// # Errors
///
/// Returns an `anyhow::Error` if:
/// - The data paths cannot be resolved or a directory cannot be created.
/// - The scraping task fails (including a failed click on a found consent button).
/// - Fewer than two `<code>` elements are present, or their text cannot be read.
/// - A download URL cannot be resolved or does not end in a plain file name.
/// - An HTTP download fails, returns a non-success status, or cannot be written.
/// - ZIP extraction fails (e.g., invalid ZIP or I/O errors).
/// - A temporary ZIP file cannot be removed after extraction.
///
/// # Dependencies
///
/// This function requires the following crates (add to Cargo.toml if not present):
/// - `anyhow` for error handling.
/// - `fantoccini` for browser automation.
/// - `reqwest` (with `tokio` features) for HTTP downloads.
/// - `tokio` for asynchronous file operations.
/// - `url` for URL manipulation.
/// - `zip` for ZIP extraction.
///
/// # Examples
///
/// ```no_run
/// use anyhow::Result;
/// use event_backtest_engine::scraper::opnv::fetch_vpnbook_configs;
/// use event_backtest_engine::scraper::webdriver::ChromeDriverPool;
/// use std::path::Path;
///
/// #[tokio::main]
/// async fn main() -> Result<()> {
///     let pool = ChromeDriverPool::new(1).await?;
///     let (username, password, files) =
///         fetch_vpnbook_configs(&pool, Path::new("./cache")).await?;
///     println!("Username: {}, Password: {}", username, password);
///     for file in files {
///         println!("Extracted: {:?}", file);
///     }
///     Ok(())
/// }
/// ```
pub async fn fetch_vpnbook_configs(
    pool: &ChromeDriverPool,
    cache_dir: &Path,
) -> Result<(String, String, Vec<PathBuf>)> {
    // Prepare the openvpn directory (resolved through DataPaths, not `cache_dir`)
    let dir = DataPaths::new(".")?;
    let vpn_dir = dir.cache_openvpn_dir();
    tokio::fs::create_dir_all(&vpn_dir)
        .await
        .context("Failed to create openvpn directory")?;

    // Temporary directory for ZIP downloads (under cache for consistency)
    let temp_dir = cache_dir.join("temp_vpn_zips");
    tokio::fs::create_dir_all(&temp_dir)
        .await
        .context("Failed to create temp directory")?;

    let url = "https://www.vpnbook.com/freevpn".to_string();

    // Define the scraping task
    let task = ScrapeTask::new(url, |client: Client| async move {
        // Attempt to dismiss consent popup if present. A missing popup is not an
        // error (best effort), but a failed click on a found button is.
        let consent_selector = r#"body > div.fc-consent-root > div.fc-dialog-container > div.fc-dialog.fc-choice-dialog > div.fc-footer-buttons-container > div.fc-footer-buttons > button.fc-button.fc-cta-do-not-consent.fc-secondary-button > p"#;
        if let Ok(consent_elem) = client.find(Locator::Css(consent_selector)).await {
            consent_elem
                .click()
                .await
                .context("Failed to click consent dismissal button")?;
            // Brief delay to allow popup to close
            tokio::time::sleep(std::time::Duration::from_secs(1)).await;
        }

        // Find all <code> elements
        let codes = client
            .find_all(Locator::Css("code"))
            .await
            .context("Failed to find code elements")?;

        if codes.len() < 2 {
            return Err(anyhow!("Insufficient code elements found for credentials"));
        }

        // The first <code> is username, second is password
        let username = codes[0]
            .text()
            .await
            .context("Failed to get username text")?;

        let password = codes[1]
            .text()
            .await
            .context("Failed to get password text")?;

        // Locate all download links for OpenVPN ZIP files
        let links = client
            .find_all(Locator::Css(r#"a[href^="/free-openvpn-account/"][download=""]"#))
            .await
            .context("Failed to find download links")?;

        // Collect relative hrefs
        let mut rel_urls = Vec::new();
        for link in links {
            if let Some(href) = link.attr("href").await.context("Failed to get href attribute")? {
                rel_urls.push(href);
            }
        }

        Ok((username, password, rel_urls))
    });

    // Execute the scraping task using the pool
    let (username, password, rel_urls) = task.execute_with_pool(pool).await?;

    // Base URL for resolving relative paths
    let base_url = Url::parse("https://www.vpnbook.com/")?;

    // One HTTP client for all downloads so connections can be reused.
    let http = reqwest::Client::new();

    // Download each ZIP file to temp_dir, remembering its validated file name
    // so that no lossy path -> string conversion is needed later.
    let mut downloads: Vec<(String, PathBuf)> = Vec::with_capacity(rel_urls.len());
    for rel in &rel_urls {
        let full_url = base_url.join(rel).context("Failed to join URL")?;

        // The href comes from a remote page: take the last segment of the parsed
        // URL path (no query/fragment) and require it to be a plain file name.
        // `Path::file_name` yields `None` for "" and "..".
        let filename = full_url
            .path()
            .rsplit('/')
            .next()
            .and_then(|segment| Path::new(segment).file_name())
            .and_then(|name| name.to_str())
            .ok_or_else(|| anyhow!("Invalid filename in URL"))?
            .to_string();
        let out_path = temp_dir.join(&filename);

        // Perform HTTP GET request
        let resp = http
            .get(full_url.clone())
            .send()
            .await
            .with_context(|| format!("Failed to send download request for {}", full_url))?;

        let status = resp.status();
        if !status.is_success() {
            return Err(anyhow!(
                "Download failed with status: {} for URL: {}",
                status,
                full_url
            ));
        }

        let bytes = resp
            .bytes()
            .await
            .context("Failed to read response bytes")?;

        // Write to file asynchronously
        let mut file = File::create(&out_path)
            .await
            .context("Failed to create output file")?;
        file.write_all(&bytes)
            .await
            .context("Failed to write to file")?;
        // tokio's `File` completes writes in the background; flush so the whole
        // ZIP is on disk before the blocking extractor opens it.
        file.flush()
            .await
            .context("Failed to flush output file")?;

        downloads.push((filename, out_path));
    }

    // Now extract .ovpn files from each ZIP
    let mut extracted_paths = Vec::new();
    for (zip_name, zip_path) in downloads {
        let hostname = get_hostname_from_zip_filename(&zip_name);
        let hostname_dir = vpn_dir.join(&hostname);
        tokio::fs::create_dir_all(&hostname_dir)
            .await
            .context("Failed to create hostname directory")?;

        // Use spawn_blocking for sync ZIP operations
        let zip_for_task = zip_path.clone();
        let extract_result = tokio::task::spawn_blocking(move || {
            extract_ovpn_entries(&zip_for_task, &hostname_dir)
        })
        .await
        .context("Spawn blocking failed")??;

        extracted_paths.extend(extract_result);

        // Clean up the ZIP file after extraction
        tokio::fs::remove_file(&zip_path)
            .await
            .context("Failed to remove temp ZIP file")?;
    }

    // Optional: Clean up temp_dir if empty (best effort, failure is ignored)
    let _ = tokio::fs::remove_dir(&temp_dir).await;

    Ok((username, password, extracted_paths))
}

/// Extracts every entry whose name ends with `.ovpn` from the ZIP at `zip_path`
/// into `dest_dir` and returns the paths of the written files.
///
/// Only the final path component of each entry name is used for the target
/// file, so directory parts stored inside the archive are discarded. This is
/// synchronous I/O and is meant to be called from a blocking context.
fn extract_ovpn_entries(zip_path: &Path, dest_dir: &Path) -> Result<Vec<PathBuf>> {
    let file = std::fs::File::open(zip_path)
        .with_context(|| format!("Failed to open ZIP file: {:?}", zip_path))?;
    let mut archive = ZipArchive::new(file)
        .with_context(|| format!("Failed to read ZIP archive: {:?}", zip_path))?;

    let mut paths = Vec::new();
    for i in 0..archive.len() {
        let mut zip_file = archive.by_index(i)?;
        if !zip_file.name().ends_with(".ovpn") {
            continue;
        }

        // Get just the filename, stripping any path
        let file_name = Path::new(zip_file.name())
            .file_name()
            .ok_or_else(|| anyhow!("Invalid file name in ZIP: {}", zip_file.name()))?
            .to_str()
            .ok_or_else(|| anyhow!("Invalid UTF-8 in file name: {}", zip_file.name()))?
            .to_string();
        let target_path = dest_dir.join(file_name);

        let mut content = Vec::new();
        zip_file.read_to_end(&mut content)?;

        std::fs::write(&target_path, &content)
            .with_context(|| format!("Failed to write .ovpn file: {:?}", target_path))?;
        paths.push(target_path);
    }
    Ok(paths)
}

/// Derives the hostname from the ZIP filename.
///
/// The filename must have the form `vpnbook-openvpn-<code>.zip` with a
/// non-empty `<code>`; the result is `<code>.vpnbook.com`.
///
/// For example, "vpnbook-openvpn-ca149.zip" -> "ca149.vpnbook.com"
///
/// If the format doesn't match (including an empty `<code>`, which would
/// otherwise produce the invalid hostname ".vpnbook.com"), returns
/// "unknown.vpnbook.com".
fn get_hostname_from_zip_filename(filename: &str) -> String {
    filename
        .strip_prefix("vpnbook-openvpn-")
        .and_then(|rest| rest.strip_suffix(".zip"))
        // An empty server code is treated as a non-matching filename.
        .filter(|code| !code.is_empty())
        .map_or_else(
            || "unknown.vpnbook.com".to_string(),
            |code| format!("{}.vpnbook.com", code),
        )
}