Files
WebScraper/src/main.rs

197 lines
7.4 KiB
Rust
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
// src/main.rs
use web_scraper::{*, scraper, economic, corporate};
use anyhow::Result;
use web_scraper::config::Config;
use scraper::docker_vpn_proxy::{DockerVpnProxyPool, cleanup_all_proxy_containers};
use scraper::webdriver::ChromeDriverPool;
use util::directories::DataPaths;
use util::{logger, opnv};
use std::sync::Arc;
use std::sync::atomic::{AtomicBool, Ordering};
use std::process::Command;
#[tokio::main]
async fn main() -> Result<()> {
let output = if cfg!(target_os = "windows") {
Command::new("cmd")
.args(["/C", "docker desktop start"])
.output()
.expect("failed to execute process")
} else {
Command::new("sh")
.arg("-c")
.arg("echo hello")
.output()
.expect("failed to execute process")
};
let _start_docker_desktop = output.stdout;
cleanup_all_proxy_containers().await.ok();
let config = match Config::load() {
Ok(cfg) => cfg,
Err(_) => {
eprintln!("Using default configuration");
Config::default()
}
};
let paths = DataPaths::new(".")?;
// Initialize monitoring system
let config_snapshot = ConfigSnapshot {
max_parallel_instances: config.max_parallel_instances,
max_tasks_per_instance: config.max_tasks_per_instance,
enable_vpn_rotation: config.enable_vpn_rotation,
max_requests_per_session: config.max_requests_per_session,
min_request_interval_ms: config.min_request_interval_ms,
max_retry_attempts: config.max_retry_attempts,
};
let (monitoring_handle, _monitoring_task) = init_monitoring(
config_snapshot,
paths.logs_dir().to_path_buf(),
3030, // Dashboard port
).await?;
// Emit pool initialization event
monitoring_handle.emit(monitoring::MonitoringEvent::PoolInitialized {
pool_size: config.max_parallel_instances,
with_proxy: config.enable_vpn_rotation,
with_rotation: config.max_tasks_per_instance > 0,
});
logger::log_info("Monitoring dashboard available at http://localhost:3030").await;
logger::init_debug_logger(paths.logs_dir()).await.ok();
logger::log_info("=== Economic Webscraper Started ===").await;
logger::log_info(&format!(
"Config → parallel_instances: {}, task_limit: {} vpn_rotation: {} proxy_instances_per_certificate: {:?}",
config.max_parallel_instances,
config.max_tasks_per_instance,
config.enable_vpn_rotation,
config.proxy_instances_per_certificate
)).await;
let number_proxy_instances_per_certificate = config.proxy_instances_per_certificate.unwrap_or(1);
// Simple shutdown flag
let shutdown_flag = Arc::new(AtomicBool::new(false));
// === Step 1: Fetch VPNBook configs ===
let proxy_pool: Option<Arc<DockerVpnProxyPool>> = if config.enable_vpn_rotation {
logger::log_info("VPN Rotation Enabled Fetching latest VPNBook configs").await;
let temp_pool = Arc::new(ChromeDriverPool::new_with_proxy_and_task_limit(None, &config, Some(monitoring_handle.clone())).await?);
let (username, password, _files) = opnv::fetch_vpnbook_configs(&temp_pool, paths.cache_dir()).await?;
logger::log_info(&format!("VPNBook credentials → User: {}", username)).await;
let server_count = std::fs::read_dir(paths.cache_openvpn_dir())?
.filter(|e| e.as_ref().unwrap().path().is_dir())
.count();
if server_count == 0 {
logger::log_warn("No VPN servers found continuing without VPN").await;
None
} else {
logger::log_info(&format!("Found {} VPN servers starting Docker proxy containers", server_count)).await;
let pp = Arc::new(DockerVpnProxyPool::new(paths.cache_openvpn_dir(), username, password, number_proxy_instances_per_certificate).await?);
logger::log_info(&format!("All {} Docker proxy containers started and ready", pp.num_proxies())).await;
for i in 0..pp.num_proxies() {
if let Some(proxy_info) = pp.get_proxy_info(i) {
monitoring_handle.emit(monitoring::MonitoringEvent::ProxyConnected {
container_name: proxy_info.container_name.clone(),
ip_address: proxy_info.ip_address.clone(),
port: proxy_info.port,
});
}
}
Some(pp)
}
} else {
logger::log_info("VPN rotation disabled using direct connection").await;
None
};
// === Step 2: Initialize ChromeDriver pool ===
let pool_size_limit = config.max_parallel_instances;
let task_limit = config.max_tasks_per_instance;
logger::log_info(&format!("Creating ChromeDriver pool with {} instances...", pool_size_limit)).await;
let pool = Arc::new(
if task_limit > 0 {
ChromeDriverPool::new_with_proxy_and_task_limit(proxy_pool.clone(), &config, Some(monitoring_handle.clone())).await?
} else {
ChromeDriverPool::new_with_proxy_and_task_limit(proxy_pool.clone(), &config, Some(monitoring_handle.clone())).await?
}
);
logger::log_info(&format!("ChromeDriver pool ready with {} instances", pool_size_limit)).await;
// === Step 3: Ctrl+C handler ===
{
let shutdown_flag_clone = Arc::clone(&shutdown_flag);
let pool_clone = Arc::clone(&pool);
let proxy_clone = proxy_pool.clone();
tokio::spawn(async move {
tokio::signal::ctrl_c().await.ok();
logger::log_info("Ctrl+C received shutting down gracefully...").await;
// Set flag first
shutdown_flag_clone.store(true, Ordering::SeqCst);
// Wait a bit for tasks to notice
tokio::time::sleep(tokio::time::Duration::from_secs(2)).await;
// Cleanup
if let Err(e) = (&*pool_clone).shutdown().await {
logger::log_error(&format!("Error during pool shutdown: {}", e)).await;
}
if let Some(pp) = proxy_clone {
if let Err(e) = pp.shutdown().await {
logger::log_warn(&format!("Failed to stop Docker containers: {}", e)).await;
} else {
logger::log_info("All Docker VPN containers stopped").await;
}
}
let _ = cleanup_all_proxy_containers().await;
std::process::exit(0);
});
}
// === Step 4: Run scraping jobs ===
logger::log_info("--- Starting ECONOMIC data update ---").await;
//economic::run_full_update(&config, &pool).await?;
logger::log_info("Economic update completed").await;
if !shutdown_flag.load(Ordering::SeqCst) {
logger::log_info("--- Starting CORPORATE data update ---").await;
corporate::run_full_update(&config, &pool, &shutdown_flag).await?;
logger::log_info("Corporate update completed").await;
}
// === Step 5: Final cleanup ===
if !shutdown_flag.load(Ordering::SeqCst) {
logger::log_info("Shutting down ChromeDriver pool...").await;
pool.shutdown().await?;
if let Some(pp) = proxy_pool {
logger::log_info("Stopping Docker VPN proxy containers...").await;
pp.shutdown().await?;
cleanup_all_proxy_containers().await.ok();
}
logger::log_info("=== Application finished successfully ===").await;
}
Ok(())
}