cleaned up main
This commit is contained in:
@@ -7,14 +7,12 @@
|
|||||||
use super::types::CompanyCrossPlatformInfo;
|
use super::types::CompanyCrossPlatformInfo;
|
||||||
use crate::util::logger;
|
use crate::util::logger;
|
||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
use std::path::{Path, PathBuf};
|
use std::path::{Path};
|
||||||
use std::sync::Arc;
|
use chrono::{DateTime, Duration, Utc};
|
||||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
use serde::{Deserialize, Serialize};
|
||||||
use serde::Serialize;
|
use tokio::fs::{File};
|
||||||
use tokio::fs::{File, OpenOptions};
|
|
||||||
use tokio::io::{AsyncWriteExt};
|
use tokio::io::{AsyncWriteExt};
|
||||||
use anyhow::Result;
|
use anyhow::Result;
|
||||||
use tokio::sync::mpsc;
|
|
||||||
|
|
||||||
/// Load companies from checkpoint and replay log for recovery
|
/// Load companies from checkpoint and replay log for recovery
|
||||||
///
|
///
|
||||||
|
|||||||
@@ -15,7 +15,7 @@ use crate::scraper::yahoo::{YahooClientPool};
|
|||||||
|
|
||||||
use std::result::Result::Ok;
|
use std::result::Result::Ok;
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
use std::sync::atomic::{AtomicBool, Ordering};
|
use std::sync::atomic::{AtomicBool};
|
||||||
|
|
||||||
/// Main corporate update entry point with shutdown awareness
|
/// Main corporate update entry point with shutdown awareness
|
||||||
pub async fn run_full_update(
|
pub async fn run_full_update(
|
||||||
|
|||||||
@@ -7,7 +7,7 @@ use crate::util::logger;
|
|||||||
use crate::scraper::yahoo::{YahooClientPool, QuoteSummaryModule};
|
use crate::scraper::yahoo::{YahooClientPool, QuoteSummaryModule};
|
||||||
|
|
||||||
use std::result::Result::Ok;
|
use std::result::Result::Ok;
|
||||||
use chrono::{Local, Utc};
|
use chrono::{Utc};
|
||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
|
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
|
||||||
@@ -191,6 +191,7 @@ pub async fn companies_yahoo_cleansed_low_profile(
|
|||||||
let input_path = data_path.join("companies_yahoo.jsonl");
|
let input_path = data_path.join("companies_yahoo.jsonl");
|
||||||
let checkpoint_path = data_path.join("companies_yahoo_cleaned.jsonl");
|
let checkpoint_path = data_path.join("companies_yahoo_cleaned.jsonl");
|
||||||
let log_path = data_path.join("companies_updates.log");
|
let log_path = data_path.join("companies_updates.log");
|
||||||
|
let state_path = data_path.join("state.jsonl");
|
||||||
|
|
||||||
// Check input exists
|
// Check input exists
|
||||||
if !input_path.exists() {
|
if !input_path.exists() {
|
||||||
@@ -198,6 +199,35 @@ pub async fn companies_yahoo_cleansed_low_profile(
|
|||||||
return Ok(0);
|
return Ok(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if state_path.exists() {
|
||||||
|
let state_content = tokio::fs::read_to_string(&state_path).await?;
|
||||||
|
|
||||||
|
for line in state_content.lines() {
|
||||||
|
if line.trim().is_empty() {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if let Ok(state) = serde_json::from_str::<serde_json::Value>(line) {
|
||||||
|
if state.get("yahoo_companies_cleansed_low_profile").and_then(|v| v.as_bool()).unwrap_or(false) {
|
||||||
|
logger::log_info(" Yahoo low profile cleansing already completed, reading existing file...").await;
|
||||||
|
|
||||||
|
if checkpoint_path.exists() {
|
||||||
|
let checkpoint_content = tokio::fs::read_to_string(&checkpoint_path).await?;
|
||||||
|
let count = checkpoint_content.lines()
|
||||||
|
.filter(|line| !line.trim().is_empty())
|
||||||
|
.count();
|
||||||
|
|
||||||
|
logger::log_info(&format!(" ✓ Found {} companies in companies_yahoo_cleaned.jsonl", count)).await;
|
||||||
|
return Ok(count);
|
||||||
|
} else {
|
||||||
|
logger::log_warn(" State indicates completion but companies_yahoo_cleaned.jsonl not found, re-running...").await;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// === RECOVERY PHASE: Load checkpoint + replay log ===
|
// === RECOVERY PHASE: Load checkpoint + replay log ===
|
||||||
let mut existing_companies: HashMap<String, CompanyCrossPlatformInfo> = HashMap::new();
|
let mut existing_companies: HashMap<String, CompanyCrossPlatformInfo> = HashMap::new();
|
||||||
let mut processed_names: std::collections::HashSet<String> = std::collections::HashSet::new();
|
let mut processed_names: std::collections::HashSet<String> = std::collections::HashSet::new();
|
||||||
@@ -616,7 +646,26 @@ pub async fn companies_yahoo_cleansed_low_profile(
|
|||||||
// Shutdown Yahoo pool
|
// Shutdown Yahoo pool
|
||||||
yahoo_pool.shutdown().await?;
|
yahoo_pool.shutdown().await?;
|
||||||
|
|
||||||
Ok(final_valid)
|
// Write completion milestone to state.jsonl
|
||||||
|
let state_path = data_path.join("state.jsonl");
|
||||||
|
let yahoo_low_profile = json!({
|
||||||
|
"yahoo_companies_cleansed_low_profile": true,
|
||||||
|
"completed_at": chrono::Utc::now().to_rfc3339(),
|
||||||
|
});
|
||||||
|
|
||||||
|
let mut state_file = OpenOptions::new()
|
||||||
|
.create(true)
|
||||||
|
.append(true)
|
||||||
|
.open(&state_path)
|
||||||
|
.await?;
|
||||||
|
let state_line = serde_json::to_string(&yahoo_low_profile)?;
|
||||||
|
state_file.write_all(state_line.as_bytes()).await?;
|
||||||
|
state_file.write_all(b"\n").await?;
|
||||||
|
state_file.flush().await?;
|
||||||
|
|
||||||
|
logger::log_info(&format!(" ✓ State milestone saved to: {:?}", state_path)).await;
|
||||||
|
|
||||||
|
Ok(final_count)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Helper function to spawn a validation task (reduces code duplication)
|
/// Helper function to spawn a validation task (reduces code duplication)
|
||||||
@@ -911,54 +960,9 @@ async fn save_company_core_data(
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
pub struct ProcessResult {
|
|
||||||
pub changes: Vec<CompanyEventChange>,
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn process_batch(
|
|
||||||
new_events: &[CompanyEvent],
|
|
||||||
existing: &mut HashMap<String, CompanyEvent>,
|
|
||||||
today: &str,
|
|
||||||
) -> ProcessResult {
|
|
||||||
let mut changes = Vec::new();
|
|
||||||
|
|
||||||
for new in new_events {
|
|
||||||
let key = event_key(new);
|
|
||||||
|
|
||||||
if let Some(old) = existing.get(&key) {
|
|
||||||
changes.extend(detect_changes(old, new, today));
|
|
||||||
existing.insert(key, new.clone());
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
let date_key = format!("{}|{}", new.ticker, new.date);
|
|
||||||
let mut found_old = None;
|
|
||||||
for (k, e) in existing.iter() {
|
|
||||||
if format!("{}|{}", e.ticker, e.date) == date_key && k != &key {
|
|
||||||
found_old = Some((k.clone(), e.clone()));
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if let Some((old_key, old_event)) = found_old {
|
|
||||||
if new.date.as_str() > today {
|
|
||||||
changes.push(CompanyEventChange {
|
|
||||||
ticker: new.ticker.clone(),
|
|
||||||
date: new.date.clone(),
|
|
||||||
field_changed: "time".to_string(),
|
|
||||||
old_value: old_event.time.clone(),
|
|
||||||
new_value: new.time.clone(),
|
|
||||||
detected_at: Local::now().format("%Y-%m-%d %H:%M:%S").to_string(),
|
|
||||||
});
|
|
||||||
}
|
|
||||||
existing.remove(&old_key);
|
|
||||||
}
|
|
||||||
|
|
||||||
existing.insert(key, new.clone());
|
|
||||||
}
|
|
||||||
|
|
||||||
ProcessResult { changes }
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Check if a company needs processing (validation check)
|
/// Check if a company needs processing (validation check)
|
||||||
fn company_needs_processing(
|
fn company_needs_processing(
|
||||||
|
|||||||
546
src/main.rs
546
src/main.rs
@@ -1,217 +1,52 @@
|
|||||||
// src/main.rs - FIXED: Proper temp pool cleanup
|
// src/main.rs - Cleaned up version with extracted helpers
|
||||||
|
|
||||||
use web_scraper::{*, scraper, corporate};
|
use web_scraper::{*, scraper, corporate};
|
||||||
|
use crate::check_shutdown;
|
||||||
use anyhow::Result;
|
use anyhow::{Result};
|
||||||
use web_scraper::config::Config;
|
use web_scraper::config::Config;
|
||||||
use scraper::docker_vpn_proxy::{DockerVpnProxyPool, cleanup_all_proxy_containers};
|
use scraper::docker_vpn_proxy::{DockerVpnProxyPool, cleanup_all_proxy_containers};
|
||||||
use scraper::webdriver::ChromeDriverPool;
|
use scraper::webdriver::ChromeDriverPool;
|
||||||
use util::directories::DataPaths;
|
use util::directories::DataPaths;
|
||||||
use util::{logger, opnv};
|
use util::{logger, opnv};
|
||||||
|
use std::fs::{OpenOptions};
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
use std::sync::atomic::{AtomicBool, Ordering};
|
use std::sync::atomic::{AtomicBool, Ordering};
|
||||||
use std::process::Command;
|
use std::process::Command;
|
||||||
|
use std::time::{Duration, Instant};
|
||||||
|
|
||||||
#[tokio::main]
|
// ============================================================================
|
||||||
async fn main() -> Result<()> {
|
// HELPER FUNCTIONS - Extracted to reduce duplication
|
||||||
let output = if cfg!(target_os = "windows") {
|
// ============================================================================
|
||||||
Command::new("cmd")
|
|
||||||
|
/// Start Docker Desktop on Windows
|
||||||
|
async fn start_docker_desktop() {
|
||||||
|
if cfg!(target_os = "windows") {
|
||||||
|
let _ = Command::new("cmd")
|
||||||
.args(["/C", "docker desktop start"])
|
.args(["/C", "docker desktop start"])
|
||||||
.output()
|
.output();
|
||||||
.expect("failed to execute process")
|
|
||||||
} else {
|
|
||||||
Command::new("sh")
|
|
||||||
.arg("-c")
|
|
||||||
.arg("echo hello")
|
|
||||||
.output()
|
|
||||||
.expect("failed to execute process")
|
|
||||||
};
|
|
||||||
let _start_docker_desktop = output.stdout;
|
|
||||||
|
|
||||||
cleanup_all_proxy_containers().await.ok();
|
|
||||||
|
|
||||||
let config = match Config::load() {
|
|
||||||
Ok(cfg) => cfg,
|
|
||||||
Err(_) => {
|
|
||||||
eprintln!("Using default configuration");
|
|
||||||
Config::default()
|
|
||||||
}
|
}
|
||||||
};
|
}
|
||||||
|
|
||||||
let paths = DataPaths::new(".")?;
|
/// Shutdown ChromeDriver pool with error handling
|
||||||
|
async fn shutdown_chrome_pool(pool: &ChromeDriverPool) {
|
||||||
// Initialize monitoring system
|
|
||||||
let config_snapshot = ConfigSnapshot {
|
|
||||||
max_parallel_instances: config.max_parallel_instances,
|
|
||||||
max_tasks_per_instance: config.max_tasks_per_instance,
|
|
||||||
enable_vpn_rotation: config.enable_vpn_rotation,
|
|
||||||
max_requests_per_session: config.max_requests_per_session,
|
|
||||||
min_request_interval_ms: config.min_request_interval_ms,
|
|
||||||
max_retry_attempts: config.max_retry_attempts,
|
|
||||||
};
|
|
||||||
|
|
||||||
let (monitoring_handle, _monitoring_task) = init_monitoring(
|
|
||||||
config_snapshot,
|
|
||||||
paths.logs_dir().to_path_buf(),
|
|
||||||
3030, // Dashboard port
|
|
||||||
).await?;
|
|
||||||
|
|
||||||
// Emit pool initialization event
|
|
||||||
monitoring_handle.emit(monitoring::MonitoringEvent::PoolInitialized {
|
|
||||||
pool_size: config.max_parallel_instances,
|
|
||||||
with_proxy: config.enable_vpn_rotation,
|
|
||||||
with_rotation: config.max_tasks_per_instance > 0,
|
|
||||||
});
|
|
||||||
|
|
||||||
logger::log_info("Monitoring dashboard available at http://localhost:3030").await;
|
|
||||||
|
|
||||||
logger::init_debug_logger(paths.logs_dir()).await.ok();
|
|
||||||
logger::log_info("=== Economic Webscraper Started ===").await;
|
|
||||||
logger::log_info(&format!(
|
|
||||||
"Config → parallel_instances: {}, task_limit: {} vpn_rotation: {} proxy_instances_per_certificate: {:?}",
|
|
||||||
config.max_parallel_instances,
|
|
||||||
config.max_tasks_per_instance,
|
|
||||||
config.enable_vpn_rotation,
|
|
||||||
config.proxy_instances_per_certificate
|
|
||||||
)).await;
|
|
||||||
|
|
||||||
let number_proxy_instances_per_certificate = config.proxy_instances_per_certificate.unwrap_or(1);
|
|
||||||
|
|
||||||
|
|
||||||
// Simple shutdown flag
|
|
||||||
let shutdown_flag = Arc::new(AtomicBool::new(false));
|
|
||||||
|
|
||||||
// === Step 1: Fetch VPNBook configs ===
|
|
||||||
let proxy_pool: Option<Arc<DockerVpnProxyPool>> = if config.enable_vpn_rotation {
|
|
||||||
logger::log_info("VPN Rotation Enabled – Fetching latest VPNBook configs").await;
|
|
||||||
|
|
||||||
// Create temp pool and ensure it's properly shut down
|
|
||||||
logger::log_info("Creating temporary ChromeDriver pool for VPN credential fetch...").await;
|
|
||||||
let temp_pool = Arc::new(ChromeDriverPool::new_with_proxy_and_task_limit(
|
|
||||||
None,
|
|
||||||
&config,
|
|
||||||
Some(monitoring_handle.clone())
|
|
||||||
).await?);
|
|
||||||
|
|
||||||
logger::log_info("Fetching VPNBook credentials...").await;
|
|
||||||
let (username, password, _files) = opnv::fetch_vpnbook_configs(&temp_pool, paths.cache_dir()).await?;
|
|
||||||
logger::log_info(&format!("VPNBook credentials → User: {}", username)).await;
|
|
||||||
|
|
||||||
// Properly shutdown temp pool with error handling
|
|
||||||
logger::log_info("Shutting down temporary pool...").await;
|
|
||||||
match temp_pool.shutdown().await {
|
|
||||||
Ok(()) => {
|
|
||||||
logger::log_info("✓ Temporary pool shut down successfully").await;
|
|
||||||
}
|
|
||||||
Err(e) => {
|
|
||||||
logger::log_error(&format!("✗ Temp pool shutdown error: {}", e)).await;
|
|
||||||
// Force-kill as backup
|
|
||||||
#[cfg(target_os = "windows")]
|
|
||||||
{
|
|
||||||
let _ = tokio::process::Command::new("taskkill")
|
|
||||||
.args(["/F", "/IM", "chrome.exe"])
|
|
||||||
.output()
|
|
||||||
.await;
|
|
||||||
let _ = tokio::process::Command::new("taskkill")
|
|
||||||
.args(["/F", "/IM", "chromedriver.exe"])
|
|
||||||
.output()
|
|
||||||
.await;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Wait a moment for cleanup
|
|
||||||
tokio::time::sleep(tokio::time::Duration::from_secs(2)).await;
|
|
||||||
|
|
||||||
let server_count = std::fs::read_dir(paths.cache_openvpn_dir())?
|
|
||||||
.filter(|e| e.as_ref().unwrap().path().is_dir())
|
|
||||||
.count();
|
|
||||||
|
|
||||||
if server_count == 0 {
|
|
||||||
logger::log_warn("No VPN servers found – continuing without VPN").await;
|
|
||||||
None
|
|
||||||
} else {
|
|
||||||
logger::log_info(&format!("Found {} VPN servers – starting Docker proxy containers", server_count)).await;
|
|
||||||
let pp = Arc::new(DockerVpnProxyPool::new(paths.cache_openvpn_dir(), username, password, number_proxy_instances_per_certificate).await?);
|
|
||||||
|
|
||||||
logger::log_info(&format!("All {} Docker proxy containers started and ready", pp.num_proxies())).await;
|
|
||||||
for i in 0..pp.num_proxies() {
|
|
||||||
if let Some(proxy_info) = pp.get_proxy_info(i) {
|
|
||||||
monitoring_handle.emit(monitoring::MonitoringEvent::ProxyConnected {
|
|
||||||
container_name: proxy_info.container_name.clone(),
|
|
||||||
ip_address: proxy_info.ip_address.clone(),
|
|
||||||
port: proxy_info.port,
|
|
||||||
});
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
Some(pp)
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
logger::log_info("VPN rotation disabled – using direct connection").await;
|
|
||||||
None
|
|
||||||
};
|
|
||||||
|
|
||||||
// === Step 2: Initialize ChromeDriver pool ===
|
|
||||||
let pool_size_limit = config.max_parallel_instances;
|
|
||||||
let task_limit = config.max_tasks_per_instance;
|
|
||||||
|
|
||||||
logger::log_info(&format!("Creating ChromeDriver pool with {} instances...", pool_size_limit)).await;
|
|
||||||
|
|
||||||
let pool = Arc::new(
|
|
||||||
if task_limit > 0 {
|
|
||||||
ChromeDriverPool::new_with_proxy_and_task_limit(proxy_pool.clone(), &config, Some(monitoring_handle.clone())).await?
|
|
||||||
} else {
|
|
||||||
ChromeDriverPool::new_with_proxy_and_task_limit(proxy_pool.clone(), &config, Some(monitoring_handle.clone())).await?
|
|
||||||
}
|
|
||||||
);
|
|
||||||
|
|
||||||
logger::log_info(&format!("ChromeDriver pool ready with {} instances", pool_size_limit)).await;
|
|
||||||
|
|
||||||
// === Step 3: Ctrl+C handler ===
|
|
||||||
{
|
|
||||||
let shutdown_flag_clone = Arc::clone(&shutdown_flag);
|
|
||||||
let pool_clone = Arc::clone(&pool);
|
|
||||||
let proxy_clone = proxy_pool.clone();
|
|
||||||
|
|
||||||
tokio::spawn(async move {
|
|
||||||
tokio::signal::ctrl_c().await.ok();
|
|
||||||
logger::log_info("Ctrl+C received – shutting down gracefully...").await;
|
|
||||||
|
|
||||||
// Set flag first
|
|
||||||
shutdown_flag_clone.store(true, Ordering::SeqCst);
|
|
||||||
|
|
||||||
// Wait a bit for tasks to notice
|
|
||||||
tokio::time::sleep(tokio::time::Duration::from_secs(2)).await;
|
|
||||||
|
|
||||||
// ✅ FIXED: Better error handling during shutdown
|
|
||||||
logger::log_info("Shutting down ChromeDriver pool...").await;
|
logger::log_info("Shutting down ChromeDriver pool...").await;
|
||||||
match (&*pool_clone).shutdown().await {
|
match pool.shutdown().await {
|
||||||
Ok(()) => {
|
Ok(()) => logger::log_info("✓ ChromeDriver pool shut down successfully").await,
|
||||||
logger::log_info("✓ ChromeDriver pool shut down successfully").await;
|
Err(e) => logger::log_error(&format!("✗ Pool shutdown error: {}", e)).await,
|
||||||
}
|
|
||||||
Err(e) => {
|
|
||||||
logger::log_error(&format!("✗ Pool shutdown error: {}", e)).await;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if let Some(pp) = proxy_clone {
|
/// Shutdown Docker VPN proxy pool with error handling
|
||||||
|
async fn shutdown_proxy_pool(proxy_pool: &DockerVpnProxyPool) {
|
||||||
logger::log_info("Stopping Docker VPN proxy containers...").await;
|
logger::log_info("Stopping Docker VPN proxy containers...").await;
|
||||||
match pp.shutdown().await {
|
match proxy_pool.shutdown().await {
|
||||||
Ok(()) => {
|
Ok(()) => logger::log_info("✓ All Docker VPN containers stopped").await,
|
||||||
logger::log_info("✓ All Docker VPN containers stopped").await;
|
Err(e) => logger::log_error(&format!("✗ Proxy shutdown error: {}", e)).await,
|
||||||
}
|
|
||||||
Err(e) => {
|
|
||||||
logger::log_error(&format!("✗ Proxy shutdown error: {}", e)).await;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
let _ = cleanup_all_proxy_containers().await;
|
/// Force-kill Chrome and ChromeDriver processes (Windows only)
|
||||||
|
#[cfg(target_os = "windows")]
|
||||||
// ✅ ADDED: Force-kill any remaining Chrome/ChromeDriver processes
|
async fn force_kill_chrome_processes() {
|
||||||
#[cfg(target_os = "windows")]
|
|
||||||
{
|
|
||||||
logger::log_info("Force-killing any remaining Chrome processes...").await;
|
logger::log_info("Force-killing any remaining Chrome processes...").await;
|
||||||
let _ = tokio::process::Command::new("taskkill")
|
let _ = tokio::process::Command::new("taskkill")
|
||||||
.args(["/F", "/IM", "chrome.exe"])
|
.args(["/F", "/IM", "chrome.exe"])
|
||||||
@@ -221,67 +56,16 @@ async fn main() -> Result<()> {
|
|||||||
.args(["/F", "/IM", "chromedriver.exe"])
|
.args(["/F", "/IM", "chromedriver.exe"])
|
||||||
.output()
|
.output()
|
||||||
.await;
|
.await;
|
||||||
}
|
}
|
||||||
|
|
||||||
logger::log_info("Shutdown complete").await;
|
#[cfg(not(target_os = "windows"))]
|
||||||
std::process::exit(0);
|
async fn force_kill_chrome_processes() {
|
||||||
});
|
// No-op on non-Windows platforms
|
||||||
}
|
}
|
||||||
|
|
||||||
// === Step 4: Run scraping jobs ===
|
/// Verify Chrome processes are cleaned up (Windows only)
|
||||||
if !shutdown_flag.load(Ordering::SeqCst) {
|
#[cfg(target_os = "windows")]
|
||||||
logger::log_info("--- Starting ECONOMIC data update ---").await;
|
async fn verify_chrome_cleanup() {
|
||||||
economic::run_full_update(&config, &pool, &shutdown_flag).await?;
|
|
||||||
logger::log_info("Economic update completed").await;
|
|
||||||
}
|
|
||||||
|
|
||||||
if !shutdown_flag.load(Ordering::SeqCst) {
|
|
||||||
logger::log_info("--- Starting CORPORATE data update ---").await;
|
|
||||||
corporate::run_full_update(&config, &pool, &shutdown_flag).await?;
|
|
||||||
logger::log_info("Corporate update completed").await;
|
|
||||||
}
|
|
||||||
|
|
||||||
// === Step 5: Final cleanup ===
|
|
||||||
if !shutdown_flag.load(Ordering::SeqCst) {
|
|
||||||
logger::log_info("Shutting down ChromeDriver pool...").await;
|
|
||||||
match pool.shutdown().await {
|
|
||||||
Ok(()) => {
|
|
||||||
logger::log_info("✓ ChromeDriver pool shut down successfully").await;
|
|
||||||
}
|
|
||||||
Err(e) => {
|
|
||||||
logger::log_error(&format!("✗ Pool shutdown error: {}", e)).await;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if let Some(pp) = proxy_pool {
|
|
||||||
logger::log_info("Stopping Docker VPN proxy containers...").await;
|
|
||||||
match pp.shutdown().await {
|
|
||||||
Ok(()) => {
|
|
||||||
logger::log_info("✓ All Docker VPN containers stopped").await;
|
|
||||||
}
|
|
||||||
Err(e) => {
|
|
||||||
logger::log_error(&format!("✗ Proxy shutdown error: {}", e)).await;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
cleanup_all_proxy_containers().await.ok();
|
|
||||||
}
|
|
||||||
|
|
||||||
// Final force-kill to ensure no leaks
|
|
||||||
#[cfg(target_os = "windows")]
|
|
||||||
{
|
|
||||||
logger::log_info("Final cleanup: force-killing any remaining Chrome processes...").await;
|
|
||||||
tokio::process::Command::new("taskkill")
|
|
||||||
.args(["/F", "/IM", "chrome.exe"])
|
|
||||||
.output()
|
|
||||||
.await
|
|
||||||
.ok();
|
|
||||||
tokio::process::Command::new("taskkill")
|
|
||||||
.args(["/F", "/IM", "chromedriver.exe"])
|
|
||||||
.output()
|
|
||||||
.await
|
|
||||||
.ok();
|
|
||||||
|
|
||||||
// Verify cleanup
|
|
||||||
if let Ok(output) = tokio::process::Command::new("tasklist")
|
if let Ok(output) = tokio::process::Command::new("tasklist")
|
||||||
.args(["/FI", "IMAGENAME eq chrome.exe"])
|
.args(["/FI", "IMAGENAME eq chrome.exe"])
|
||||||
.output()
|
.output()
|
||||||
@@ -296,10 +80,262 @@ async fn main() -> Result<()> {
|
|||||||
logger::log_info("✓ All Chrome processes cleaned up").await;
|
logger::log_info("✓ All Chrome processes cleaned up").await;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(not(target_os = "windows"))]
|
||||||
|
async fn verify_chrome_cleanup() {
|
||||||
|
// No-op on non-Windows platforms
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Complete cleanup sequence: shutdown pools, cleanup containers, kill processes
|
||||||
|
async fn perform_full_cleanup(
|
||||||
|
pool: &ChromeDriverPool,
|
||||||
|
proxy_pool: Option<&DockerVpnProxyPool>,
|
||||||
|
) {
|
||||||
|
shutdown_chrome_pool(pool).await;
|
||||||
|
|
||||||
|
if let Some(pp) = proxy_pool {
|
||||||
|
shutdown_proxy_pool(pp).await;
|
||||||
|
cleanup_all_proxy_containers().await.ok();
|
||||||
}
|
}
|
||||||
|
|
||||||
logger::log_info("=== Application finished successfully ===").await;
|
force_kill_chrome_processes().await;
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Create temporary ChromeDriver pool, fetch VPN credentials, and cleanup
|
||||||
|
async fn fetch_vpn_credentials_with_temp_pool(
|
||||||
|
config: &Config,
|
||||||
|
paths: &DataPaths,
|
||||||
|
monitoring_handle: &monitoring::MonitoringHandle,
|
||||||
|
) -> Result<Option<Arc<DockerVpnProxyPool>>> {
|
||||||
|
logger::log_info("VPN Rotation Enabled – Fetching latest VPNBook configs").await;
|
||||||
|
|
||||||
|
// Create temp pool
|
||||||
|
logger::log_info("Creating temporary ChromeDriver pool for VPN credential fetch...").await;
|
||||||
|
let temp_pool = Arc::new(ChromeDriverPool::new_with_proxy_and_task_limit(
|
||||||
|
None,
|
||||||
|
config,
|
||||||
|
Some(monitoring_handle.clone())
|
||||||
|
).await?);
|
||||||
|
|
||||||
|
// Fetch credentials
|
||||||
|
logger::log_info("Fetching VPNBook credentials...").await;
|
||||||
|
let (username, password, _files) = opnv::fetch_vpnbook_configs(&temp_pool, paths.cache_dir()).await?;
|
||||||
|
logger::log_info(&format!("VPNBook credentials → User: {}", username)).await;
|
||||||
|
|
||||||
|
// Cleanup temp pool
|
||||||
|
logger::log_info("Shutting down temporary pool...").await;
|
||||||
|
match temp_pool.shutdown().await {
|
||||||
|
Ok(()) => logger::log_info("✓ Temporary pool shut down successfully").await,
|
||||||
|
Err(e) => {
|
||||||
|
logger::log_error(&format!("✗ Temp pool shutdown error: {}", e)).await;
|
||||||
|
force_kill_chrome_processes().await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
tokio::time::sleep(tokio::time::Duration::from_secs(2)).await;
|
||||||
|
|
||||||
|
// Count VPN servers and create proxy pool
|
||||||
|
let server_count = std::fs::read_dir(paths.cache_openvpn_dir())?
|
||||||
|
.filter(|e| e.as_ref().unwrap().path().is_dir())
|
||||||
|
.count();
|
||||||
|
|
||||||
|
if server_count == 0 {
|
||||||
|
logger::log_warn("No VPN servers found – continuing without VPN").await;
|
||||||
|
return Ok(None);
|
||||||
|
}
|
||||||
|
|
||||||
|
logger::log_info(&format!("Found {} VPN servers – starting Docker proxy containers", server_count)).await;
|
||||||
|
|
||||||
|
let number_proxy_instances = config.proxy_instances_per_certificate.unwrap_or(1);
|
||||||
|
let proxy_pool = Arc::new(DockerVpnProxyPool::new(
|
||||||
|
paths.cache_openvpn_dir(),
|
||||||
|
username,
|
||||||
|
password,
|
||||||
|
number_proxy_instances
|
||||||
|
).await?);
|
||||||
|
|
||||||
|
logger::log_info(&format!("All {} Docker proxy containers started and ready", proxy_pool.num_proxies())).await;
|
||||||
|
|
||||||
|
// Emit proxy connection events
|
||||||
|
for i in 0..proxy_pool.num_proxies() {
|
||||||
|
if let Some(proxy_info) = proxy_pool.get_proxy_info(i) {
|
||||||
|
monitoring_handle.emit(monitoring::MonitoringEvent::ProxyConnected {
|
||||||
|
container_name: proxy_info.container_name.clone(),
|
||||||
|
ip_address: proxy_info.ip_address.clone(),
|
||||||
|
port: proxy_info.port,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(Some(proxy_pool))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Initialize monitoring system
|
||||||
|
async fn initialize_monitoring(
|
||||||
|
config: &Config,
|
||||||
|
paths: &DataPaths,
|
||||||
|
) -> Result<(monitoring::MonitoringHandle, tokio::task::JoinHandle<()>)> {
|
||||||
|
let config_snapshot = ConfigSnapshot {
|
||||||
|
max_parallel_instances: config.max_parallel_instances,
|
||||||
|
max_tasks_per_instance: config.max_tasks_per_instance,
|
||||||
|
enable_vpn_rotation: config.enable_vpn_rotation,
|
||||||
|
max_requests_per_session: config.max_requests_per_session,
|
||||||
|
min_request_interval_ms: config.min_request_interval_ms,
|
||||||
|
max_retry_attempts: config.max_retry_attempts,
|
||||||
|
};
|
||||||
|
|
||||||
|
let (monitoring_handle, monitoring_task) = init_monitoring(
|
||||||
|
config_snapshot,
|
||||||
|
paths.logs_dir().to_path_buf(),
|
||||||
|
3030,
|
||||||
|
).await?;
|
||||||
|
|
||||||
|
monitoring_handle.emit(monitoring::MonitoringEvent::PoolInitialized {
|
||||||
|
pool_size: config.max_parallel_instances,
|
||||||
|
with_proxy: config.enable_vpn_rotation,
|
||||||
|
with_rotation: config.max_tasks_per_instance > 0,
|
||||||
|
});
|
||||||
|
|
||||||
|
logger::log_info("Monitoring dashboard available at http://localhost:3030").await;
|
||||||
|
|
||||||
|
Ok((monitoring_handle, monitoring_task))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Setup Ctrl+C handler for graceful shutdown
|
||||||
|
fn setup_shutdown_handler(
|
||||||
|
shutdown_flag: Arc<AtomicBool>,
|
||||||
|
pool: Arc<ChromeDriverPool>,
|
||||||
|
proxy_pool: Option<Arc<DockerVpnProxyPool>>,
|
||||||
|
) {
|
||||||
|
tokio::spawn(async move {
|
||||||
|
tokio::signal::ctrl_c().await.ok();
|
||||||
|
logger::log_info("Ctrl+C received – shutting down gracefully...").await;
|
||||||
|
|
||||||
|
shutdown_flag.store(true, Ordering::SeqCst);
|
||||||
|
tokio::time::sleep(tokio::time::Duration::from_secs(2)).await;
|
||||||
|
|
||||||
|
perform_full_cleanup(&pool, proxy_pool.as_deref()).await;
|
||||||
|
|
||||||
|
logger::log_info("Shutdown complete").await;
|
||||||
|
std::process::exit(0);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
fn format_duration(duration: Duration) -> String {
|
||||||
|
let total_seconds = duration.as_secs();
|
||||||
|
|
||||||
|
let days = total_seconds / 86400;
|
||||||
|
let hours = (total_seconds % 86400) / 3600;
|
||||||
|
let minutes = (total_seconds % 3600) / 60;
|
||||||
|
let seconds = total_seconds % 60;
|
||||||
|
|
||||||
|
format!("{:02}::{:02}::{:02}::{:02}", days, hours, minutes, seconds)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn create_state_files(paths: &DataPaths) -> Result<()> {
|
||||||
|
let paths = (
|
||||||
|
paths.data_dir().join("state.jsonl"),
|
||||||
|
paths.cache_dir().join("state.jsonl"),
|
||||||
|
);
|
||||||
|
|
||||||
|
// Use OpenOptions to create the file only if it doesn't exist
|
||||||
|
for path in &[&paths.0, &paths.1] {
|
||||||
|
OpenOptions::new()
|
||||||
|
.create(true) // Create if it doesn't exist
|
||||||
|
.write(true) // Ensure we can write to the file
|
||||||
|
.open(path)?;
|
||||||
|
logger::log_info(&format!("Checked or created file: {}", path.display())).await;
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ============================================================================
|
||||||
|
// MAIN FUNCTION - Simplified with extracted helpers
|
||||||
|
// ============================================================================
|
||||||
|
|
||||||
|
#[tokio::main]
|
||||||
|
async fn main() -> Result<()> {
|
||||||
|
// Initial setup
|
||||||
|
let start = Instant::now();
|
||||||
|
let paths = DataPaths::new(".")?;
|
||||||
|
|
||||||
|
start_docker_desktop().await;
|
||||||
|
cleanup_all_proxy_containers().await.ok();
|
||||||
|
create_state_files(&paths).await.ok();
|
||||||
|
|
||||||
|
let config = Config::load().unwrap_or_else(|_| {
|
||||||
|
eprintln!("Using default configuration");
|
||||||
|
Config::default()
|
||||||
|
});
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
// Initialize monitoring
|
||||||
|
let (monitoring_handle, _monitoring_task) = initialize_monitoring(&config, &paths).await?;
|
||||||
|
|
||||||
|
// Initialize debug logger
|
||||||
|
logger::init_debug_logger(paths.logs_dir()).await.ok();
|
||||||
|
logger::log_info("=== Economic Webscraper Started ===").await;
|
||||||
|
logger::log_info(&format!(
|
||||||
|
"Config → parallel_instances: {}, task_limit: {}, vpn_rotation: {}, proxy_instances_per_certificate: {:?}",
|
||||||
|
config.max_parallel_instances,
|
||||||
|
config.max_tasks_per_instance,
|
||||||
|
config.enable_vpn_rotation,
|
||||||
|
config.proxy_instances_per_certificate
|
||||||
|
)).await;
|
||||||
|
|
||||||
|
let shutdown_flag = Arc::new(AtomicBool::new(false));
|
||||||
|
|
||||||
|
// Fetch VPN credentials and setup proxy pool if enabled
|
||||||
|
let proxy_pool = if config.enable_vpn_rotation {
|
||||||
|
fetch_vpn_credentials_with_temp_pool(&config, &paths, &monitoring_handle).await?
|
||||||
|
} else {
|
||||||
|
logger::log_info("VPN rotation disabled – using direct connection").await;
|
||||||
|
None
|
||||||
|
};
|
||||||
|
|
||||||
|
// Create main ChromeDriver pool
|
||||||
|
logger::log_info(&format!("Creating ChromeDriver pool with {} instances...", config.max_parallel_instances)).await;
|
||||||
|
|
||||||
|
let pool = Arc::new(ChromeDriverPool::new_with_proxy_and_task_limit(
|
||||||
|
proxy_pool.clone(),
|
||||||
|
&config,
|
||||||
|
Some(monitoring_handle.clone())
|
||||||
|
).await?);
|
||||||
|
|
||||||
|
logger::log_info(&format!("ChromeDriver pool ready with {} instances", config.max_parallel_instances)).await;
|
||||||
|
|
||||||
|
// Setup Ctrl+C handler
|
||||||
|
setup_shutdown_handler(
|
||||||
|
Arc::clone(&shutdown_flag),
|
||||||
|
Arc::clone(&pool),
|
||||||
|
proxy_pool.clone(),
|
||||||
|
);
|
||||||
|
|
||||||
|
// Run scraping jobs
|
||||||
|
check_shutdown!(&shutdown_flag);
|
||||||
|
|
||||||
|
logger::log_info("--- Starting ECONOMIC data update ---").await;
|
||||||
|
economic::run_full_update(&config, &pool, &shutdown_flag).await?;
|
||||||
|
logger::log_info("Economic update completed").await;
|
||||||
|
|
||||||
|
check_shutdown!(&shutdown_flag);
|
||||||
|
|
||||||
|
logger::log_info("--- Starting CORPORATE data update ---").await;
|
||||||
|
corporate::run_full_update(&config, &pool, &shutdown_flag).await?;
|
||||||
|
logger::log_info("Corporate update completed").await;
|
||||||
|
|
||||||
|
check_shutdown!(&shutdown_flag);
|
||||||
|
|
||||||
|
// Final cleanup if not already shutting down
|
||||||
|
perform_full_cleanup(&pool, proxy_pool.as_deref()).await;
|
||||||
|
verify_chrome_cleanup().await;
|
||||||
|
|
||||||
|
logger::log_info(&format!("=== Application finished after {} ===", format_duration(start.elapsed()))).await;
|
||||||
|
|
||||||
|
logger::log_info("=== Application finished successfully ===").await;
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
@@ -3,7 +3,7 @@
|
|||||||
macro_rules! check_shutdown {
|
macro_rules! check_shutdown {
|
||||||
($shutdown_flag:expr) => {
|
($shutdown_flag:expr) => {
|
||||||
if $shutdown_flag.load(std::sync::atomic::Ordering::SeqCst) {
|
if $shutdown_flag.load(std::sync::atomic::Ordering::SeqCst) {
|
||||||
logger::log_warn("Shutdown detected, stopping update").await;
|
logger::log_warn("Shutdown detected, stopping processes").await;
|
||||||
return Ok(());
|
return Ok(());
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|||||||
Reference in New Issue
Block a user