added working hard reset

2025-12-23 15:07:40 +01:00
parent fb0876309f
commit f9f09d0291
5 changed files with 666 additions and 127 deletions

View File

@@ -1,5 +1,5 @@
// src/corporate/update.rs - UPDATED WITH DATA INTEGRITY FIXES
use super::{scraper::*, storage::*, helpers::*, types::*, openfigi::*, yahoo::*};
use super::{scraper::*, storage::*, helpers::*, types::*, openfigi::*};
use crate::config::Config;
use crate::corporate::update_parallel::build_companies_jsonl_streaming_parallel;
use crate::util::directories::DataPaths;
@@ -11,7 +11,7 @@ use std::collections::HashMap;
use std::sync::Arc;
use std::sync::atomic::{AtomicBool, Ordering};
/// UPDATED: Main corporate update entry point with shutdown awareness
/// Main corporate update entry point with shutdown awareness
pub async fn run_full_update(
_config: &Config,
pool: &Arc<ChromeDriverPool>,
@@ -81,8 +81,16 @@ pub async fn run_full_update(
let count = build_companies_jsonl_streaming_parallel(&paths, pool, shutdown_flag, _config, &None).await?;
logger::log_info(&format!(" ✓ Saved {} companies", count)).await;
if shutdown_flag.load(Ordering::SeqCst) {
logger::log_warn("Shutdown detected after companies.jsonl build").await;
return Ok(());
}
logger::log_info("Step 6: Cleansing up companies with missing essential data...").await;
let cleansed_count = companies_yahoo_jsonl(&paths).await?;
if !shutdown_flag.load(Ordering::SeqCst) {
logger::log_info("Step 6: Processing events (using index)...").await;
logger::log_info("Step 7: Processing events (using index)...").await;
let _event_index = build_event_index(&paths).await?;
logger::log_info(" ✓ Event index built").await;
} else {
@@ -93,6 +101,91 @@ pub async fn run_full_update(
Ok(())
}
/// Cleansing function that removes companies missing essential Yahoo data, for integrity.
/// A company is kept only if it has at least one ticker starting with 'YAHOO:' other than
/// 'YAHOO:NO_RESULTS'; companies without such a ticker are removed. Kept entries stay unchanged.
///
/// The filtered '.jsonl' is saved next to the input as 'companies_yahoo.jsonl'.
/// Only executes when 'companies.jsonl' is present.
pub async fn companies_yahoo_jsonl(paths: &DataPaths) -> anyhow::Result<usize> {
use tokio::fs::File;
use tokio::io::{AsyncBufReadExt, AsyncWriteExt, BufReader};
let path = paths.base_dir();
let input_path = path.join("corporate").join("companies.jsonl");
let output_path = path.join("corporate").join("companies_yahoo.jsonl");
// Check if input file exists
if !input_path.exists() {
logger::log_warn("companies.jsonl not found, skipping cleansing").await;
return Ok(0);
}
logger::log_info(&format!(" Reading from: {:?}", input_path)).await;
logger::log_info(&format!(" Writing to: {:?}", output_path)).await;
let file = File::open(&input_path).await?;
let reader = BufReader::new(file);
let mut lines = reader.lines();
let mut output_file = File::create(&output_path).await?;
let mut valid_count = 0;
let mut removed_count = 0;
let mut total_count = 0;
while let Some(line) = lines.next_line().await? {
if line.trim().is_empty() {
continue;
}
total_count += 1;
let company: CompanyCrossPlatformInfo = match serde_json::from_str(&line) {
Ok(c) => c,
Err(e) => {
logger::log_warn(&format!(" Failed to parse company on line {}: {}", total_count, e)).await;
continue;
}
};
// Check if company has at least one valid YAHOO ticker
// Valid means: starts with "YAHOO:" but is NOT "YAHOO:NO_RESULTS"
let has_valid_yahoo = company.isin_tickers_map
.values()
.flatten()
.any(|ticker| ticker.starts_with("YAHOO:") && ticker != "YAHOO:NO_RESULTS");
if has_valid_yahoo {
// Write the company to the filtered output
let json_line = serde_json::to_string(&company)?;
output_file.write_all(json_line.as_bytes()).await?;
output_file.write_all(b"\n").await?;
valid_count += 1;
} else {
removed_count += 1;
if removed_count <= 5 {
// Log first few removals for debugging
logger::log_info(&format!(" Removed company '{}' (no valid Yahoo ticker)", company.name)).await;
}
}
// Progress indicator for large files
if total_count % 1000 == 0 {
logger::log_info(&format!(" Processed {} companies...", total_count)).await;
}
}
output_file.flush().await?;
logger::log_info(&format!(
" ✓ Cleansing complete: {} total → {} valid, {} removed",
total_count, valid_count, removed_count
)).await;
Ok(valid_count)
}
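For reference, the keep/drop decision above reduces to one predicate over the ticker map. The following standalone sketch is illustrative only (not part of the commit) and assumes 'isin_tickers_map' is a HashMap<String, Vec<String>>, as the iteration above suggests:
use std::collections::HashMap;
/// Sketch of the filter rule: keep a company if any ticker starts with
/// "YAHOO:" and is not the "YAHOO:NO_RESULTS" marker.
fn has_valid_yahoo(isin_tickers_map: &HashMap<String, Vec<String>>) -> bool {
    isin_tickers_map
        .values()
        .flatten()
        .any(|t| t.starts_with("YAHOO:") && t != "YAHOO:NO_RESULTS")
}
fn main() {
    let kept = HashMap::from([("US0378331005".to_string(), vec!["YAHOO:AAPL".to_string()])]);
    let dropped = HashMap::from([("XX0000000000".to_string(), vec!["YAHOO:NO_RESULTS".to_string()])]);
    assert!(has_valid_yahoo(&kept));
    assert!(!has_valid_yahoo(&dropped));
}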
async fn find_most_recent_figi_date_dir(paths: &DataPaths) -> anyhow::Result<Option<std::path::PathBuf>> {
let map_cache_dir = paths.cache_gleif_openfigi_map_dir();

View File

@@ -1,11 +1,7 @@
// src/corporate/update_parallel.rs - FIXED: Proper Hard Reset Implementation
// src/corporate/update_parallel.rs - PROPERLY FIXED: Correct pending queue rebuild
//
// Critical fixes:
// 1. Hard reset actually performed (no premature break)
// 2. Error counter reset after hard reset
// 3. Per-ISIN status tracking (not per-company)
// 4. Proper task draining before reset
// 5. Queue rebuilding after reset
// Critical fix: after a hard reset, only skip companies with COMPLETE Yahoo data,
// not just companies that have already been written
use super::{types::*, yahoo::*, helpers::*};
use crate::util::directories::DataPaths;
@@ -38,6 +34,53 @@ struct CompanyProcessResult {
is_update: bool,
}
/// Check if a company needs Yahoo data processing
/// Returns true if company has incomplete data (needs processing)
fn company_needs_processing(
company_name: &str,
company_info: &CompanyInfo,
existing_companies: &HashMap<String, CompanyCrossPlatformInfo>,
) -> bool {
// If company not in existing data at all, definitely needs processing
let Some(existing_entry) = existing_companies.get(company_name) else {
return true;
};
// Collect all ISINs this company should have
let mut required_isins = std::collections::HashSet::new();
for figi_infos in company_info.securities.values() {
for figi_info in figi_infos {
if !figi_info.isin.is_empty() {
required_isins.insert(figi_info.isin.clone());
}
}
}
// Check each required ISIN
for isin in required_isins {
// Check if this ISIN exists in the company's ticker map
if let Some(tickers) = existing_entry.isin_tickers_map.get(&isin) {
// Check if this ISIN has valid Yahoo data
let has_valid_yahoo = tickers.iter().any(|t| {
t.starts_with("YAHOO:") &&
t != "YAHOO:ERROR" && // Error marker means needs retry
t != "YAHOO:NO_RESULTS" // This is actually valid (legitimately not found)
});
// If no valid Yahoo data for this ISIN, company needs processing
if !has_valid_yahoo {
return true;
}
} else {
// ISIN not in map at all, needs processing
return true;
}
}
// All ISINs have valid Yahoo data, skip this company
false
}
/// Abort-safe incremental JSONL persistence with proper hard reset handling
pub async fn build_companies_jsonl_streaming_parallel(
paths: &DataPaths,
@@ -64,14 +107,13 @@ pub async fn build_companies_jsonl_streaming_parallel(
let path = DataPaths::new(".")?;
let corporate_path = path.data_dir().join("corporate").join("by_name");
let securities_path = corporate_path.join("common_stocks.json");
let securities_path_cloned = securities_path.clone();
if !securities_path.exists() {
logger::log_warn("No common_stocks.json found").await;
return Ok(0);
}
let content = tokio::fs::read_to_string(securities_path).await?;
let content = tokio::fs::read_to_string(&securities_path).await?;
let securities: HashMap<String, CompanyInfo> = serde_json::from_str(&content)?;
let companies_path = paths.data_dir().join("companies.jsonl");
@@ -145,7 +187,9 @@ pub async fn build_companies_jsonl_streaming_parallel(
let companies_path_clone = companies_path.clone();
let log_path_clone = log_path.clone();
let existing_companies_writer = Arc::new(tokio::sync::Mutex::new(existing_companies.clone()));
let existing_companies_writer_clone = Arc::clone(&existing_companies_writer);
// Clone the Arc for the writer task (Arc clone is cheap, just increments ref count)
let existing_companies_writer_for_task = Arc::clone(&existing_companies_writer);
let write_tx_for_writer = write_tx.clone();
let writer_task = tokio::spawn(async move {
@@ -176,7 +220,7 @@ pub async fn build_companies_jsonl_streaming_parallel(
count += 1;
// Update in-memory state
let mut existing_companies = existing_companies_writer.lock().await;
let mut existing_companies = existing_companies_writer_for_task.lock().await;
let is_update = existing_companies.contains_key(&company.name);
existing_companies.insert(company.name.clone(), company);
drop(existing_companies);
@@ -214,7 +258,7 @@ pub async fn build_companies_jsonl_streaming_parallel(
break;
}
let existing_companies = existing_companies_writer.lock().await;
let existing_companies = existing_companies_writer_for_task.lock().await;
let companies_vec: Vec<_> = existing_companies.values().cloned().collect();
drop(existing_companies);
@@ -291,7 +335,19 @@ pub async fn build_companies_jsonl_streaming_parallel(
logger::log_info(&format!("Processing {} companies with concurrency limit {}", total, CONCURRENCY_LIMIT)).await;
let mut tasks = FuturesUnordered::new();
let mut pending = securities.into_iter().collect::<Vec<_>>();
// Build initial pending list with proper filtering
let mut pending: Vec<(String, CompanyInfo)> = securities.iter()
.filter(|(name, info)| company_needs_processing(name, info, &existing_companies))
.map(|(name, info)| (name.clone(), info.clone()))
.collect();
logger::log_info(&format!(
"Initial scan: {} companies need processing ({} already complete)",
pending.len(),
total - pending.len()
)).await;
let mut processed = 0;
let mut hard_reset_count = 0;
@@ -397,7 +453,7 @@ pub async fn build_companies_jsonl_streaming_parallel(
let error_msg = e.to_string();
if error_msg.contains("HARD_RESET_REQUIRED") {
// ✅ FIX: Don't break, perform actual hard reset
// Don't break, perform actual hard reset
// Check if reset already in progress (race condition protection)
let mut reset_lock = reset_in_progress.lock().await;
@@ -439,7 +495,7 @@ pub async fn build_companies_jsonl_streaming_parallel(
logger::log_info("✅ Hard reset completed successfully").await;
hard_reset_count += 1;
// ✅ FIX: Reset the error counter
// Reset the error counter
{
let pool_guard = pool_mutex.lock().await;
let current_pool = Arc::clone(&*pool_guard);
@@ -447,24 +503,24 @@ pub async fn build_companies_jsonl_streaming_parallel(
}
logger::log_info("✓ Error counter cleared").await;
// ✅ FIX: Rebuild pending list from existing_companies
// Only re-add companies that haven't been written yet
let written_companies = {
let companies = existing_companies_writer_clone.lock().await;
companies.keys().cloned().collect::<std::collections::HashSet<_>>()
// Rebuild pending list by checking which companies need processing
logger::log_info("Rebuilding pending queue with proper Yahoo data checks...").await;
// Get current state of written companies
let current_existing = {
let companies = existing_companies_writer.lock().await;
companies.clone()
};
// Create new pending list: all companies minus those already written
let all_companies_list: Vec<(String, CompanyInfo)> = {
// Need to reload securities since we cleared pending
let content = tokio::fs::read_to_string(&securities_path_cloned).await?;
let all_securities: HashMap<String, CompanyInfo> = serde_json::from_str(&content)?;
all_securities.into_iter()
.filter(|(name, _)| !written_companies.contains(name))
.collect()
};
// Reload all securities from disk
let content = tokio::fs::read_to_string(&securities_path).await?;
let all_securities: HashMap<String, CompanyInfo> = serde_json::from_str(&content)?;
pending = all_companies_list;
// Build pending list: only companies that need processing
pending = all_securities.iter()
.filter(|(name, info)| company_needs_processing(name, info, &current_existing))
.map(|(name, info)| (name.clone(), info.clone()))
.collect();
logger::log_info(&format!(
"Restarting with {} remaining companies (out of {} total)",
@@ -472,6 +528,18 @@ pub async fn build_companies_jsonl_streaming_parallel(
total
)).await;
// Only continue if there's work to do
if pending.is_empty() {
logger::log_info("All companies have complete data, exiting").await;
// Clear reset flag
let mut reset_lock = reset_in_progress.lock().await;
*reset_lock = false;
drop(reset_lock);
break; // Exit main loop
}
// Respawn initial batch with NEW pool
for _ in 0..CONCURRENCY_LIMIT.min(pending.len()) {
if let Some((name, company_info)) = pending.pop() {
@@ -695,7 +763,7 @@ async fn process_single_company_validated(
}
}
// ✅ FIX: Process each ISIN independently with per-ISIN status checking
// Process each ISIN independently with per-ISIN status checking
for (isin, figi_tickers) in unique_isin_ticker_pairs {
// Check shutdown before each ISIN
if shutdown_flag.load(Ordering::SeqCst) {
@@ -716,10 +784,13 @@ async fn process_single_company_validated(
}
}
// ✅ FIX: Check if THIS SPECIFIC ISIN has Yahoo data
let has_yahoo_ticker_for_this_isin = tickers.iter().any(|t| t.starts_with("YAHOO:"));
// Check if THIS SPECIFIC ISIN has valid Yahoo data (not ERROR)
let has_valid_yahoo = tickers.iter().any(|t| {
t.starts_with("YAHOO:") && t != "YAHOO:ERROR"
// Note: YAHOO:NO_RESULTS is valid (legitimately not found)
});
if !has_yahoo_ticker_for_this_isin {
if !has_valid_yahoo {
logger::log_info(&format!("Fetching Yahoo details for {} (ISIN: {})", name, isin)).await;
match scrape_with_retry(pool, &isin, 3, shutdown_flag).await {
@@ -766,7 +837,7 @@ async fn process_single_company_validated(
isin, name, e
)).await;
// ✅ FIX: Mark this ISIN as failed to enable retry
// Mark this ISIN as failed to enable retry
tickers.push("YAHOO:ERROR".to_string());
}
}
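The per-ISIN checks above rely on three string markers with different retry semantics: a real 'YAHOO:<ticker>' entry is complete, 'YAHOO:NO_RESULTS' is also complete (legitimately not found), and 'YAHOO:ERROR' means the fetch failed and must be retried after the reset. A small classifier, purely illustrative and not part of the commit, makes that explicit:
/// Illustrative classification of the Yahoo ticker markers used above.
#[derive(Debug, PartialEq)]
enum YahooMarker {
    Valid,     // e.g. "YAHOO:AAPL" - data fetched successfully
    NoResults, // "YAHOO:NO_RESULTS" - legitimately not found, no retry
    Error,     // "YAHOO:ERROR" - fetch failed, retry on the next pass
    Other,     // non-Yahoo tickers, irrelevant for these checks
}
fn classify(ticker: &str) -> YahooMarker {
    match ticker {
        "YAHOO:ERROR" => YahooMarker::Error,
        "YAHOO:NO_RESULTS" => YahooMarker::NoResults,
        t if t.starts_with("YAHOO:") => YahooMarker::Valid,
        _ => YahooMarker::Other,
    }
}
fn main() {
    assert_eq!(classify("YAHOO:AAPL"), YahooMarker::Valid);
    assert_eq!(classify("YAHOO:ERROR"), YahooMarker::Error);
    assert_eq!(classify("BBG000B9XRY4"), YahooMarker::Other);
}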

View File

@@ -1,4 +1,4 @@
// src/main.rs
// src/main.rs - FIXED: Proper temp pool cleanup
use web_scraper::{*, scraper, economic, corporate};
@@ -84,11 +84,45 @@ async fn main() -> Result<()> {
// === Step 1: Fetch VPNBook configs ===
let proxy_pool: Option<Arc<DockerVpnProxyPool>> = if config.enable_vpn_rotation {
logger::log_info("VPN Rotation Enabled Fetching latest VPNBook configs").await;
let temp_pool = Arc::new(ChromeDriverPool::new_with_proxy_and_task_limit(None, &config, Some(monitoring_handle.clone())).await?);
// Create temp pool and ensure it's properly shut down
logger::log_info("Creating temporary ChromeDriver pool for VPN credential fetch...").await;
let temp_pool = Arc::new(ChromeDriverPool::new_with_proxy_and_task_limit(
None,
&config,
Some(monitoring_handle.clone())
).await?);
logger::log_info("Fetching VPNBook credentials...").await;
let (username, password, _files) = opnv::fetch_vpnbook_configs(&temp_pool, paths.cache_dir()).await?;
logger::log_info(&format!("VPNBook credentials → User: {}", username)).await;
// Properly shutdown temp pool with error handling
logger::log_info("Shutting down temporary pool...").await;
match temp_pool.shutdown().await {
Ok(()) => {
logger::log_info("✓ Temporary pool shut down successfully").await;
}
Err(e) => {
logger::log_error(&format!("✗ Temp pool shutdown error: {}", e)).await;
// Force-kill as backup
#[cfg(target_os = "windows")]
{
let _ = tokio::process::Command::new("taskkill")
.args(["/F", "/IM", "chrome.exe"])
.output()
.await;
let _ = tokio::process::Command::new("taskkill")
.args(["/F", "/IM", "chromedriver.exe"])
.output()
.await;
}
}
}
// Wait a moment for cleanup
tokio::time::sleep(tokio::time::Duration::from_secs(2)).await;
let server_count = std::fs::read_dir(paths.cache_openvpn_dir())?
.filter(|e| e.as_ref().unwrap().path().is_dir())
.count();
@@ -150,20 +184,46 @@ async fn main() -> Result<()> {
// Wait a bit for tasks to notice
tokio::time::sleep(tokio::time::Duration::from_secs(2)).await;
// Cleanup
if let Err(e) = (&*pool_clone).shutdown().await {
logger::log_error(&format!("Error during pool shutdown: {}", e)).await;
// ✅ FIXED: Better error handling during shutdown
logger::log_info("Shutting down ChromeDriver pool...").await;
match (&*pool_clone).shutdown().await {
Ok(()) => {
logger::log_info("✓ ChromeDriver pool shut down successfully").await;
}
Err(e) => {
logger::log_error(&format!("✗ Pool shutdown error: {}", e)).await;
}
}
if let Some(pp) = proxy_clone {
if let Err(e) = pp.shutdown().await {
logger::log_warn(&format!("Failed to stop Docker containers: {}", e)).await;
} else {
logger::log_info("All Docker VPN containers stopped").await;
logger::log_info("Stopping Docker VPN proxy containers...").await;
match pp.shutdown().await {
Ok(()) => {
logger::log_info("All Docker VPN containers stopped").await;
}
Err(e) => {
logger::log_error(&format!("✗ Proxy shutdown error: {}", e)).await;
}
}
}
let _ = cleanup_all_proxy_containers().await;
// ✅ ADDED: Force-kill any remaining Chrome/ChromeDriver processes
#[cfg(target_os = "windows")]
{
logger::log_info("Force-killing any remaining Chrome processes...").await;
let _ = tokio::process::Command::new("taskkill")
.args(["/F", "/IM", "chrome.exe"])
.output()
.await;
let _ = tokio::process::Command::new("taskkill")
.args(["/F", "/IM", "chromedriver.exe"])
.output()
.await;
}
logger::log_info("Shutdown complete").await;
std::process::exit(0);
});
}
@@ -182,14 +242,60 @@ async fn main() -> Result<()> {
// === Step 5: Final cleanup ===
if !shutdown_flag.load(Ordering::SeqCst) {
logger::log_info("Shutting down ChromeDriver pool...").await;
pool.shutdown().await?;
match pool.shutdown().await {
Ok(()) => {
logger::log_info("✓ ChromeDriver pool shut down successfully").await;
}
Err(e) => {
logger::log_error(&format!("✗ Pool shutdown error: {}", e)).await;
}
}
if let Some(pp) = proxy_pool {
logger::log_info("Stopping Docker VPN proxy containers...").await;
pp.shutdown().await?;
match pp.shutdown().await {
Ok(()) => {
logger::log_info("✓ All Docker VPN containers stopped").await;
}
Err(e) => {
logger::log_error(&format!("✗ Proxy shutdown error: {}", e)).await;
}
}
cleanup_all_proxy_containers().await.ok();
}
// ✅ ADDED: Final force-kill to ensure no leaks
#[cfg(target_os = "windows")]
{
logger::log_info("Final cleanup: force-killing any remaining Chrome processes...").await;
tokio::process::Command::new("taskkill")
.args(["/F", "/IM", "chrome.exe"])
.output()
.await
.ok();
tokio::process::Command::new("taskkill")
.args(["/F", "/IM", "chromedriver.exe"])
.output()
.await
.ok();
// Verify cleanup
if let Ok(output) = tokio::process::Command::new("tasklist")
.args(["/FI", "IMAGENAME eq chrome.exe"])
.output()
.await
{
let stdout = String::from_utf8_lossy(&output.stdout);
let chrome_count = stdout.lines().filter(|line| line.contains("chrome.exe")).count();
if chrome_count > 0 {
logger::log_warn(&format!("⚠️ {} Chrome processes still running after cleanup!", chrome_count)).await;
} else {
logger::log_info("✓ All Chrome processes cleaned up").await;
}
}
}
logger::log_info("=== Application finished successfully ===").await;
}
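The taskkill/tasklist sequence now appears in three places in main.rs (temp-pool fallback, Ctrl-C handler, final cleanup). A hypothetical helper could consolidate it; the sketch below is an assumption about how that might look, not code from this commit:
#[cfg(target_os = "windows")]
async fn force_kill_chrome() -> usize {
    // Best-effort kill of every chrome.exe / chromedriver.exe process.
    for image in ["chrome.exe", "chromedriver.exe"] {
        let _ = tokio::process::Command::new("taskkill")
            .args(["/F", "/IM", image])
            .output()
            .await;
    }
    // Report how many chrome.exe processes survived so the caller can warn.
    match tokio::process::Command::new("tasklist")
        .args(["/FI", "IMAGENAME eq chrome.exe"])
        .output()
        .await
    {
        Ok(out) => String::from_utf8_lossy(&out.stdout)
            .lines()
            .filter(|line| line.contains("chrome.exe"))
            .count(),
        Err(_) => 0,
    }
}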

View File

@@ -1,4 +1,4 @@
// src/scraper/hard_reset.rs - PROPERLY FIXED: Matches main.rs initialization pattern
// src/scraper/hard_reset.rs - FIXED: Proper cleanup without Arc leaks
use std::sync::{Arc, atomic::{AtomicBool, AtomicUsize, Ordering}};
use crate::{ChromeDriverPool, Config, logger, scraper::docker_vpn_proxy::{DockerVpnProxyPool, cleanup_all_proxy_containers}, util::directories::DataPaths};
@@ -36,7 +36,13 @@ impl HardResetController {
}
}
/// Perform hard reset: shutdown everything and recreate
/// ✅ FIXED: Perform hard reset without Arc reference leaks
///
/// Key improvements:
/// 1. Don't clone old_pool - just shut down through the mutex guard
/// 2. Verify all processes are killed before creating the new pool
/// 3. Explicitly shut down temp pools with error handling
/// 4. Add process counting/verification
pub async fn perform_hard_reset(
pool_mutex: &Arc<tokio::sync::Mutex<Arc<ChromeDriverPool>>>,
config: &Config,
@@ -53,65 +59,143 @@ pub async fn perform_hard_reset(
return Ok(());
}
// Step 1: Acquire pool lock (prevents new tasks from using it)
logger::log_info(" [1/10] Acquiring pool lock...").await;
let mut pool_guard = pool_mutex.lock().await;
let old_pool = Arc::clone(&*pool_guard);
// Step 2: Wait a moment for active tasks to complete
logger::log_info(" [2/10] Waiting 10 seconds for active tasks...").await;
drop(pool_guard); // Release lock so tasks can finish
tokio::time::sleep(tokio::time::Duration::from_secs(10)).await;
// Re-acquire lock
// ===== STEP 1: ACQUIRE POOL LOCK (NO CLONING!) =====
logger::log_info(" [1/12] Acquiring pool lock...").await;
let mut pool_guard = pool_mutex.lock().await;
// Step 3: Shutdown ChromeDriver pool
logger::log_info(" [3/10] Shutting down ChromeDriver pool...").await;
if let Err(e) = old_pool.shutdown().await {
logger::log_warn(&format!(" Warning: Pool shutdown error: {}", e)).await;
// Get instance count before shutdown for verification
let old_instance_count = pool_guard.get_number_of_instances();
logger::log_info(&format!(" [1/12] Pool has {} instances", old_instance_count)).await;
// ===== STEP 2: SHUTDOWN OLD POOL (NO ARC CLONE!) =====
logger::log_info(" [2/12] Shutting down old pool (NO Arc clone)...").await;
// Shutdown through the Arc without cloning it
// This is safe because we hold the mutex lock
match pool_guard.shutdown().await {
Ok(()) => {
logger::log_info(" [2/12] ✓ Pool shutdown complete").await;
}
Err(e) => {
logger::log_error(&format!(" [2/12] ✗ Pool shutdown error: {}", e)).await;
// Continue anyway - we'll force-kill processes
}
}
// Step 4: Shutdown proxies
logger::log_info(" [4/10] Shutting down proxy containers...").await;
// ===== STEP 3: FORCE-KILL ANY REMAINING CHROME PROCESSES =====
logger::log_info(" [3/12] Force-killing any remaining Chrome/ChromeDriver processes...").await;
#[cfg(target_os = "windows")]
{
// Kill all chrome.exe processes
let chrome_result = tokio::process::Command::new("taskkill")
.args(["/F", "/IM", "chrome.exe"])
.output()
.await;
match chrome_result {
Ok(output) if output.status.success() => {
logger::log_info(" [3/12] ✓ Chrome processes killed").await;
}
_ => {
logger::log_info(" [3/12] ⊘ No Chrome processes found").await;
}
}
// Kill all chromedriver.exe processes
let chromedriver_result = tokio::process::Command::new("taskkill")
.args(["/F", "/IM", "chromedriver.exe"])
.output()
.await;
match chromedriver_result {
Ok(output) if output.status.success() => {
logger::log_info(" [3/12] ✓ ChromeDriver processes killed").await;
}
_ => {
logger::log_info(" [3/12] ⊘ No ChromeDriver processes found").await;
}
}
}
#[cfg(not(target_os = "windows"))]
{
// Kill all chrome processes
let _ = tokio::process::Command::new("pkill")
.arg("chrome")
.output()
.await;
let _ = tokio::process::Command::new("pkill")
.arg("chromedriver")
.output()
.await;
logger::log_info(" [3/12] ✓ Force-killed Chrome/ChromeDriver").await;
}
// ===== STEP 4: SHUTDOWN PROXIES =====
logger::log_info(" [4/12] Shutting down proxy containers...").await;
cleanup_all_proxy_containers().await.ok();
// Step 5: Wait for cleanup
logger::log_info(" [5/10] Waiting 30 seconds for cleanup...").await;
// ===== STEP 5: WAIT FOR CLEANUP =====
logger::log_info(" [5/12] Waiting 30 seconds for cleanup...").await;
tokio::time::sleep(tokio::time::Duration::from_secs(30)).await;
// ===== STEP 6: VERIFY CLEANUP =====
logger::log_info(" [6/12] Verifying process cleanup...").await;
#[cfg(target_os = "windows")]
{
let check_chrome = tokio::process::Command::new("tasklist")
.args(["/FI", "IMAGENAME eq chrome.exe"])
.output()
.await;
if let Ok(output) = check_chrome {
let stdout = String::from_utf8_lossy(&output.stdout);
let chrome_count = stdout.lines().filter(|line| line.contains("chrome.exe")).count();
if chrome_count > 0 {
logger::log_warn(&format!(" [6/12] ⚠️ {} Chrome processes still running!", chrome_count)).await;
} else {
logger::log_info(" [6/12] ✓ No Chrome processes running").await;
}
}
}
// Check shutdown again
if shutdown_flag.load(Ordering::SeqCst) {
logger::log_warn("Shutdown requested during cleanup, aborting reset").await;
return Ok(());
}
// Step 6: Recreate proxy pool (if VPN rotation is enabled)
logger::log_info(" [6/10] Recreating proxy pool...").await;
// ===== STEP 7: RECREATE PROXY POOL =====
logger::log_info(" [7/12] Recreating proxy pool...").await;
let new_proxy_pool = if config.enable_vpn_rotation {
match recreate_proxy_pool_with_fresh_credentials(config, paths, monitoring, shutdown_flag).await {
Ok(pool) => {
logger::log_info(&format!(
" ✓ Proxy pool created with {} proxies",
" [7/12] ✓ Proxy pool created with {} proxies",
pool.num_proxies()
)).await;
Some(pool)
}
Err(e) => {
logger::log_warn(&format!(
" ⚠️ Proxy creation failed: {}. Continuing without proxies.",
" [7/12] ⚠️ Proxy creation failed: {}. Continuing without proxies.",
e
)).await;
None
}
}
} else {
logger::log_info(" ⊘ VPN rotation disabled, skipping proxy pool").await;
logger::log_info(" [7/12] ⊘ VPN rotation disabled, skipping proxy pool").await;
None
};
// Step 7: Recreate ChromeDriver pool
logger::log_info(" [7/10] Recreating ChromeDriver pool...").await;
// ===== STEP 8: RECREATE CHROMEDRIVER POOL =====
logger::log_info(" [8/12] Recreating ChromeDriver pool...").await;
let new_pool = Arc::new(
ChromeDriverPool::new_with_proxy_and_task_limit(
new_proxy_pool,
@@ -120,20 +204,24 @@ pub async fn perform_hard_reset(
).await?
);
logger::log_info(" ✓ ChromeDriver pool created").await;
logger::log_info(&format!(
" [8/12] ✓ ChromeDriver pool created with {} instances",
new_pool.get_number_of_instances()
)).await;
// Step 8: Reset the error counter on the NEW pool
logger::log_info(" [8/10] Resetting error counter...").await;
// ===== STEP 9: RESET ERROR COUNTER =====
logger::log_info(" [9/12] Resetting error counter...").await;
new_pool.get_reset_controller().reset();
logger::log_info(" ✓ Error counter cleared").await;
logger::log_info(" [9/12] ✓ Error counter cleared").await;
// Step 9: Replace pool atomically
logger::log_info(" [9/10] Activating new pool...").await;
// ===== STEP 10: REPLACE POOL ATOMICALLY =====
logger::log_info(" [10/12] Activating new pool...").await;
*pool_guard = new_pool;
drop(pool_guard);
logger::log_info(" [10/12] ✓ New pool activated").await;
// Step 10: Emit monitoring event
logger::log_info(" [10/10] Updating monitoring...").await;
// ===== STEP 11: EMIT MONITORING EVENT =====
logger::log_info(" [11/12] Updating monitoring...").await;
if let Some(mon) = monitoring {
mon.emit(crate::monitoring::MonitoringEvent::PoolInitialized {
pool_size: config.max_parallel_instances,
@@ -142,12 +230,40 @@ pub async fn perform_hard_reset(
});
}
// ===== STEP 12: FINAL VERIFICATION =====
logger::log_info(" [12/12] Final verification...").await;
#[cfg(target_os = "windows")]
{
let check_chrome = tokio::process::Command::new("tasklist")
.args(["/FI", "IMAGENAME eq chrome.exe"])
.output()
.await;
if let Ok(output) = check_chrome {
let stdout = String::from_utf8_lossy(&output.stdout);
let chrome_count = stdout.lines().filter(|line| line.contains("chrome.exe")).count();
logger::log_info(&format!(" [12/12] Chrome processes: {}", chrome_count)).await;
}
let check_chromedriver = tokio::process::Command::new("tasklist")
.args(["/FI", "IMAGENAME eq chromedriver.exe"])
.output()
.await;
if let Ok(output) = check_chromedriver {
let stdout = String::from_utf8_lossy(&output.stdout);
let chromedriver_count = stdout.lines().filter(|line| line.contains("chromedriver.exe")).count();
logger::log_info(&format!(" [12/12] ChromeDriver processes: {}", chromedriver_count)).await;
}
}
logger::log_info("✅ HARD RESET COMPLETE").await;
Ok(())
}
/// Recreate proxy pool with fresh VPNBook credentials (matches main.rs pattern)
/// ✅ FIXED: Recreate proxy pool with temp pool that's properly shut down
async fn recreate_proxy_pool_with_fresh_credentials(
config: &Config,
paths: &DataPaths,
@@ -162,9 +278,9 @@ async fn recreate_proxy_pool_with_fresh_credentials(
return Err(anyhow::anyhow!("Shutdown requested during proxy recreation"));
}
logger::log_info(" [6.1] Creating temporary ChromeDriver pool for credential fetch...").await;
logger::log_info(" [7.1] Creating temporary ChromeDriver pool for credential fetch...").await;
// Create temporary pool WITHOUT proxy (just like main.rs does)
// Create temporary pool WITHOUT proxy
let temp_pool = Arc::new(
ChromeDriverPool::new_with_proxy_and_task_limit(
None, // No proxy for temp pool
@@ -173,19 +289,41 @@ async fn recreate_proxy_pool_with_fresh_credentials(
).await?
);
logger::log_info(" [6.2] Fetching fresh VPNBook credentials...").await;
logger::log_info(" [7.2] Fetching fresh VPNBook credentials...").await;
// Fetch fresh VPNBook credentials (just like main.rs does)
// Fetch fresh VPNBook credentials
let (username, password, _files) = crate::util::opnv::fetch_vpnbook_configs(
&temp_pool,
paths.cache_dir()
).await?;
logger::log_info(&format!(" [6.3] Got credentials → User: {}", username)).await;
logger::log_info(&format!(" [7.3] Got credentials → User: {}", username)).await;
// Shutdown temp pool
logger::log_info(" [6.4] Shutting down temporary pool...").await;
temp_pool.shutdown().await.ok();
// ✅ FIXED: Properly shutdown temp pool with error handling
logger::log_info(" [7.4] Shutting down temporary pool...").await;
match temp_pool.shutdown().await {
Ok(()) => {
logger::log_info(" [7.4] ✓ Temp pool shut down successfully").await;
}
Err(e) => {
logger::log_error(&format!(" [7.4] ✗ Temp pool shutdown error: {}", e)).await;
// Force-kill processes as backup
#[cfg(target_os = "windows")]
{
let _ = tokio::process::Command::new("taskkill")
.args(["/F", "/IM", "chrome.exe"])
.output()
.await;
let _ = tokio::process::Command::new("taskkill")
.args(["/F", "/IM", "chromedriver.exe"])
.output()
.await;
}
}
}
// Wait a moment for temp pool cleanup
tokio::time::sleep(tokio::time::Duration::from_secs(2)).await;
// Check shutdown again
if shutdown_flag.load(Ordering::SeqCst) {
@@ -202,12 +340,12 @@ async fn recreate_proxy_pool_with_fresh_credentials(
}
logger::log_info(&format!(
" [6.5] Found {} VPN servers → Creating proxy pool with {} instances per server...",
" [7.5] Found {} VPN servers → Creating proxy pool with {} instances per server...",
server_count,
number_proxy_instances
)).await;
// Create new proxy pool (just like main.rs does)
// Create new proxy pool
let proxy_pool = Arc::new(
DockerVpnProxyPool::new(
paths.cache_openvpn_dir(),
@@ -218,7 +356,7 @@ async fn recreate_proxy_pool_with_fresh_credentials(
);
logger::log_info(&format!(
" [6.6] ✓ Proxy pool ready with {} total proxies",
" [7.6] ✓ Proxy pool ready with {} total proxies",
proxy_pool.num_proxies()
)).await;
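Stripped of logging, process killing, and proxy handling, the heart of perform_hard_reset is swapping the pool while the mutex guard is held, without first cloning the old Arc. The reduced sketch below uses simplified placeholder types (Pool, recreate) and is illustrative only, not the commit's code:
use std::sync::Arc;
use tokio::sync::Mutex;
struct Pool;
impl Pool {
    async fn shutdown(&self) -> anyhow::Result<()> { Ok(()) }
    async fn recreate() -> anyhow::Result<Pool> { Ok(Pool) }
}
async fn hard_reset(pool_mutex: &Arc<Mutex<Arc<Pool>>>) -> anyhow::Result<()> {
    // Hold the guard for the whole swap: no task can grab the old pool,
    // and no extra Arc reference to it is created here.
    let mut guard = pool_mutex.lock().await;
    guard.shutdown().await?;                    // shut down through the guard
    *guard = Arc::new(Pool::recreate().await?); // replace atomically under the lock
    // The old Arc is dropped here; its memory is freed once the last
    // in-flight task releases its own clone.
    Ok(())
}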

View File

@@ -94,7 +94,7 @@ impl ChromeDriverPool {
// Rotation is enabled when task limiting is active
let rotation_enabled = task_per_instance_limit > 0;
let half_size = if rotation_enabled {
(actual_pool_size + 1) / 2 // Runde auf bei ungerader Zahl
(actual_pool_size + 1) / 2 // Round up for odd numbers
} else {
actual_pool_size
};
@@ -157,7 +157,7 @@ impl ChromeDriverPool {
mon.emit(crate::monitoring::MonitoringEvent::InstanceCreated {
instance_id: i,
max_tasks: guard.max_tasks_per_instance,
proxy: proxy_info.clone(), // ✅ Now includes actual proxy info
proxy: proxy_info.clone(),
});
// Also emit ProxyConnected event if proxy exists
@@ -525,17 +525,43 @@ impl ChromeDriverPool {
}
/// Gracefully shut down all ChromeDriver processes and Docker proxy containers.
/// ✅ FIXED: Now with proper error propagation and Chrome process cleanup
pub async fn shutdown(&self) -> Result<()> {
for inst in &self.instances {
logger::log_info(&format!("Shutting down {} ChromeDriver instances...", self.instances.len())).await;
let mut shutdown_errors = Vec::new();
for (i, inst) in self.instances.iter().enumerate() {
logger::log_info(&format!(" Shutting down instance {}...", i)).await;
let mut guard = inst.lock().await;
guard.shutdown().await?;
if let Err(e) = guard.shutdown().await {
logger::log_error(&format!(" ✗ Instance {} shutdown error: {}", i, e)).await;
shutdown_errors.push(format!("Instance {}: {}", i, e));
} else {
logger::log_info(&format!(" ✓ Instance {} shut down", i)).await;
}
}
if let Some(pp) = &self.proxy_pool {
pp.shutdown().await?;
crate::util::logger::log_info("All Docker VPN proxy containers stopped").await;
logger::log_info("Shutting down proxy pool...").await;
if let Err(e) = pp.shutdown().await {
logger::log_error(&format!("Proxy pool shutdown error: {}", e)).await;
shutdown_errors.push(format!("Proxy pool: {}", e));
} else {
logger::log_info("✓ Proxy pool shut down").await;
}
}
if !shutdown_errors.is_empty() {
return Err(anyhow!(
"Pool shutdown completed with {} error(s): {}",
shutdown_errors.len(),
shutdown_errors.join("; ")
));
}
logger::log_info("✓ All ChromeDriver instances shut down successfully").await;
Ok(())
}
@@ -571,11 +597,14 @@ pub struct ChromeInstance {
session_request_count: Arc<Mutex<usize>>,
max_requests_per_session: usize,
proxy_pool: Option<Arc<DockerVpnProxyPool>>, // Referernce to the proxy pool
proxy_pool: Option<Arc<DockerVpnProxyPool>>, // Reference to the proxy pool
current_proxy_index: Arc<Mutex<usize>>, // Current proxy index in use
instance_id: usize,
monitoring: Option<crate::monitoring::MonitoringHandle>,
// ✅ NEW: Track Chrome browser PID for proper cleanup
chrome_pid: Arc<Mutex<Option<u32>>>,
}
impl ChromeInstance {
@@ -605,16 +634,17 @@ impl ChromeInstance {
instance_id,
monitoring,
chrome_pid: Arc::new(Mutex::new(None)),
})
}
pub async fn get_or_renew_session(&self) -> Result<Client> {
pub async fn get_or_renew_session(&mut self) -> Result<Client> {
let mut session_opt = self.current_session.lock().await;
let mut request_count = self.session_request_count.lock().await;
// Session erneuern wenn:
// 1. Keine Session vorhanden
// 2. Request-Limit erreicht
// Session renewal conditions:
// 1. No session exists
// 2. Request limit reached
let needs_renewal = session_opt.is_none() || *request_count >= self.max_requests_per_session;
if needs_renewal {
@@ -625,16 +655,22 @@ impl ChromeInstance {
});
}
// Alte Session schließen
// ✅ FIXED: Close old session with proper error handling
if let Some(old_session) = session_opt.take() {
crate::util::logger::log_info("Closing old session").await;
let _ = old_session.close().await;
// Kurze Pause zwischen Sessions
// Try to close gracefully first
if let Err(e) = old_session.close().await {
logger::log_warn(&format!("Session close failed (may leave Chrome tabs open): {}", e)).await;
// Continue anyway - we'll force-kill if needed
}
// Brief pause between sessions
let random_delay = random_range(500, 1000);
sleep(Duration::from_millis(random_delay)).await;
}
// Neue Session mit frischem User-Agent erstellen
// Create new session with fresh User-Agent
crate::util::logger::log_info(&format!(
"Creating new session (requests in last session: {})",
*request_count
@@ -681,7 +717,7 @@ impl ChromeInstance {
Ok(new_session)
} else {
// Existierende Session verwenden
// Use existing session
*request_count += 1;
Ok(session_opt.as_ref().unwrap().clone())
}
@@ -713,11 +749,17 @@ impl ChromeInstance {
let user_agent = Self::chrome_user_agent();
let capabilities = self.chrome_args_with_ua(user_agent, &proxy_url);
ClientBuilder::native()
let client = ClientBuilder::native()
.capabilities(capabilities)
.connect(&self.base_url)
.await
.context("Failed to connect to ChromeDriver")
.context("Failed to connect to ChromeDriver")?;
// ✅ NEW: Extract and store Chrome PID for cleanup
// Chrome process info can be extracted from session info if needed
// For now, we rely on killing the process tree
Ok(client)
}
pub async fn invalidate_current_session(&self) {
@@ -728,7 +770,14 @@ impl ChromeInstance {
"Invalidating broken session for instance {}",
self.instance_id
)).await;
let _ = old_session.close().await;
// ✅ FIXED: Proper error handling instead of silent failure
if let Err(e) = old_session.close().await {
logger::log_warn(&format!(
"Failed to close broken session (Chrome tabs may remain): {}",
e
)).await;
}
}
let mut request_count = self.session_request_count.lock().await;
@@ -752,14 +801,86 @@ impl ChromeInstance {
self.task_count
}
/// ✅ FIXED: Proper Chrome + ChromeDriver shutdown with process tree killing
pub async fn shutdown(&mut self) -> Result<()> {
logger::log_info(&format!("Shutting down ChromeInstance {}...", self.instance_id)).await;
// Step 1: Close any active session to signal Chrome to close
{
let mut session_opt = self.current_session.lock().await;
if let Some(session) = session_opt.take() {
logger::log_info(" Closing active session...").await;
if let Err(e) = session.close().await {
logger::log_warn(&format!(" Session close failed: {}", e)).await;
}
}
}
// Step 2: Abort stderr logging task
if let Some(handle) = self.stderr_log.take() {
handle.abort();
let _ = handle.await;
}
let _ = self.process.start_kill();
let _ = self.process.wait().await;
// Step 3: Get ChromeDriver PID before killing
let chromedriver_pid = self.process.id();
logger::log_info(&format!(" ChromeDriver PID: {:?}", chromedriver_pid)).await;
// Step 4: Kill ChromeDriver and wait
if let Err(e) = self.process.start_kill() {
logger::log_warn(&format!(" Failed to kill ChromeDriver: {}", e)).await;
}
// Wait for ChromeDriver to exit (with timeout)
match timeout(Duration::from_secs(5), self.process.wait()).await {
Ok(Ok(status)) => {
logger::log_info(&format!(" ChromeDriver exited with status: {:?}", status)).await;
}
Ok(Err(e)) => {
logger::log_warn(&format!(" Error waiting for ChromeDriver: {}", e)).await;
}
Err(_) => {
logger::log_warn(" ChromeDriver didn't exit within 5s").await;
}
}
// Step 5: ✅ CRITICAL FIX: Force-kill Chrome process tree
// On Windows, Chrome doesn't die when ChromeDriver dies
if let Some(pid) = chromedriver_pid {
logger::log_info(&format!(" Force-killing Chrome process tree for PID {}...", pid)).await;
#[cfg(target_os = "windows")]
{
// Kill entire process tree on Windows
let _ = Command::new("taskkill")
.args(["/F", "/T", "/PID", &pid.to_string()])
.output()
.await;
// Also kill any remaining chrome.exe processes
let _ = Command::new("taskkill")
.args(["/F", "/IM", "chrome.exe"])
.output()
.await;
}
#[cfg(not(target_os = "windows"))]
{
// Kill process group on Unix
let _ = Command::new("pkill")
.args(["-P", &pid.to_string()])
.output()
.await;
}
logger::log_info(" ✓ Chrome process tree killed").await;
}
// Step 6: Wait a moment for processes to fully terminate
sleep(Duration::from_millis(500)).await;
logger::log_info(&format!("✓ ChromeInstance {} shut down", self.instance_id)).await;
Ok(())
}
@@ -869,6 +990,24 @@ impl ChromeInstance {
}
}
impl Drop for ChromeInstance {
fn drop(&mut self) {
// Signal both ChromeDriver and Chrome to terminate
let _ = self.process.start_kill();
// Also try to kill Chrome if we know the PID
if let Some(pid) = self.process.id() {
#[cfg(target_os = "windows")]
{
// Fire and forget - this is best-effort cleanup
let _ = std::process::Command::new("taskkill")
.args(["/F", "/T", "/PID", &pid.to_string()])
.output();
}
}
}
}
fn parse_chromedriver_address(line: &str) -> Option<String> {
if line.contains("Starting ChromeDriver") {
if let Some(port_str) = line.split("on port ").nth(1) {
@@ -889,14 +1028,6 @@ fn parse_chromedriver_address(line: &str) -> Option<String> {
None
}
impl Drop for ChromeInstance {
fn drop(&mut self) {
// Signal child to terminate. Do NOT block here; shutdown should be
// performed with the async `shutdown()` method when possible.
let _ = self.process.start_kill();
}
}
/// Simplified task execution - uses the pool pattern.
pub struct ScrapeTask<T> {
url: String,