From c56fcfdd72311dea44ca58001e5848c76ab0cb19 Mon Sep 17 00:00:00 2001 From: donpat1to Date: Mon, 17 Nov 2025 15:20:09 +0100 Subject: [PATCH] persistent scraping --- src/main.rs | 646 +++++++++++++++++++++++++--------------------------- 1 file changed, 310 insertions(+), 336 deletions(-) diff --git a/src/main.rs b/src/main.rs index 4092d00..ec520a5 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,11 +1,15 @@ -use chrono::{NaiveDate}; +use chrono::{NaiveDate, Datelike}; use fantoccini::{ClientBuilder, Locator}; -use serde::Serialize; +use serde::{Deserialize, Serialize}; use serde_json::{Map, Value}; -use std::{process::Command}; -use tokio::{time::{Duration, sleep}, signal}; +use std::{path::PathBuf, process::Command}; +use tokio::{ + fs, + signal, + time::{sleep, Duration}, +}; -#[derive(Debug, Serialize, Clone)] +#[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq, Hash)] struct EconomicEvent { country: String, date: String, @@ -18,6 +22,14 @@ struct EconomicEvent { description: String, } +#[derive(Debug)] +struct ChunkInfo { + start_date: String, + end_date: String, + path: PathBuf, + event_count: usize, +} + fn start_chromedriver(port: u16) -> std::process::Child { Command::new("chromedriver-win64/chromedriver.exe") .args(&[format!("--port={}", port)]) @@ -26,10 +38,10 @@ fn start_chromedriver(port: u16) -> std::process::Child { } async fn dismiss_overlays(client: &fantoccini::Client) -> anyhow::Result<()> { - // Single strategy: wait for and remove iframe for _ in 0..10 { - let removed: bool = client.execute( - r#"(() => { + let removed: bool = client + .execute( + r#"(() => { const iframe = document.querySelector('iframe[title="Contentpass First Layer"]'); if (iframe && iframe.parentNode) { iframe.parentNode.removeChild(iframe); @@ -37,18 +49,23 @@ async fn dismiss_overlays(client: &fantoccini::Client) -> anyhow::Result<()> { } return false; })()"#, - vec![] - ).await?.as_bool().unwrap_or(false); - - if removed { break; } + vec![], + ) + .await? 
+ .as_bool() + .unwrap_or(false); + + if removed { + break; + } sleep(Duration::from_millis(500)).await; } Ok(()) } -async fn extract_all_data_via_js(client: &fantoccini::Client) -> anyhow::Result<Vec<EconomicEvent>> { - println!("Extracting ONLY 3-star events via JavaScript..."); - +async fn extract_all_data_via_js( + client: &fantoccini::Client, +) -> anyhow::Result<Vec<EconomicEvent>> { let extraction_script = r#" const events = []; let currentDate = ''; @@ -60,29 +77,23 @@ async fn extract_all_data_via_js(client: &fantoccini::Client) -> anyhow::Result< const cells = row.querySelectorAll('td'); if (cells.length === 1 && cells[0].colSpan === 9) { - // This is a date header row - extract and parse the date const dateText = cells[0].textContent.trim(); - console.log('Found date header:', dateText); - // Convert German date to ISO format (YYYY-MM-DD) const monthMap = { 'Januar': '01', 'Februar': '02', 'März': '03', 'April': '04', 'Mai': '05', 'Juni': '06', 'Juli': '07', 'August': '08', 'September': '09', 'Oktober': '10', 'November': '11', 'Dezember': '12' }; - // Extract date parts from German format "Montag, 30. 
April 2007" - const dateParts = dateText.match(/(\d{1,2})\.\s+([a-zA-Zäöüß]+)\s+(\d{4})/); + const dateParts = dateText.match(/(\d{1,2})\.\s+([a-zA-ZäöüßÄÖÜ]+)\s+(\d{4})/); if (dateParts) { const day = dateParts[1].padStart(2, '0'); const germanMonth = dateParts[2]; const year = dateParts[3]; const month = monthMap[germanMonth] || '01'; currentDate = `${year}-${month}-${day}`; - console.log('Converted date:', currentDate, 'from:', dateText); } else { - console.log('Failed to parse date:', dateText); - currentDate = ''; // Reset if parsing fails + currentDate = ''; } continue; } @@ -94,11 +105,9 @@ async fn extract_all_data_via_js(client: &fantoccini::Client) -> anyhow::Result< if (!time || !country || !eventName) continue; - // Count ONLY YELLOW stars (high importance) const importanceCell = cells[3]; const yellowStarCount = importanceCell?.querySelectorAll('.icon--star.font-color-yellow').length || 0; - // STRICT FILTER: Only include events with EXACTLY 3 YELLOW stars if (yellowStarCount === 3) { let description = ''; if (i + 1 < rows.length) { @@ -114,7 +123,7 @@ async fn extract_all_data_via_js(client: &fantoccini::Client) -> anyhow::Result< events.push({ country: country, - date: currentDate, // Now using ISO format date + date: currentDate, time: time, event: eventName, actual: cells[7]?.textContent?.trim() || '', @@ -127,64 +136,79 @@ async fn extract_all_data_via_js(client: &fantoccini::Client) -> anyhow::Result< } } - console.log('Total events extracted:', events.length); - if (events.length > 0) { - console.log('First event date:', events[0].date); - console.log('Last event date:', events[events.length - 1].date); - } - return events; "#; - + let result = client.execute(extraction_script, vec![]).await?; - - // Parse the JSON result into EconomicEvent structs + if let Some(events_array) = result.as_array() { let mut events = Vec::new(); for event_value in events_array { if let Some(event_obj) = event_value.as_object() { let event = EconomicEvent { - 
country: event_obj.get("country").and_then(|v| v.as_str()).unwrap_or("").to_string(), - date: event_obj.get("date").and_then(|v| v.as_str()).unwrap_or("").to_string(), - time: event_obj.get("time").and_then(|v| v.as_str()).unwrap_or("").to_string(), - event: event_obj.get("event").and_then(|v| v.as_str()).unwrap_or("").to_string(), - actual: event_obj.get("actual").and_then(|v| v.as_str()).unwrap_or("").to_string(), - forecast: event_obj.get("forecast").and_then(|v| v.as_str()).unwrap_or("").to_string(), - previous: event_obj.get("previous").and_then(|v| v.as_str()).unwrap_or("").to_string(), - importance: event_obj.get("importance").and_then(|v| v.as_str()).unwrap_or("").to_string(), - description: event_obj.get("description").and_then(|v| v.as_str()).unwrap_or("").to_string(), + country: event_obj + .get("country") + .and_then(|v| v.as_str()) + .unwrap_or("") + .to_string(), + date: event_obj + .get("date") + .and_then(|v| v.as_str()) + .unwrap_or("") + .to_string(), + time: event_obj + .get("time") + .and_then(|v| v.as_str()) + .unwrap_or("") + .to_string(), + event: event_obj + .get("event") + .and_then(|v| v.as_str()) + .unwrap_or("") + .to_string(), + actual: event_obj + .get("actual") + .and_then(|v| v.as_str()) + .unwrap_or("") + .to_string(), + forecast: event_obj + .get("forecast") + .and_then(|v| v.as_str()) + .unwrap_or("") + .to_string(), + previous: event_obj + .get("previous") + .and_then(|v| v.as_str()) + .unwrap_or("") + .to_string(), + importance: event_obj + .get("importance") + .and_then(|v| v.as_str()) + .unwrap_or("") + .to_string(), + description: event_obj + .get("description") + .and_then(|v| v.as_str()) + .unwrap_or("") + .to_string(), }; events.push(event); } } - println!("Extracted {} events (3 YELLOW stars ONLY) via JavaScript", events.len()); - - // Debug: show date range of extracted events - if !events.is_empty() { - let dates: Vec<&str> = events.iter().map(|e| e.date.as_str()).filter(|d| !d.is_empty()).collect(); - if 
!dates.is_empty() { - let min_date = dates.iter().min().unwrap_or(&"N/A"); - let max_date = dates.iter().max().unwrap_or(&"N/A"); - println!("📅 Extracted date range: {} to {}", min_date, max_date); - - // Show sample of dates for debugging - println!("Sample dates:"); - for (i, date) in dates.iter().take(5).enumerate() { - println!(" {}. {}", i + 1, date); - } - } else { - println!("❌ No valid dates found in extracted events"); - } - } - + println!("Extracted {} events (3 YELLOW stars ONLY)", events.len()); return Ok(events); } - + Ok(vec![]) } -async fn set_date_range(client: &fantoccini::Client, start: &str, end: &str) -> anyhow::Result<()> { - let set_dates_script = format!(r#" +async fn set_date_range( + client: &fantoccini::Client, + start: &str, + end: &str, +) -> anyhow::Result<()> { + let set_dates_script = format!( + r#" (() => {{ const fromInput = document.querySelector('#dtTeletraderFromDate'); const toInput = document.querySelector('#dtTeletraderEndDate'); @@ -203,62 +227,22 @@ async fn set_date_range(client: &fantoccini::Client, start: &str, end: &str) -> return !!fromInput && !!toInput; }})() - "#, start, end); + "#, + start, end + ); client.execute(&set_dates_script, vec![]).await?; - sleep(Duration::from_millis(1000)).await; // Wait for table to update + sleep(Duration::from_millis(1000)).await; - // Now read the values - let from_date_value: String = client.execute( - r#"return document.querySelector('#dtTeletraderFromDate')?.value;"#, - vec![], - ).await?.as_str().unwrap_or_default().to_string(); - - let to_date_value: String = client.execute( - r#"return document.querySelector('#dtTeletraderEndDate')?.value;"#, - vec![], - ).await?.as_str().unwrap_or_default().to_string(); - - if from_date_value == start && to_date_value == end { - println!(" Dates set correctly"); - } else { - println!(" ❌ Date not set correctly. 
Expected: {}-{}, Got: {}-{}", - start, end, from_date_value, to_date_value); - } - Ok(()) } -fn parse_any_date(date: &str) -> Option<NaiveDate> { - // Attempt ISO first - if let Ok(d) = NaiveDate::parse_from_str(date.trim(), "%Y-%m-%d") { - return Some(d); - } - - // Convert German → English once - let month_map = [ - ("Januar", "January"), ("Februar", "February"), ("März", "March"), - ("April", "April"), ("Mai", "May"), ("Juni", "June"), - ("Juli", "July"), ("August", "August"), ("September", "September"), - ("Oktober", "October"), ("November", "November"), ("Dezember", "December"), - ]; - - let mut english = date.to_string(); - for (de, en) in month_map { - english = english.replace(de, en); - } - - // Try two formats max - NaiveDate::parse_from_str(&english, "%A, %d. %B %Y") - .or_else(|_| NaiveDate::parse_from_str(&english, "%d. %B %Y")) - .ok() +fn parse_date(date: &str) -> Option<NaiveDate> { + NaiveDate::parse_from_str(date.trim(), "%Y-%m-%d").ok() } fn calculate_next_start_date(events: &[EconomicEvent]) -> anyhow::Result<String> { - let mut dates: Vec<_> = events - .iter() - .filter_map(|e| parse_any_date(&e.date)) - .collect(); + let mut dates: Vec<_> = events.iter().filter_map(|e| parse_date(&e.date)).collect(); if dates.is_empty() { return Err(anyhow::anyhow!("No parseable dates found")); @@ -270,242 +254,220 @@ fn calculate_next_start_date(events: &[EconomicEvent]) -> anyhow::Result Ok(next.format("%Y-%m-%d").to_string()) } -fn extract_month(date: &str) -> Option<String> { - parse_any_date(date).map(|d| d.format("%B").to_string()) -} - -fn count_valid_times(events: &[EconomicEvent]) -> usize { - events.iter().filter(|e| { - e.time.len() == 5 && - e.time.as_bytes()[2] == b':' && - e.time[..2].chars().all(|c| c.is_ascii_digit()) && - e.time[3..].chars().all(|c| c.is_ascii_digit()) - }).count() -} - -fn missing_critical(e: &EconomicEvent) -> bool { - e.event.trim().is_empty() || e.time.trim().is_empty() -} - -fn is_complete(e: &EconomicEvent) -> bool { - !(e.event.trim().is_empty() || 
e.time.trim().is_empty() || - e.country.trim().is_empty()) && - (!e.actual.trim().is_empty() || - !e.forecast.trim().is_empty() || - !e.previous.trim().is_empty()) -} - -async fn check_data_consistency(events: &[EconomicEvent]) { - println!("\n=== DATA CONSISTENCY CHECKS ==="); - - println!("⏰ Valid time formats: {}/{}", count_valid_times(events), events.len()); - - let missing: Vec<_> = events.iter().enumerate() - .filter(|(_, e)| missing_critical(e)) - .collect(); - - if !missing.is_empty() { - println!("❌ {} events missing critical fields", missing.len()); - } -} - -async fn validate_events(events: &[EconomicEvent]) -> anyhow::Result<()> { - println!("\n=== EVENT VALIDATION ==="); - - if events.is_empty() { - println!("❌ ERROR: No events extracted!"); - return Ok(()); +/// Scan the economic_events directory for existing chunks +async fn scan_existing_chunks() -> anyhow::Result<Vec<ChunkInfo>> { + let events_dir = PathBuf::from("economic_events"); + + // Create directory if it doesn't exist + if !events_dir.exists() { + fs::create_dir_all(&events_dir).await?; + println!("📁 Created economic_events directory"); + return Ok(vec![]); } - println!("📊 Total events: {}", events.len()); + let mut chunks = Vec::new(); + let mut entries = fs::read_dir(&events_dir).await?; - // 1. Description coverage - let desc_count = events.iter() - .filter(|e| !e.description.trim().is_empty()) - .count(); - - println!("📝 Events with descriptions: {}/{}", desc_count, events.len()); - - // 2. Distributions - use std::collections::HashMap; - let mut country_dist: HashMap<String, usize> = HashMap::new(); - let mut month_dist: HashMap<String, usize> = HashMap::new(); - - for e in events { - *country_dist.entry(e.country.clone()).or_insert(0) += 1; - - if let Some(month) = extract_month(&e.date) { - *month_dist.entry(month).or_insert(0) += 1; + while let Some(entry) = entries.next_entry().await? 
{ + let path = entry.path(); + if path.extension().and_then(|s| s.to_str()) == Some("json") { + if let Some(filename) = path.file_stem().and_then(|s| s.to_str()) { + // Parse filename: chunk_{startdate}_{enddate}.json + if let Some(dates) = filename.strip_prefix("chunk_") { + let parts: Vec<&str> = dates.split('_').collect(); + if parts.len() == 2 { + // Load and count events + if let Ok(content) = fs::read_to_string(&path).await { + if let Ok(events) = serde_json::from_str::<Vec<EconomicEvent>>(&content) { + chunks.push(ChunkInfo { + start_date: parts[0].to_string(), + end_date: parts[1].to_string(), + path: path.clone(), + event_count: events.len(), + }); + } + } + } + } + } } } - println!("🌍 Country distribution: {:?}", country_dist); - println!("📈 Month distribution: {:?}", month_dist); - - // 3. Sample events (first 5) - println!("\n🔍 Sample events (first 5):"); - for event in events.iter().take(5) { - println!( - " • {} {}: {} - {} (Importance: {})", - event.date, event.time, event.country, event.event, event.importance - ); + chunks.sort_by(|a, b| a.start_date.cmp(&b.start_date)); + + if !chunks.is_empty() { + println!("\n📊 Found {} existing chunks:", chunks.len()); + for chunk in &chunks { + println!(" • {} to {} ({} events)", + chunk.start_date, chunk.end_date, chunk.event_count); + } + } else { + println!("📭 No existing chunks found"); } - // 4. Completeness check - let complete_count = events.iter().filter(|e| is_complete(e)).count(); - println!( - "✅ Complete events: {}/{}", - complete_count, - events.len() - ); + Ok(chunks) +} - // 5. 
Date range - let (earliest, latest) = calculate_actual_date_range(events); - println!("📅 Actual date range: {} to {}", earliest, latest); +/// Calculate target end date: first day of month, 3 months from now +fn calculate_target_end_date() -> String { + let now = chrono::Local::now().naive_local().date(); + let three_months_ahead = if now.month() + 3 > 12 { + NaiveDate::from_ymd_opt(now.year() + 1, (now.month() + 3) % 12, 1) + } else { + NaiveDate::from_ymd_opt(now.year(), now.month() + 3, 1) + }.unwrap(); + + three_months_ahead.format("%Y-%m-%d").to_string() +} - // Final summary - println!("\n=== VALIDATION SUMMARY ==="); - println!(" • Total events: {}", events.len()); - println!( - " • Events with descriptions [%]: {}", - (desc_count * 100) / events.len().max(1) - ); - println!( - " • Complete events [%]: {}", - (complete_count * 100) / events.len().max(1) - ); - println!(" • Date range: {} to {}", earliest, latest); +/// Determine what date range needs to be scraped based on existing data +fn determine_scrape_range(chunks: &[ChunkInfo], target_end: &str) -> Option<(String, String)> { + let now = chrono::Local::now().naive_local().date().format("%Y-%m-%d").to_string(); + + if chunks.is_empty() { + // No data exists, start from beginning + println!("📭 No existing data - scraping from 2007-02-13 to {}", target_end); + return Some(("2007-02-13".to_string(), target_end.to_string())); + } + + // Find the latest date in existing chunks + let latest_chunk_date = chunks.iter() + .map(|c| &c.end_date) + .max() + .cloned() + .unwrap_or_else(|| "2007-02-13".to_string()); + + println!("📊 Latest existing data: {}", latest_chunk_date); + + if latest_chunk_date >= now { + // Data is ahead of current date - update from now to target + println!("🔄 Data exists beyond today - updating from {} to {}", now, target_end); + Some((now, target_end.to_string())) + } else { + // Data is behind - continue from where it left off + let next_start = parse_date(&latest_chunk_date) + 
.and_then(|d| d.succ_opt()) + .map(|d| d.format("%Y-%m-%d").to_string()) + .unwrap_or_else(|| latest_chunk_date.clone()); + + println!("➡️ Continuing from {} to {}", next_start, target_end); + Some((next_start, target_end.to_string())) + } +} +/// Save a chunk to disk +async fn save_chunk(events: &[EconomicEvent], start: &str, end: &str) -> anyhow::Result<()> { + let events_dir = PathBuf::from("economic_events"); + fs::create_dir_all(&events_dir).await?; + + let filename = format!("chunk_{}_{}.json", start, end); + let filepath = events_dir.join(&filename); + + let json = serde_json::to_string_pretty(events)?; + fs::write(&filepath, json).await?; + + println!("💾 Saved chunk: {} ({} events)", filename, events.len()); Ok(()) } -fn calculate_actual_date_range(events: &[EconomicEvent]) -> (String, String) { - if events.is_empty() { - return ("No data".to_string(), "No data".to_string()); +/// Load all events from existing chunks +async fn load_all_events(chunks: &[ChunkInfo]) -> anyhow::Result<Vec<EconomicEvent>> { + let mut all_events = Vec::new(); + + for chunk in chunks { + if let Ok(content) = fs::read_to_string(&chunk.path).await { + if let Ok(events) = serde_json::from_str::<Vec<EconomicEvent>>(&content) { + all_events.extend(events); + } + } } - - let mut dates: Vec<NaiveDate> = events - .iter() - .filter_map(|e| { - // Parse German date format "Dienstag, 2. 
Januar 2024" - extract_date_from_german_format(&e.date) - }) - .collect(); - - dates.sort(); - - let earliest = dates.first().map(|d| d.format("%Y-%m-%d").to_string()) - .unwrap_or_else(|| "Unknown".to_string()); - let latest = dates.last().map(|d| d.format("%Y-%m-%d").to_string()) - .unwrap_or_else(|| "Unknown".to_string()); - - (earliest, latest) + + println!("📥 Loaded {} events from existing chunks", all_events.len()); + Ok(all_events) } -fn extract_date_from_german_format(german_date: &str) -> Option<NaiveDate> { - // Map German month names to English - let month_map = [ - ("Januar", "January"), - ("Februar", "February"), - ("März", "March"), - ("April", "April"), - ("Mai", "May"), - ("Juni", "June"), - ("Juli", "July"), - ("August", "August"), - ("September", "September"), - ("Oktober", "October"), - ("November", "November"), - ("Dezember", "December"), - ]; +/// Scrape events for a specific date range and save chunks immediately +async fn scrape_date_range( + client: &fantoccini::Client, + start: &str, + end: &str, +) -> anyhow::Result<()> { + println!("\n🎯 Scraping range: {} to {}", start, end); - let mut english_date = german_date.to_string(); - for (de, en) in &month_map { - english_date = english_date.replace(de, en); - } - - // Parse "Tuesday, 2. January 2024" format - NaiveDate::parse_from_str(&english_date, "%A, %d. 
%B %Y").ok() } -async fn scrape_all_events_with_chunking( - client: &fantoccini::Client, - start_date: &str, - end_date: &str -) -> anyhow::Result<Vec<EconomicEvent>> { - let json_export_now = chrono::Local::now().format("%Y%m%d_%H%M%S"); - - let mut all_events: Vec<EconomicEvent> = Vec::new(); - let mut current_start = start_date.to_string(); - let mut attempts = 0; - let max_attempts = 300; + let mut current_start = start.to_string(); + let mut chunk_number = 0; loop { - attempts += 1; - if attempts > max_attempts { - println!("⚠️ Reached maximum attempts ({})", max_attempts); - break; - } - - println!("🚀 Chunk {}: {} to {}", attempts, current_start, end_date); - - set_date_range(client, &current_start, end_date).await?; + set_date_range(client, &current_start, end).await?; sleep(Duration::from_secs(3)).await; - let chunk = extract_all_data_via_js(client).await?; - if chunk.is_empty() { - println!("✅ No more events found. Completed!"); + let events = extract_all_data_via_js(client).await?; + if events.is_empty() { + println!(" ✅ No more events in this range"); break; } - println!("📊 Chunk {}: {} events (Total: {})", - attempts, chunk.len(), all_events.len() + chunk.len()); + chunk_number += 1; + println!(" 📦 Fetched {} events", events.len()); - all_events.extend(chunk.clone()); + // Calculate actual date range of this chunk + let chunk_start = events.iter() + .filter_map(|e| parse_date(&e.date)) + .min() + .map(|d| d.format("%Y-%m-%d").to_string()) + .unwrap_or_else(|| current_start.clone()); + + let chunk_end = events.iter() + .filter_map(|e| parse_date(&e.date)) + .max() + .map(|d| d.format("%Y-%m-%d").to_string()) + .unwrap_or_else(|| end.to_string()); - let next = match calculate_next_start_date(&chunk) { + // Save chunk immediately + save_chunk(&events, &chunk_start, &chunk_end).await?; + + let next = match calculate_next_start_date(&events) { Ok(n) => n, Err(_) => { - println!("❌ Could not calculate next start date. 
Stopping."); + println!(" ⚠️ Cannot calculate next date, stopping"); break; } }; - if next > end_date.to_string() { - println!("✅ Reached end date. Completed!"); + if next > end.to_string() { + println!(" ✅ Reached end of range"); break; } current_start = next; - sleep(Duration::from_secs(2)).await; - - export_chunk(&chunk, attempts, json_export_now.to_string().clone()).await?; } - - // Remove duplicates - let initial_count = all_events.len(); - all_events.sort_by(|a, b| { - a.date.cmp(&b.date) - .then(a.time.cmp(&b.time)) - .then(a.event.cmp(&b.event)) - }); - all_events.dedup_by(|a, b| { - a.date == b.date && a.time == b.time && a.event == b.event - }); - - println!("🎯 FINAL: Collected {} unique events (removed {} duplicates)", - all_events.len(), initial_count - all_events.len()); - - Ok(all_events) + + Ok(()) } -async fn export_chunk(chunk: &[EconomicEvent], n: usize, ts: String) -> anyhow::Result<()> { - let filename = format!("economic_events_{}_chunk_{}.json", ts, n); - let json = serde_json::to_string_pretty(chunk)?; - tokio::fs::write(&filename, json).await?; - println!(" Chunk data exported to: {}", filename); +/// Main scraping logic with persistent storage +async fn scrape_with_persistence( + client: &fantoccini::Client, +) -> anyhow::Result<()> { + // Calculate target end date (3 months ahead, 1st of month) + let target_end = calculate_target_end_date(); + println!("🎯 Target end date: {}", target_end); + + // Scan for existing chunks + let existing_chunks = scan_existing_chunks().await?; + + // Determine what range needs to be scraped + let scrape_range = determine_scrape_range(&existing_chunks, &target_end); + + if let Some((start, end)) = scrape_range { + // Scrape the needed range (saves chunks automatically) + scrape_date_range(client, &start, &end).await?; + println!("\n✅ Scraping complete!"); + } else { + println!("✅ All data is up to date!"); + } + Ok(()) } @@ -515,20 +477,20 @@ async fn main() -> anyhow::Result<()> { let mut chromedriver = 
start_chromedriver(port); sleep(Duration::from_secs(1)).await; - // Chrome options let caps_value = serde_json::json!({ "goog:chromeOptions": { "args": [ "--disable-gpu", "--disable-notifications", "--disable-popup-blocking", - "--disable-blink-features=AutomationControlled" + "--disable-blink-features=AutomationControlled", ], "excludeSwitches": ["enable-automation"] } }); - let caps_map: Map<String, Value> = caps_value.as_object() + let caps_map: Map<String, Value> = caps_value + .as_object() .expect("Capabilities should be a JSON object") .clone(); @@ -537,48 +499,60 @@ async fn main() -> anyhow::Result<()> { .connect(&format!("http://localhost:{}", port)) .await?; - // Setup graceful shutdown on Ctrl+C + // Setup graceful shutdown let shutdown_client = client.clone(); - let shutdown_handle = tokio::spawn(async move { - signal::ctrl_c().await.expect("Failed to listen for ctrl+c"); + tokio::spawn(async move { + signal::ctrl_c() + .await + .expect("Failed to listen for ctrl+c"); println!("\nCtrl+C received, shutting down..."); shutdown_client.close().await.ok(); - chromedriver.kill().ok(); std::process::exit(0); }); - // Go to page + // Navigate to page let url = "https://www.finanzen.net/termine/wirtschaftsdaten/"; client.goto(url).await?; - // Dismiss overlays dismiss_overlays(&client).await?; - // Click the high importance tab - if let Ok(tab) = client.find(Locator::Css(r#"div[data-sg-tab-item="teletrader-dates-three-stars"]"#)).await { + // Click high importance tab + if let Ok(tab) = client + .find(Locator::Css( + r#"div[data-sg-tab-item="teletrader-dates-three-stars"]"#, + )) + .await + { tab.click().await?; - println!("High importance tab clicked"); + println!("✓ High importance tab clicked"); sleep(Duration::from_secs(2)).await; - } else { - println!("High importance tab not found"); } - // Use chunking to extract all events across the entire date range - let events = scrape_all_events_with_chunking(&client, "2007-02-13", "2025-12-01").await?; + // Run persistent scraping + 
scrape_with_persistence(&client).await?; - // Run validation suite - validate_events(&events).await?; - check_data_consistency(&events).await; - - // Export for further analysis - if let Ok(json) = serde_json::to_string_pretty(&events) { - let filename = format!("economic_events_{}_combined.json", chrono::Local::now().format("%Y%m%d_%H%M%S")); - tokio::fs::write(&filename, json).await?; - println!(" • Combined data exported to: {}", filename); + // Load and display summary + let chunks = scan_existing_chunks().await?; + let all_events = load_all_events(&chunks).await?; + + println!("\n📊 FINAL SUMMARY:"); + println!(" • Total chunks: {}", chunks.len()); + println!(" • Total events: {}", all_events.len()); + + if !chunks.is_empty() { + let dates: Vec<String> = all_events.iter() + .filter_map(|e| parse_date(&e.date)) + .map(|d| d.format("%Y-%m-%d").to_string()) + .collect(); + if !dates.is_empty() { + let min = dates.iter().min().unwrap(); + let max = dates.iter().max().unwrap(); + println!(" • Date range: {} to {}", min, max); + } } - // Wait for Ctrl+C - shutdown_handle.await.ok(); + client.close().await?; + chromedriver.kill()?; Ok(()) } \ No newline at end of file