From 0af0c1e61567dc3e9af01e8cea9ffe70ac7243d8 Mon Sep 17 00:00:00 2001 From: donpat1to Date: Mon, 17 Nov 2025 13:52:41 +0100 Subject: [PATCH] moved functions for date parsing together --- src/main.rs | 507 ++++++++++++++++++++++------------------------------ 1 file changed, 212 insertions(+), 295 deletions(-) diff --git a/src/main.rs b/src/main.rs index 067e80b..4092d00 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,8 +1,8 @@ -use chrono::{NaiveDate, Duration as ChronoDuration}; +use chrono::{NaiveDate}; use fantoccini::{ClientBuilder, Locator}; use serde::Serialize; use serde_json::{Map, Value}; -use std::{collections::HashMap, process::Command}; +use std::{process::Command}; use tokio::{time::{Duration, sleep}, signal}; #[derive(Debug, Serialize, Clone)] @@ -183,141 +183,201 @@ async fn extract_all_data_via_js(client: &fantoccini::Client) -> anyhow::Result< Ok(vec![]) } -async fn check_data_consistency(events: &[EconomicEvent]) { - println!("\n=== DATA CONSISTENCY CHECKS ==="); - - // Count event name occurrences - let mut event_names: HashMap = HashMap::new(); - for event in events { - *event_names.entry(event.event.clone()).or_insert(0) += 1; - } +async fn set_date_range(client: &fantoccini::Client, start: &str, end: &str) -> anyhow::Result<()> { + let set_dates_script = format!(r#" + (() => {{ + const fromInput = document.querySelector('#dtTeletraderFromDate'); + const toInput = document.querySelector('#dtTeletraderEndDate'); - // Detect duplicates - let duplicates: Vec<_> = event_names - .iter() - .filter(|(_, count)| **count > 1) - .collect(); - - if !duplicates.is_empty() { - println!("⚠️ Found {} duplicate event names:", duplicates.len()); - for (name, count) in duplicates.iter().take(5) { - println!(" - '{}' appears {} times", name, count); - } + if (fromInput) {{ + fromInput.value = '{}'; + fromInput.dispatchEvent(new Event('input', {{ bubbles: true }})); + fromInput.dispatchEvent(new Event('change', {{ bubbles: true }})); + }} + + if (toInput) {{ + 
toInput.value = '{}'; + toInput.dispatchEvent(new Event('input', {{ bubbles: true }})); + toInput.dispatchEvent(new Event('change', {{ bubbles: true }})); + }} + + return !!fromInput && !!toInput; + }})() + "#, start, end); + + client.execute(&set_dates_script, vec![]).await?; + sleep(Duration::from_millis(1000)).await; // Wait for table to update + + // Now read the values + let from_date_value: String = client.execute( + r#"return document.querySelector('#dtTeletraderFromDate')?.value;"#, + vec![], + ).await?.as_str().unwrap_or_default().to_string(); + + let to_date_value: String = client.execute( + r#"return document.querySelector('#dtTeletraderEndDate')?.value;"#, + vec![], + ).await?.as_str().unwrap_or_default().to_string(); + + if from_date_value == start && to_date_value == end { + println!(" Dates set correctly"); } else { - println!("✅ No duplicate event names found"); - } - - // Check time format consistency - let valid_time_format = events.iter() - .filter(|e| { - // Time should be in format "HH:MM" - e.time.len() == 5 && - e.time.chars().nth(2) == Some(':') && - e.time[0..2].chars().all(|c| c.is_ascii_digit()) && - e.time[3..5].chars().all(|c| c.is_ascii_digit()) - }) - .count(); - - println!("⏰ Valid time formats: {}/{}", valid_time_format, events.len()); - - // Check for missing critical data - let critical_fields_missing: Vec<_> = events.iter() - .enumerate() - .filter(|(_, e)| e.event.trim().is_empty() || e.time.trim().is_empty()) - .map(|(i, e)| (i, e)) - .collect(); - - if !critical_fields_missing.is_empty() { - println!("❌ {} events missing critical fields", critical_fields_missing.len()); - } -} - -async fn validate_events(events: &[EconomicEvent]) -> anyhow::Result<()> { - println!("\n=== EVENT VALIDATION ==="); - - // Check if we have any events at all - if events.is_empty() { - println!("❌ ERROR: No events extracted!"); - return Ok(()); - } - - println!("📊 Total events: {}", events.len()); - - // 1. 
Check date range compliance - let date_range_events: Vec<_> = events.iter() - .filter(|e| { - // Extract year from German date format "Dienstag, 2. Januar 2024" - e.date.contains("2024") || e.date.contains("2025") - }) - .collect(); - - println!("📅 Events in 2024-2025 range: {}/{}", - date_range_events.len(), events.len()); - - // 2. Check importance filtering - let high_importance_count = events.iter() - .filter(|e| e.importance == "High") - .count(); - println!("⭐ High importance events: {}/{}", high_importance_count, events.len()); - - // 3. Check data completeness - let complete_events = events.iter() - .filter(|e| { - !e.event.trim().is_empty() && - !e.time.trim().is_empty() && - !e.country.trim().is_empty() && - (!e.actual.trim().is_empty() || !e.forecast.trim().is_empty() || !e.previous.trim().is_empty()) - }) - .count(); - - println!("✅ Complete events: {}/{}", complete_events, events.len()); - - // 4. Check description coverage - let events_with_descriptions = events.iter() - .filter(|e| !e.description.trim().is_empty()) - .count(); - println!("📝 Events with descriptions: {}/{}", events_with_descriptions, events.len()); - - // 5. Distribution analysis - use std::collections::HashMap; - let mut country_distribution: HashMap = HashMap::new(); - let mut month_distribution: HashMap = HashMap::new(); - - for event in events { - *country_distribution.entry(event.country.clone()).or_insert(0) += 1; - - // Extract month from German date - if let Some(month) = extract_month(&event.date) { - *month_distribution.entry(month).or_insert(0) += 1; - } - } - - println!("🌍 Country distribution: {:?}", country_distribution); - println!("📈 Month distribution: {:?}", month_distribution); - - // 6. Sample output for manual inspection - println!("\n🔍 Sample events (first 5):"); - for event in events.iter().take(5) { - println!(" • {} {}: {} - {} (Importance: {})", - event.date, event.time, event.country, event.event, event.importance); + println!(" ❌ Date not set correctly. 
Expected: {}-{}, Got: {}-{}", + start, end, from_date_value, to_date_value); } Ok(()) } -fn extract_month(date_str: &str) -> Option<String> { - // Extract month from German date format - let months = [ - "Januar", "Februar", "März", "April", "Mai", "Juni", - "Juli", "August", "September", "Oktober", "November", "Dezember" +fn parse_any_date(date: &str) -> Option<NaiveDate> { + // Attempt ISO first + if let Ok(d) = NaiveDate::parse_from_str(date.trim(), "%Y-%m-%d") { + return Some(d); + } + + // Convert German → English once + let month_map = [ + ("Januar", "January"), ("Februar", "February"), ("März", "March"), + ("April", "April"), ("Mai", "May"), ("Juni", "June"), + ("Juli", "July"), ("August", "August"), ("September", "September"), + ("Oktober", "October"), ("November", "November"), ("Dezember", "December"), ]; - - for month in months { - if date_str.contains(month) { - return Some(month.to_string()); + + let mut english = date.to_string(); + for (de, en) in month_map { + english = english.replace(de, en); + } + + // Try two formats max + NaiveDate::parse_from_str(&english, "%A, %d. %B %Y") + .or_else(|_| NaiveDate::parse_from_str(&english, "%d. 
%B %Y")) + .ok() +} + +fn calculate_next_start_date(events: &[EconomicEvent]) -> anyhow::Result<String> { + let mut dates: Vec<_> = events + .iter() + .filter_map(|e| parse_any_date(&e.date)) + .collect(); + + if dates.is_empty() { + return Err(anyhow::anyhow!("No parseable dates found")); + } + + dates.sort(); + let next = dates.last().unwrap().succ_opt().unwrap(); + + Ok(next.format("%Y-%m-%d").to_string()) +} + +fn extract_month(date: &str) -> Option<String> { + parse_any_date(date).map(|d| d.format("%B").to_string()) +} + +fn count_valid_times(events: &[EconomicEvent]) -> usize { + events.iter().filter(|e| { + e.time.len() == 5 && + e.time.as_bytes()[2] == b':' && + e.time[..2].chars().all(|c| c.is_ascii_digit()) && + e.time[3..].chars().all(|c| c.is_ascii_digit()) + }).count() +} + +fn missing_critical(e: &EconomicEvent) -> bool { + e.event.trim().is_empty() || e.time.trim().is_empty() +} + +fn is_complete(e: &EconomicEvent) -> bool { + !(e.event.trim().is_empty() || + e.time.trim().is_empty() || + e.country.trim().is_empty()) && + (!e.actual.trim().is_empty() || + !e.forecast.trim().is_empty() || + !e.previous.trim().is_empty()) +} + +async fn check_data_consistency(events: &[EconomicEvent]) { + println!("\n=== DATA CONSISTENCY CHECKS ==="); + + println!("⏰ Valid time formats: {}/{}", count_valid_times(events), events.len()); + + let missing: Vec<_> = events.iter().enumerate() + .filter(|(_, e)| missing_critical(e)) + .collect(); + + if !missing.is_empty() { + println!("❌ {} events missing critical fields", missing.len()); + } +} + +async fn validate_events(events: &[EconomicEvent]) -> anyhow::Result<()> { + println!("\n=== EVENT VALIDATION ==="); + + if events.is_empty() { + println!("❌ ERROR: No events extracted!"); + return Ok(()); + } + + println!("📊 Total events: {}", events.len()); + + // 1. 
Description coverage + let desc_count = events.iter() + .filter(|e| !e.description.trim().is_empty()) + .count(); + + println!("📝 Events with descriptions: {}/{}", desc_count, events.len()); + + // 2. Distributions + use std::collections::HashMap; + let mut country_dist: HashMap = HashMap::new(); + let mut month_dist: HashMap = HashMap::new(); + + for e in events { + *country_dist.entry(e.country.clone()).or_insert(0) += 1; + + if let Some(month) = extract_month(&e.date) { + *month_dist.entry(month).or_insert(0) += 1; } } - None + + println!("🌍 Country distribution: {:?}", country_dist); + println!("📈 Month distribution: {:?}", month_dist); + + // 3. Sample events (first 5) + println!("\n🔍 Sample events (first 5):"); + for event in events.iter().take(5) { + println!( + " • {} {}: {} - {} (Importance: {})", + event.date, event.time, event.country, event.event, event.importance + ); + } + + // 4. Completeness check + let complete_count = events.iter().filter(|e| is_complete(e)).count(); + println!( + "✅ Complete events: {}/{}", + complete_count, + events.len() + ); + + // 5. Date range + let (earliest, latest) = calculate_actual_date_range(events); + println!("📅 Actual date range: {} to {}", earliest, latest); + + // Final summary + println!("\n=== VALIDATION SUMMARY ==="); + println!(" • Total events: {}", events.len()); + println!( + " • Events with descriptions [%]: {}", + (desc_count * 100) / events.len().max(1) + ); + println!( + " • Complete events [%]: {}", + (complete_count * 100) / events.len().max(1) + ); + println!(" • Date range: {} to {}", earliest, latest); + + Ok(()) } fn calculate_actual_date_range(events: &[EconomicEvent]) -> (String, String) { @@ -369,71 +429,6 @@ fn extract_date_from_german_format(german_date: &str) -> Option { NaiveDate::parse_from_str(&english_date, "%A, %d. 
%B %Y").ok() } -fn parse_german_date(german_date: &str) -> Option { - if german_date.trim().is_empty() { - return None; - } - - // Map German month names to numbers - let month_map = [ - ("Januar", 1), ("Februar", 2), ("März", 3), ("April", 4), - ("Mai", 5), ("Juni", 6), ("Juli", 7), ("August", 8), - ("September", 9), ("Oktober", 10), ("November", 11), ("Dezember", 12) - ]; - - // Parse German format: "Montag, 30. April 2007" - let pattern = r"(\d{1,2})\.\s+([a-zA-Zäöüß]+)\s+(\d{4})"; - let re = regex::Regex::new(pattern).unwrap(); - - if let Some(caps) = re.captures(german_date) { - let day = caps.get(1).unwrap().as_str().parse::().ok()?; - let german_month = caps.get(2).unwrap().as_str(); - let year = caps.get(3).unwrap().as_str().parse::().ok()?; - - // Find the month number - let month = month_map.iter() - .find(|(name, _)| *name == german_month) - .map(|(_, num)| *num)?; - - NaiveDate::from_ymd_opt(year, month, day) - } else { - None - } -} - -fn calculate_next_start_date(events: &[EconomicEvent]) -> Result { - // Try to find dates in ISO format first - let iso_dates: Vec = events - .iter() - .filter_map(|e| NaiveDate::parse_from_str(&e.date, "%Y-%m-%d").ok()) - .collect(); - - if !iso_dates.is_empty() { - if let Some(latest) = iso_dates.iter().max() { - let next_date = (*latest + ChronoDuration::days(1)).format("%Y-%m-%d").to_string(); - println!("📅 Calculated next start date from ISO: {} (from latest: {})", next_date, latest); - return Ok(next_date); - } - } - - // Fallback: parse German dates - println!("⚠️ No ISO dates found, trying to parse German dates..."); - let german_dates: Vec = events - .iter() - .filter_map(|e| parse_german_date(&e.date)) - .collect(); - - if let Some(latest) = german_dates.iter().max() { - let next_date = (*latest + ChronoDuration::days(1)).format("%Y-%m-%d").to_string(); - println!("📅 Calculated next start date from German: {} (from latest: {})", next_date, latest); - Ok(next_date) - } else { - // Final fallback: use manual date 
increment - println!("❌ No parseable dates found, using manual increment"); - Err(anyhow::anyhow!("No parseable dates found")) - } -} - async fn scrape_all_events_with_chunking( client: &fantoccini::Client, start_date: &str, @@ -452,62 +447,41 @@ async fn scrape_all_events_with_chunking( println!("⚠️ Reached maximum attempts ({})", max_attempts); break; } - + println!("🚀 Chunk {}: {} to {}", attempts, current_start, end_date); - - // Set dates for current chunk + set_date_range(client, &current_start, end_date).await?; - - // Wait a bit longer for table to load sleep(Duration::from_secs(3)).await; - - // Extract events - let chunk_events = extract_all_data_via_js(client).await?; - - if chunk_events.is_empty() { + + let chunk = extract_all_data_via_js(client).await?; + if chunk.is_empty() { println!("✅ No more events found. Completed!"); break; } - - // Add to total - let chunk_count = chunk_events.len(); - all_events.extend(chunk_events.clone()); - - println!("📊 Chunk {}: {} events (Total: {})", - attempts, chunk_count, all_events.len()); - - // Debug: check what dates we got - let sample_dates: Vec<&str> = chunk_events.iter() - .map(|e| e.date.as_str()) - .filter(|d| !d.is_empty()) - .take(3) - .collect(); - println!(" Sample dates in chunk: {:?}", sample_dates); - // Calculate next start date - match calculate_next_start_date(&chunk_events) { - Ok(next_start) => { - if next_start > end_date.to_string() { - println!("✅ Reached end date. Completed!"); - break; - } - current_start = next_start; - } + println!("📊 Chunk {}: {} events (Total: {})", + attempts, chunk.len(), all_events.len() + chunk.len()); + + all_events.extend(chunk.clone()); + + let next = match calculate_next_start_date(&chunk) { + Ok(n) => n, Err(_) => { println!("❌ Could not calculate next start date. Stopping."); break; } + }; + + if next > end_date.to_string() { + println!("✅ Reached end date. 
Completed!"); + break; } - // Small delay between requests + current_start = next; + sleep(Duration::from_secs(2)).await; - // Export chunk - if let Ok(json) = serde_json::to_string_pretty(&chunk_events) { - let filename = format!("economic_events_{}_chunk_{}.json", json_export_now, attempts); - tokio::fs::write(&filename, json).await?; - println!(" Chunk data exported to: {}", filename); - } + export_chunk(&chunk, attempts, json_export_now.to_string().clone()).await?; } // Remove duplicates @@ -527,49 +501,11 @@ async fn scrape_all_events_with_chunking( Ok(all_events) } -async fn set_date_range(client: &fantoccini::Client, start: &str, end: &str) -> anyhow::Result<()> { - let set_dates_script = format!(r#" - (() => {{ - const fromInput = document.querySelector('#dtTeletraderFromDate'); - const toInput = document.querySelector('#dtTeletraderEndDate'); - - if (fromInput) {{ - fromInput.value = '{}'; - fromInput.dispatchEvent(new Event('input', {{ bubbles: true }})); - fromInput.dispatchEvent(new Event('change', {{ bubbles: true }})); - }} - - if (toInput) {{ - toInput.value = '{}'; - toInput.dispatchEvent(new Event('input', {{ bubbles: true }})); - toInput.dispatchEvent(new Event('change', {{ bubbles: true }})); - }} - - return !!fromInput && !!toInput; - }})() - "#, start, end); - - client.execute(&set_dates_script, vec![]).await?; - sleep(Duration::from_millis(1000)).await; // Wait for table to update - - // Now read the values - let from_date_value: String = client.execute( - r#"return document.querySelector('#dtTeletraderFromDate')?.value;"#, - vec![], - ).await?.as_str().unwrap_or_default().to_string(); - - let to_date_value: String = client.execute( - r#"return document.querySelector('#dtTeletraderEndDate')?.value;"#, - vec![], - ).await?.as_str().unwrap_or_default().to_string(); - - if from_date_value == start && to_date_value == end { - println!(" Dates set correctly"); - } else { - println!(" ❌ Date not set correctly. 
Expected: {}-{}, Got: {}-{}", - start, end, from_date_value, to_date_value); - } - +async fn export_chunk(chunk: &[EconomicEvent], n: usize, ts: String) -> anyhow::Result<()> { + let filename = format!("economic_events_{}_chunk_{}.json", ts, n); + let json = serde_json::to_string_pretty(chunk)?; + tokio::fs::write(&filename, json).await?; + println!(" Chunk data exported to: {}", filename); Ok(()) } @@ -634,25 +570,6 @@ async fn main() -> anyhow::Result<()> { validate_events(&events).await?; check_data_consistency(&events).await; - // Calculate actual date range from extracted data - let actual_date_range = calculate_actual_date_range(&events); - let current_date = chrono::Local::now().format("%Y-%m-%d").to_string(); - - // Final summary - println!("\n🎯 EXTRACTION SUMMARY:"); - println!(" • Total high-importance events: {}", events.len()); - println!(" • Requested range: 2007-02-13 to 2025-12-01"); - println!(" • Actual extracted range: {} to {}", actual_date_range.0, actual_date_range.1); - println!(" • Data extracted until: {}", current_date); - println!(" • Data quality: {}% complete", - (events.iter().filter(|e| !e.event.trim().is_empty()).count() * 100) / events.len().max(1)); - - // Check coverage - if actual_date_range.1 < "2025-12-01".to_string() { - println!("⚠️ WARNING: Did not reach end date. Last extracted date: {}", actual_date_range.1); - println!(" • Next run should start from: {}", calculate_next_start_date(&events).unwrap_or_else(|_| actual_date_range.1)); - } - // Export for further analysis if let Ok(json) = serde_json::to_string_pretty(&events) { let filename = format!("economic_events_{}_combined.json", chrono::Local::now().format("%Y%m%d_%H%M%S"));