persistent scraping

2025-11-17 15:20:09 +01:00
parent 0af0c1e615
commit c56fcfdd72


@@ -1,11 +1,15 @@
-use chrono::{NaiveDate};
+use chrono::{NaiveDate, Datelike};
 use fantoccini::{ClientBuilder, Locator};
-use serde::Serialize;
+use serde::{Deserialize, Serialize};
 use serde_json::{Map, Value};
-use std::{process::Command};
-use tokio::{time::{Duration, sleep}, signal};
+use std::{path::PathBuf, process::Command};
+use tokio::{
+    fs,
+    signal,
+    time::{sleep, Duration},
+};
 
-#[derive(Debug, Serialize, Clone)]
+#[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq, Hash)]
 struct EconomicEvent {
     country: String,
     date: String,
@@ -18,6 +22,14 @@ struct EconomicEvent {
     description: String,
 }
 
+#[derive(Debug)]
+struct ChunkInfo {
+    start_date: String,
+    end_date: String,
+    path: PathBuf,
+    event_count: usize,
+}
+
 fn start_chromedriver(port: u16) -> std::process::Child {
     Command::new("chromedriver-win64/chromedriver.exe")
         .args(&[format!("--port={}", port)])
@@ -26,9 +38,9 @@ fn start_chromedriver(port: u16) -> std::process::Child {
 }
 
 async fn dismiss_overlays(client: &fantoccini::Client) -> anyhow::Result<()> {
-    // Single strategy: wait for and remove iframe
     for _ in 0..10 {
-        let removed: bool = client.execute(
+        let removed: bool = client
+            .execute(
             r#"(() => {
                 const iframe = document.querySelector('iframe[title="Contentpass First Layer"]');
                 if (iframe && iframe.parentNode) {
@@ -37,18 +49,23 @@ async fn dismiss_overlays(client: &fantoccini::Client) -> anyhow::Result<()> {
                 }
                 return false;
             })()"#,
-            vec![]
-        ).await?.as_bool().unwrap_or(false);
+                vec![],
+            )
+            .await?
+            .as_bool()
+            .unwrap_or(false);
 
-        if removed { break; }
+        if removed {
+            break;
+        }
         sleep(Duration::from_millis(500)).await;
     }
     Ok(())
 }
 
-async fn extract_all_data_via_js(client: &fantoccini::Client) -> anyhow::Result<Vec<EconomicEvent>> {
-    println!("Extracting ONLY 3-star events via JavaScript...");
+async fn extract_all_data_via_js(
+    client: &fantoccini::Client,
+) -> anyhow::Result<Vec<EconomicEvent>> {
     let extraction_script = r#"
         const events = [];
         let currentDate = '';
@@ -60,29 +77,23 @@ async fn extract_all_data_via_js(client: &fantoccini::Client) -> anyhow::Result<
             const cells = row.querySelectorAll('td');
 
             if (cells.length === 1 && cells[0].colSpan === 9) {
-                // This is a date header row - extract and parse the date
                 const dateText = cells[0].textContent.trim();
-                console.log('Found date header:', dateText);
 
-                // Convert German date to ISO format (YYYY-MM-DD)
                 const monthMap = {
                     'Januar': '01', 'Februar': '02', 'März': '03', 'April': '04',
                     'Mai': '05', 'Juni': '06', 'Juli': '07', 'August': '08',
                     'September': '09', 'Oktober': '10', 'November': '11', 'Dezember': '12'
                 };
 
-                // Extract date parts from German format "Montag, 30. April 2007"
-                const dateParts = dateText.match(/(\d{1,2})\.\s+([a-zA-Zäöüß]+)\s+(\d{4})/);
+                const dateParts = dateText.match(/(\d{1,2})\.\s+([a-zA-ZäöüßÄÖÜ]+)\s+(\d{4})/);
                 if (dateParts) {
                     const day = dateParts[1].padStart(2, '0');
                     const germanMonth = dateParts[2];
                     const year = dateParts[3];
                     const month = monthMap[germanMonth] || '01';
                     currentDate = `${year}-${month}-${day}`;
-                    console.log('Converted date:', currentDate, 'from:', dateText);
                 } else {
-                    console.log('Failed to parse date:', dateText);
-                    currentDate = ''; // Reset if parsing fails
+                    currentDate = '';
                 }
                 continue;
             }
@@ -94,11 +105,9 @@ async fn extract_all_data_via_js(client: &fantoccini::Client) -> anyhow::Result<
             if (!time || !country || !eventName) continue;
 
-            // Count ONLY YELLOW stars (high importance)
             const importanceCell = cells[3];
             const yellowStarCount = importanceCell?.querySelectorAll('.icon--star.font-color-yellow').length || 0;
 
-            // STRICT FILTER: Only include events with EXACTLY 3 YELLOW stars
             if (yellowStarCount === 3) {
                 let description = '';
                 if (i + 1 < rows.length) {
@@ -114,7 +123,7 @@ async fn extract_all_data_via_js(client: &fantoccini::Client) -> anyhow::Result<
                 events.push({
                     country: country,
-                    date: currentDate, // Now using ISO format date
+                    date: currentDate,
                     time: time,
                     event: eventName,
                     actual: cells[7]?.textContent?.trim() || '',
@@ -127,64 +136,79 @@ async fn extract_all_data_via_js(client: &fantoccini::Client) -> anyhow::Result<
             }
         }
 
-        console.log('Total events extracted:', events.length);
-        if (events.length > 0) {
-            console.log('First event date:', events[0].date);
-            console.log('Last event date:', events[events.length - 1].date);
-        }
         return events;
     "#;
 
     let result = client.execute(extraction_script, vec![]).await?;
 
-    // Parse the JSON result into EconomicEvent structs
     if let Some(events_array) = result.as_array() {
         let mut events = Vec::new();
         for event_value in events_array {
             if let Some(event_obj) = event_value.as_object() {
                 let event = EconomicEvent {
-                    country: event_obj.get("country").and_then(|v| v.as_str()).unwrap_or("").to_string(),
-                    date: event_obj.get("date").and_then(|v| v.as_str()).unwrap_or("").to_string(),
-                    time: event_obj.get("time").and_then(|v| v.as_str()).unwrap_or("").to_string(),
-                    event: event_obj.get("event").and_then(|v| v.as_str()).unwrap_or("").to_string(),
-                    actual: event_obj.get("actual").and_then(|v| v.as_str()).unwrap_or("").to_string(),
-                    forecast: event_obj.get("forecast").and_then(|v| v.as_str()).unwrap_or("").to_string(),
-                    previous: event_obj.get("previous").and_then(|v| v.as_str()).unwrap_or("").to_string(),
-                    importance: event_obj.get("importance").and_then(|v| v.as_str()).unwrap_or("").to_string(),
-                    description: event_obj.get("description").and_then(|v| v.as_str()).unwrap_or("").to_string(),
+                    country: event_obj
+                        .get("country")
+                        .and_then(|v| v.as_str())
+                        .unwrap_or("")
+                        .to_string(),
+                    date: event_obj
+                        .get("date")
+                        .and_then(|v| v.as_str())
+                        .unwrap_or("")
+                        .to_string(),
+                    time: event_obj
+                        .get("time")
+                        .and_then(|v| v.as_str())
+                        .unwrap_or("")
+                        .to_string(),
+                    event: event_obj
+                        .get("event")
+                        .and_then(|v| v.as_str())
+                        .unwrap_or("")
+                        .to_string(),
+                    actual: event_obj
+                        .get("actual")
+                        .and_then(|v| v.as_str())
+                        .unwrap_or("")
+                        .to_string(),
+                    forecast: event_obj
+                        .get("forecast")
+                        .and_then(|v| v.as_str())
+                        .unwrap_or("")
+                        .to_string(),
+                    previous: event_obj
+                        .get("previous")
+                        .and_then(|v| v.as_str())
+                        .unwrap_or("")
+                        .to_string(),
+                    importance: event_obj
+                        .get("importance")
+                        .and_then(|v| v.as_str())
+                        .unwrap_or("")
+                        .to_string(),
+                    description: event_obj
+                        .get("description")
+                        .and_then(|v| v.as_str())
+                        .unwrap_or("")
+                        .to_string(),
                 };
                 events.push(event);
             }
         }
 
-        println!("Extracted {} events (3 YELLOW stars ONLY) via JavaScript", events.len());
+        println!("Extracted {} events (3 YELLOW stars ONLY)", events.len());
 
-        // Debug: show date range of extracted events
-        if !events.is_empty() {
-            let dates: Vec<&str> = events.iter().map(|e| e.date.as_str()).filter(|d| !d.is_empty()).collect();
-            if !dates.is_empty() {
-                let min_date = dates.iter().min().unwrap_or(&"N/A");
-                let max_date = dates.iter().max().unwrap_or(&"N/A");
-                println!("📅 Extracted date range: {} to {}", min_date, max_date);
-                // Show sample of dates for debugging
-                println!("Sample dates:");
-                for (i, date) in dates.iter().take(5).enumerate() {
-                    println!("  {}. {}", i + 1, date);
-                }
-            } else {
-                println!("❌ No valid dates found in extracted events");
-            }
-        }
-
         return Ok(events);
     }
     Ok(vec![])
 }
 
-async fn set_date_range(client: &fantoccini::Client, start: &str, end: &str) -> anyhow::Result<()> {
-    let set_dates_script = format!(r#"
+async fn set_date_range(
+    client: &fantoccini::Client,
+    start: &str,
+    end: &str,
+) -> anyhow::Result<()> {
+    let set_dates_script = format!(
+        r#"
         (() => {{
             const fromInput = document.querySelector('#dtTeletraderFromDate');
             const toInput = document.querySelector('#dtTeletraderEndDate');
@@ -203,62 +227,22 @@ async fn set_date_range(client: &fantoccini::Client, start: &str, end: &str) ->
             return !!fromInput && !!toInput;
         }})()
-    "#, start, end);
+        "#,
+        start, end
+    );
 
     client.execute(&set_dates_script, vec![]).await?;
-    sleep(Duration::from_millis(1000)).await; // Wait for table to update
+    sleep(Duration::from_millis(1000)).await;
 
-    // Now read the values
-    let from_date_value: String = client.execute(
-        r#"return document.querySelector('#dtTeletraderFromDate')?.value;"#,
-        vec![],
-    ).await?.as_str().unwrap_or_default().to_string();
-
-    let to_date_value: String = client.execute(
-        r#"return document.querySelector('#dtTeletraderEndDate')?.value;"#,
-        vec![],
-    ).await?.as_str().unwrap_or_default().to_string();
-
-    if from_date_value == start && to_date_value == end {
-        println!("  Dates set correctly");
-    } else {
-        println!("  ❌ Date not set correctly. Expected: {}-{}, Got: {}-{}",
-            start, end, from_date_value, to_date_value);
-    }
-
     Ok(())
 }
 
-fn parse_any_date(date: &str) -> Option<NaiveDate> {
-    // Attempt ISO first
-    if let Ok(d) = NaiveDate::parse_from_str(date.trim(), "%Y-%m-%d") {
-        return Some(d);
-    }
-
-    // Convert German → English once
-    let month_map = [
-        ("Januar", "January"), ("Februar", "February"), ("März", "March"),
-        ("April", "April"), ("Mai", "May"), ("Juni", "June"),
-        ("Juli", "July"), ("August", "August"), ("September", "September"),
-        ("Oktober", "October"), ("November", "November"), ("Dezember", "December"),
-    ];
-    let mut english = date.to_string();
-    for (de, en) in month_map {
-        english = english.replace(de, en);
-    }
-
-    // Try two formats max
-    NaiveDate::parse_from_str(&english, "%A, %d. %B %Y")
-        .or_else(|_| NaiveDate::parse_from_str(&english, "%d. %B %Y"))
-        .ok()
+fn parse_date(date: &str) -> Option<NaiveDate> {
+    NaiveDate::parse_from_str(date.trim(), "%Y-%m-%d").ok()
 }
 
 fn calculate_next_start_date(events: &[EconomicEvent]) -> anyhow::Result<String> {
-    let mut dates: Vec<_> = events
-        .iter()
-        .filter_map(|e| parse_any_date(&e.date))
-        .collect();
+    let mut dates: Vec<_> = events.iter().filter_map(|e| parse_date(&e.date)).collect();
 
     if dates.is_empty() {
         return Err(anyhow::anyhow!("No parseable dates found"));
@@ -270,242 +254,220 @@ fn calculate_next_start_date(events: &[EconomicEvent]) -> anyhow::Result<String>
     Ok(next.format("%Y-%m-%d").to_string())
 }
 
-fn extract_month(date: &str) -> Option<String> {
-    parse_any_date(date).map(|d| d.format("%B").to_string())
-}
-
-fn count_valid_times(events: &[EconomicEvent]) -> usize {
-    events.iter().filter(|e| {
-        e.time.len() == 5 &&
-        e.time.as_bytes()[2] == b':' &&
-        e.time[..2].chars().all(|c| c.is_ascii_digit()) &&
-        e.time[3..].chars().all(|c| c.is_ascii_digit())
-    }).count()
-}
-
-fn missing_critical(e: &EconomicEvent) -> bool {
-    e.event.trim().is_empty() || e.time.trim().is_empty()
-}
-
-fn is_complete(e: &EconomicEvent) -> bool {
-    !(e.event.trim().is_empty() ||
-      e.time.trim().is_empty() ||
-      e.country.trim().is_empty()) &&
-    (!e.actual.trim().is_empty() ||
-     !e.forecast.trim().is_empty() ||
-     !e.previous.trim().is_empty())
-}
-
-async fn check_data_consistency(events: &[EconomicEvent]) {
-    println!("\n=== DATA CONSISTENCY CHECKS ===");
-    println!("⏰ Valid time formats: {}/{}", count_valid_times(events), events.len());
-
-    let missing: Vec<_> = events.iter().enumerate()
-        .filter(|(_, e)| missing_critical(e))
-        .collect();
-    if !missing.is_empty() {
-        println!("{} events missing critical fields", missing.len());
-    }
-}
-
-async fn validate_events(events: &[EconomicEvent]) -> anyhow::Result<()> {
-    println!("\n=== EVENT VALIDATION ===");
-
-    if events.is_empty() {
-        println!("❌ ERROR: No events extracted!");
-        return Ok(());
-    }
-
-    println!("📊 Total events: {}", events.len());
-
-    // 1. Description coverage
-    let desc_count = events.iter()
-        .filter(|e| !e.description.trim().is_empty())
-        .count();
-    println!("📝 Events with descriptions: {}/{}", desc_count, events.len());
-
-    // 2. Distributions
-    use std::collections::HashMap;
-    let mut country_dist: HashMap<String, usize> = HashMap::new();
-    let mut month_dist: HashMap<String, usize> = HashMap::new();
-
-    for e in events {
-        *country_dist.entry(e.country.clone()).or_insert(0) += 1;
-        if let Some(month) = extract_month(&e.date) {
-            *month_dist.entry(month).or_insert(0) += 1;
-        }
-    }
-
-    println!("🌍 Country distribution: {:?}", country_dist);
-    println!("📈 Month distribution: {:?}", month_dist);
-
-    // 3. Sample events (first 5)
-    println!("\n🔍 Sample events (first 5):");
-    for event in events.iter().take(5) {
-        println!(
-            "  {} {}: {} - {} (Importance: {})",
-            event.date, event.time, event.country, event.event, event.importance
-        );
-    }
-
-    // 4. Completeness check
-    let complete_count = events.iter().filter(|e| is_complete(e)).count();
-    println!(
-        "✅ Complete events: {}/{}",
-        complete_count,
-        events.len()
-    );
-
-    // 5. Date range
-    let (earliest, latest) = calculate_actual_date_range(events);
-    println!("📅 Actual date range: {} to {}", earliest, latest);
-
-    // Final summary
-    println!("\n=== VALIDATION SUMMARY ===");
-    println!("  • Total events: {}", events.len());
-    println!(
-        "  • Events with descriptions [%]: {}",
-        (desc_count * 100) / events.len().max(1)
-    );
-    println!(
-        "  • Complete events [%]: {}",
-        (complete_count * 100) / events.len().max(1)
-    );
-    println!("  • Date range: {} to {}", earliest, latest);
-
-    Ok(())
-}
-
-fn calculate_actual_date_range(events: &[EconomicEvent]) -> (String, String) {
-    if events.is_empty() {
-        return ("No data".to_string(), "No data".to_string());
-    }
-
-    let mut dates: Vec<NaiveDate> = events
-        .iter()
-        .filter_map(|e| {
-            // Parse German date format "Dienstag, 2. Januar 2024"
-            extract_date_from_german_format(&e.date)
-        })
-        .collect();
-    dates.sort();
-
-    let earliest = dates.first().map(|d| d.format("%Y-%m-%d").to_string())
-        .unwrap_or_else(|| "Unknown".to_string());
-    let latest = dates.last().map(|d| d.format("%Y-%m-%d").to_string())
-        .unwrap_or_else(|| "Unknown".to_string());
-
-    (earliest, latest)
-}
-
-fn extract_date_from_german_format(german_date: &str) -> Option<NaiveDate> {
-    // Map German month names to English
-    let month_map = [
-        ("Januar", "January"),
-        ("Februar", "February"),
-        ("März", "March"),
-        ("April", "April"),
-        ("Mai", "May"),
-        ("Juni", "June"),
-        ("Juli", "July"),
-        ("August", "August"),
-        ("September", "September"),
-        ("Oktober", "October"),
-        ("November", "November"),
-        ("Dezember", "December"),
-    ];
-
-    let mut english_date = german_date.to_string();
-    for (de, en) in &month_map {
-        english_date = english_date.replace(de, en);
-    }
-
-    // Parse "Tuesday, 2. January 2024" format
-    NaiveDate::parse_from_str(&english_date, "%A, %d. %B %Y").ok()
-}
-
-async fn scrape_all_events_with_chunking(
-    client: &fantoccini::Client,
-    start_date: &str,
-    end_date: &str
-) -> anyhow::Result<Vec<EconomicEvent>> {
-    let json_export_now = chrono::Local::now().format("%Y%m%d_%H%M%S");
-    let mut all_events: Vec<EconomicEvent> = Vec::new();
-    let mut current_start = start_date.to_string();
-    let mut attempts = 0;
-    let max_attempts = 300;
-
-    loop {
-        attempts += 1;
-        if attempts > max_attempts {
-            println!("⚠️ Reached maximum attempts ({})", max_attempts);
-            break;
-        }
-
-        println!("🚀 Chunk {}: {} to {}", attempts, current_start, end_date);
-        set_date_range(client, &current_start, end_date).await?;
-        sleep(Duration::from_secs(3)).await;
-
-        let chunk = extract_all_data_via_js(client).await?;
-        if chunk.is_empty() {
-            println!("✅ No more events found. Completed!");
-            break;
-        }
-
-        println!("📊 Chunk {}: {} events (Total: {})",
-            attempts, chunk.len(), all_events.len() + chunk.len());
-        all_events.extend(chunk.clone());
-
-        let next = match calculate_next_start_date(&chunk) {
-            Ok(n) => n,
-            Err(_) => {
-                println!("❌ Could not calculate next start date. Stopping.");
-                break;
-            }
-        };
-
-        if next > end_date.to_string() {
-            println!("✅ Reached end date. Completed!");
-            break;
-        }
-
-        current_start = next;
-        sleep(Duration::from_secs(2)).await;
-        export_chunk(&chunk, attempts, json_export_now.to_string().clone()).await?;
-    }
-
-    // Remove duplicates
-    let initial_count = all_events.len();
-    all_events.sort_by(|a, b| {
-        a.date.cmp(&b.date)
-            .then(a.time.cmp(&b.time))
-            .then(a.event.cmp(&b.event))
-    });
-    all_events.dedup_by(|a, b| {
-        a.date == b.date && a.time == b.time && a.event == b.event
-    });
-    println!("🎯 FINAL: Collected {} unique events (removed {} duplicates)",
-        all_events.len(), initial_count - all_events.len());
-
-    Ok(all_events)
-}
-
-async fn export_chunk(chunk: &[EconomicEvent], n: usize, ts: String) -> anyhow::Result<()> {
-    let filename = format!("economic_events_{}_chunk_{}.json", ts, n);
-    let json = serde_json::to_string_pretty(chunk)?;
-    tokio::fs::write(&filename, json).await?;
-    println!("  Chunk data exported to: {}", filename);
-    Ok(())
-}
+/// Scan the economic_events directory for existing chunks
+async fn scan_existing_chunks() -> anyhow::Result<Vec<ChunkInfo>> {
+    let events_dir = PathBuf::from("economic_events");
+
+    // Create directory if it doesn't exist
+    if !events_dir.exists() {
+        fs::create_dir_all(&events_dir).await?;
+        println!("📁 Created economic_events directory");
+        return Ok(vec![]);
+    }
+
+    let mut chunks = Vec::new();
+    let mut entries = fs::read_dir(&events_dir).await?;
+
+    while let Some(entry) = entries.next_entry().await? {
+        let path = entry.path();
+        if path.extension().and_then(|s| s.to_str()) == Some("json") {
+            if let Some(filename) = path.file_stem().and_then(|s| s.to_str()) {
+                // Parse filename: chunk_{startdate}_{enddate}.json
+                if let Some(dates) = filename.strip_prefix("chunk_") {
+                    let parts: Vec<&str> = dates.split('_').collect();
+                    if parts.len() == 2 {
+                        // Load and count events
+                        if let Ok(content) = fs::read_to_string(&path).await {
+                            if let Ok(events) = serde_json::from_str::<Vec<EconomicEvent>>(&content) {
+                                chunks.push(ChunkInfo {
+                                    start_date: parts[0].to_string(),
+                                    end_date: parts[1].to_string(),
+                                    path: path.clone(),
+                                    event_count: events.len(),
+                                });
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    chunks.sort_by(|a, b| a.start_date.cmp(&b.start_date));
+
+    if !chunks.is_empty() {
+        println!("\n📊 Found {} existing chunks:", chunks.len());
+        for chunk in &chunks {
+            println!("{} to {} ({} events)",
+                chunk.start_date, chunk.end_date, chunk.event_count);
+        }
+    } else {
+        println!("📭 No existing chunks found");
+    }
+
+    Ok(chunks)
+}
+
+/// Calculate target end date: first day of month, 3 months from now
+fn calculate_target_end_date() -> String {
+    let now = chrono::Local::now().naive_local().date();
+    let three_months_ahead = if now.month() + 3 > 12 {
+        NaiveDate::from_ymd_opt(now.year() + 1, (now.month() + 3) % 12, 1)
+    } else {
+        NaiveDate::from_ymd_opt(now.year(), now.month() + 3, 1)
+    }.unwrap();
+
+    three_months_ahead.format("%Y-%m-%d").to_string()
+}
+
+/// Determine what date range needs to be scraped based on existing data
+fn determine_scrape_range(chunks: &[ChunkInfo], target_end: &str) -> Option<(String, String)> {
+    let now = chrono::Local::now().naive_local().date().format("%Y-%m-%d").to_string();
+
+    if chunks.is_empty() {
+        // No data exists, start from beginning
+        println!("📭 No existing data - scraping from 2007-02-13 to {}", target_end);
+        return Some(("2007-02-13".to_string(), target_end.to_string()));
+    }
+
+    // Find the latest date in existing chunks
+    let latest_chunk_date = chunks.iter()
+        .map(|c| &c.end_date)
+        .max()
+        .cloned()
+        .unwrap_or_else(|| "2007-02-13".to_string());
+
+    println!("📊 Latest existing data: {}", latest_chunk_date);
+
+    if latest_chunk_date >= now {
+        // Data is ahead of current date - update from now to target
+        println!("🔄 Data exists beyond today - updating from {} to {}", now, target_end);
+        Some((now, target_end.to_string()))
+    } else {
+        // Data is behind - continue from where it left off
+        let next_start = parse_date(&latest_chunk_date)
+            .and_then(|d| d.succ_opt())
+            .map(|d| d.format("%Y-%m-%d").to_string())
+            .unwrap_or_else(|| latest_chunk_date.clone());
+        println!("➡️ Continuing from {} to {}", next_start, target_end);
+        Some((next_start, target_end.to_string()))
+    }
+}
+
+/// Save a chunk to disk
+async fn save_chunk(events: &[EconomicEvent], start: &str, end: &str) -> anyhow::Result<()> {
+    let events_dir = PathBuf::from("economic_events");
+    fs::create_dir_all(&events_dir).await?;
+
+    let filename = format!("chunk_{}_{}.json", start, end);
+    let filepath = events_dir.join(&filename);
+
+    let json = serde_json::to_string_pretty(events)?;
+    fs::write(&filepath, json).await?;
+
+    println!("💾 Saved chunk: {} ({} events)", filename, events.len());
+    Ok(())
+}
+
+/// Load all events from existing chunks
+async fn load_all_events(chunks: &[ChunkInfo]) -> anyhow::Result<Vec<EconomicEvent>> {
+    let mut all_events = Vec::new();
+    for chunk in chunks {
+        if let Ok(content) = fs::read_to_string(&chunk.path).await {
+            if let Ok(events) = serde_json::from_str::<Vec<EconomicEvent>>(&content) {
+                all_events.extend(events);
+            }
+        }
+    }
+    println!("📥 Loaded {} events from existing chunks", all_events.len());
+    Ok(all_events)
+}
+
+/// Scrape events for a specific date range and save chunks immediately
+async fn scrape_date_range(
+    client: &fantoccini::Client,
+    start: &str,
+    end: &str,
+) -> anyhow::Result<()> {
+    println!("\n🎯 Scraping range: {} to {}", start, end);
+
+    let mut current_start = start.to_string();
+    let mut chunk_number = 0;
+
+    loop {
+        set_date_range(client, &current_start, end).await?;
+        sleep(Duration::from_secs(3)).await;
+
+        let events = extract_all_data_via_js(client).await?;
+        if events.is_empty() {
+            println!("  ✅ No more events in this range");
+            break;
+        }
+
+        chunk_number += 1;
+        println!("  📦 Fetched {} events", events.len());
+
+        // Calculate actual date range of this chunk
+        let chunk_start = events.iter()
+            .filter_map(|e| parse_date(&e.date))
+            .min()
+            .map(|d| d.format("%Y-%m-%d").to_string())
+            .unwrap_or_else(|| current_start.clone());
+
+        let chunk_end = events.iter()
+            .filter_map(|e| parse_date(&e.date))
+            .max()
+            .map(|d| d.format("%Y-%m-%d").to_string())
+            .unwrap_or_else(|| end.to_string());
+
+        // Save chunk immediately
+        save_chunk(&events, &chunk_start, &chunk_end).await?;
+
+        let next = match calculate_next_start_date(&events) {
+            Ok(n) => n,
+            Err(_) => {
+                println!("  ⚠️ Cannot calculate next date, stopping");
+                break;
+            }
+        };
+
+        if next > end.to_string() {
+            println!("  ✅ Reached end of range");
+            break;
+        }
+
+        current_start = next;
+        sleep(Duration::from_secs(2)).await;
+    }
+
+    Ok(())
+}
+
+/// Main scraping logic with persistent storage
+async fn scrape_with_persistence(
+    client: &fantoccini::Client,
+) -> anyhow::Result<()> {
+    // Calculate target end date (3 months ahead, 1st of month)
+    let target_end = calculate_target_end_date();
+    println!("🎯 Target end date: {}", target_end);
+
+    // Scan for existing chunks
+    let existing_chunks = scan_existing_chunks().await?;
+
+    // Determine what range needs to be scraped
+    let scrape_range = determine_scrape_range(&existing_chunks, &target_end);
+
+    if let Some((start, end)) = scrape_range {
+        // Scrape the needed range (saves chunks automatically)
+        scrape_date_range(client, &start, &end).await?;
+        println!("\n✅ Scraping complete!");
+    } else {
+        println!("✅ All data is up to date!");
+    }
+
+    Ok(())
+}
@@ -515,20 +477,20 @@ async fn main() -> anyhow::Result<()> {
     let mut chromedriver = start_chromedriver(port);
     sleep(Duration::from_secs(1)).await;
 
-    // Chrome options
     let caps_value = serde_json::json!({
         "goog:chromeOptions": {
             "args": [
                 "--disable-gpu",
                 "--disable-notifications",
                 "--disable-popup-blocking",
-                "--disable-blink-features=AutomationControlled"
+                "--disable-blink-features=AutomationControlled",
             ],
             "excludeSwitches": ["enable-automation"]
         }
     });
 
-    let caps_map: Map<String, Value> = caps_value.as_object()
+    let caps_map: Map<String, Value> = caps_value
+        .as_object()
         .expect("Capabilities should be a JSON object")
         .clone();
@@ -537,48 +499,60 @@ async fn main() -> anyhow::Result<()> {
         .connect(&format!("http://localhost:{}", port))
         .await?;
 
-    // Setup graceful shutdown on Ctrl+C
+    // Setup graceful shutdown
     let shutdown_client = client.clone();
-    let shutdown_handle = tokio::spawn(async move {
-        signal::ctrl_c().await.expect("Failed to listen for ctrl+c");
+    tokio::spawn(async move {
+        signal::ctrl_c()
+            .await
+            .expect("Failed to listen for ctrl+c");
         println!("\nCtrl+C received, shutting down...");
         shutdown_client.close().await.ok();
-        chromedriver.kill().ok();
        std::process::exit(0);
     });
 
-    // Go to page
+    // Navigate to page
     let url = "https://www.finanzen.net/termine/wirtschaftsdaten/";
     client.goto(url).await?;
 
-    // Dismiss overlays
     dismiss_overlays(&client).await?;
 
-    // Click the high importance tab
-    if let Ok(tab) = client.find(Locator::Css(r#"div[data-sg-tab-item="teletrader-dates-three-stars"]"#)).await {
+    // Click high importance tab
+    if let Ok(tab) = client
+        .find(Locator::Css(
+            r#"div[data-sg-tab-item="teletrader-dates-three-stars"]"#,
+        ))
+        .await
+    {
         tab.click().await?;
         println!("High importance tab clicked");
         sleep(Duration::from_secs(2)).await;
+    } else {
+        println!("High importance tab not found");
     }
 
-    // Use chunking to extract all events across the entire date range
-    let events = scrape_all_events_with_chunking(&client, "2007-02-13", "2025-12-01").await?;
+    // Run persistent scraping
+    scrape_with_persistence(&client).await?;
 
-    // Run validation suite
-    validate_events(&events).await?;
-    check_data_consistency(&events).await;
+    // Load and display summary
+    let chunks = scan_existing_chunks().await?;
+    let all_events = load_all_events(&chunks).await?;
 
-    // Export for further analysis
-    if let Ok(json) = serde_json::to_string_pretty(&events) {
-        let filename = format!("economic_events_{}_combined.json", chrono::Local::now().format("%Y%m%d_%H%M%S"));
-        tokio::fs::write(&filename, json).await?;
-        println!("  Combined data exported to: {}", filename);
+    println!("\n📊 FINAL SUMMARY:");
+    println!("  • Total chunks: {}", chunks.len());
+    println!("  • Total events: {}", all_events.len());
+
+    if !chunks.is_empty() {
+        let dates: Vec<String> = all_events.iter()
+            .filter_map(|e| parse_date(&e.date))
+            .map(|d| d.format("%Y-%m-%d").to_string())
+            .collect();
+        if !dates.is_empty() {
+            let min = dates.iter().min().unwrap();
+            let max = dates.iter().max().unwrap();
+            println!("  • Date range: {} to {}", min, max);
+        }
     }
 
-    // Wait for Ctrl+C
-    shutdown_handle.await.ok();
+    client.close().await?;
+    chromedriver.kill()?;
 
     Ok(())
 }
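
Note: the widened derive list on EconomicEvent (PartialEq, Eq, Hash) is not used anywhere in this commit, while the old per-run dedup pass in scrape_all_events_with_chunking was removed; chunk files can overlap whenever a range is re-scraped (the "data exists beyond today" branch restarts from the current date). A minimal downstream sketch of set-based deduplication, assuming the economic_events/ layout introduced here; dedup_events is hypothetical and not part of this commit:

    use std::collections::HashSet;

    /// Hypothetical downstream helper: drop events that appear in more than
    /// one chunk file, keeping the first occurrence in load order.
    /// Relies on the Clone + Eq + Hash derives added in this commit.
    fn dedup_events(all_events: Vec<EconomicEvent>) -> Vec<EconomicEvent> {
        let mut seen: HashSet<EconomicEvent> = HashSet::new();
        all_events
            .into_iter()
            .filter(|e| seen.insert(e.clone()))
            .collect()
    }

Running this over the output of load_all_events approximates the uniqueness the removed code enforced per run (which keyed only on date, time, and event).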