persistent scraping
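Replace the one-shot chunked scrape with persistent, resumable storage: each scraped chunk is saved immediately to economic_events/chunk_{start}_{end}.json, existing chunks are scanned on startup, and scraping resumes from the day after the latest saved date up to a target end date (the first of the month three months ahead). The extraction script now emits ISO YYYY-MM-DD dates, so the Rust side only needs a plain parse_date; the German-format fallback parser and the old validation/export passes are removed.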
src/main.rs
@@ -1,11 +1,15 @@
-use chrono::{NaiveDate};
+use chrono::{NaiveDate, Datelike};
 use fantoccini::{ClientBuilder, Locator};
-use serde::Serialize;
+use serde::{Deserialize, Serialize};
 use serde_json::{Map, Value};
-use std::{process::Command};
-use tokio::{time::{Duration, sleep}, signal};
+use std::{path::PathBuf, process::Command};
+use tokio::{
+    fs,
+    signal,
+    time::{sleep, Duration},
+};
 
-#[derive(Debug, Serialize, Clone)]
+#[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq, Hash)]
 struct EconomicEvent {
     country: String,
     date: String,
@@ -18,6 +22,14 @@ struct EconomicEvent {
     description: String,
 }
 
+#[derive(Debug)]
+struct ChunkInfo {
+    start_date: String,
+    end_date: String,
+    path: PathBuf,
+    event_count: usize,
+}
+
 fn start_chromedriver(port: u16) -> std::process::Child {
     Command::new("chromedriver-win64/chromedriver.exe")
         .args(&[format!("--port={}", port)])
@@ -26,10 +38,10 @@ fn start_chromedriver(port: u16) -> std::process::Child {
 }
 
 async fn dismiss_overlays(client: &fantoccini::Client) -> anyhow::Result<()> {
-    // Single strategy: wait for and remove iframe
     for _ in 0..10 {
-        let removed: bool = client.execute(
-            r#"(() => {
+        let removed: bool = client
+            .execute(
+                r#"(() => {
             const iframe = document.querySelector('iframe[title="Contentpass First Layer"]');
             if (iframe && iframe.parentNode) {
                 iframe.parentNode.removeChild(iframe);
@@ -37,18 +49,23 @@ async fn dismiss_overlays(client: &fantoccini::Client) -> anyhow::Result<()> {
             }
             return false;
         })()"#,
-            vec![]
-        ).await?.as_bool().unwrap_or(false);
-
-        if removed { break; }
+                vec![],
+            )
+            .await?
+            .as_bool()
+            .unwrap_or(false);
+
+        if removed {
+            break;
+        }
         sleep(Duration::from_millis(500)).await;
     }
     Ok(())
 }
 
-async fn extract_all_data_via_js(client: &fantoccini::Client) -> anyhow::Result<Vec<EconomicEvent>> {
-    println!("Extracting ONLY 3-star events via JavaScript...");
-
+async fn extract_all_data_via_js(
+    client: &fantoccini::Client,
+) -> anyhow::Result<Vec<EconomicEvent>> {
     let extraction_script = r#"
        const events = [];
        let currentDate = '';
@@ -60,29 +77,23 @@ async fn extract_all_data_via_js(client: &fantoccini::Client) -> anyhow::Result<
            const cells = row.querySelectorAll('td');
 
            if (cells.length === 1 && cells[0].colSpan === 9) {
-                // This is a date header row - extract and parse the date
                const dateText = cells[0].textContent.trim();
-                console.log('Found date header:', dateText);
 
-                // Convert German date to ISO format (YYYY-MM-DD)
                const monthMap = {
                    'Januar': '01', 'Februar': '02', 'März': '03', 'April': '04',
                    'Mai': '05', 'Juni': '06', 'Juli': '07', 'August': '08',
                    'September': '09', 'Oktober': '10', 'November': '11', 'Dezember': '12'
                };
 
-                // Extract date parts from German format "Montag, 30. April 2007"
-                const dateParts = dateText.match(/(\d{1,2})\.\s+([a-zA-Zäöüß]+)\s+(\d{4})/);
+                const dateParts = dateText.match(/(\d{1,2})\.\s+([a-zA-ZäöüßÄÖÜ]+)\s+(\d{4})/);
                if (dateParts) {
                    const day = dateParts[1].padStart(2, '0');
                    const germanMonth = dateParts[2];
                    const year = dateParts[3];
                    const month = monthMap[germanMonth] || '01';
                    currentDate = `${year}-${month}-${day}`;
-                    console.log('Converted date:', currentDate, 'from:', dateText);
                } else {
-                    console.log('Failed to parse date:', dateText);
-                    currentDate = ''; // Reset if parsing fails
+                    currentDate = '';
                }
                continue;
            }
@@ -94,11 +105,9 @@ async fn extract_all_data_via_js(client: &fantoccini::Client) -> anyhow::Result<
 
            if (!time || !country || !eventName) continue;
 
-            // Count ONLY YELLOW stars (high importance)
            const importanceCell = cells[3];
            const yellowStarCount = importanceCell?.querySelectorAll('.icon--star.font-color-yellow').length || 0;
 
-            // STRICT FILTER: Only include events with EXACTLY 3 YELLOW stars
            if (yellowStarCount === 3) {
                let description = '';
                if (i + 1 < rows.length) {
@@ -114,7 +123,7 @@ async fn extract_all_data_via_js(client: &fantoccini::Client) -> anyhow::Result<
 
                events.push({
                    country: country,
-                    date: currentDate, // Now using ISO format date
+                    date: currentDate,
                    time: time,
                    event: eventName,
                    actual: cells[7]?.textContent?.trim() || '',
@@ -127,64 +136,79 @@ async fn extract_all_data_via_js(client: &fantoccini::Client) -> anyhow::Result<
            }
        }
 
        console.log('Total events extracted:', events.length);
        if (events.length > 0) {
            console.log('First event date:', events[0].date);
            console.log('Last event date:', events[events.length - 1].date);
        }
 
        return events;
    "#;
 
     let result = client.execute(extraction_script, vec![]).await?;
 
-    // Parse the JSON result into EconomicEvent structs
     if let Some(events_array) = result.as_array() {
         let mut events = Vec::new();
         for event_value in events_array {
             if let Some(event_obj) = event_value.as_object() {
                 let event = EconomicEvent {
-                    country: event_obj.get("country").and_then(|v| v.as_str()).unwrap_or("").to_string(),
-                    date: event_obj.get("date").and_then(|v| v.as_str()).unwrap_or("").to_string(),
-                    time: event_obj.get("time").and_then(|v| v.as_str()).unwrap_or("").to_string(),
-                    event: event_obj.get("event").and_then(|v| v.as_str()).unwrap_or("").to_string(),
-                    actual: event_obj.get("actual").and_then(|v| v.as_str()).unwrap_or("").to_string(),
-                    forecast: event_obj.get("forecast").and_then(|v| v.as_str()).unwrap_or("").to_string(),
-                    previous: event_obj.get("previous").and_then(|v| v.as_str()).unwrap_or("").to_string(),
-                    importance: event_obj.get("importance").and_then(|v| v.as_str()).unwrap_or("").to_string(),
-                    description: event_obj.get("description").and_then(|v| v.as_str()).unwrap_or("").to_string(),
+                    country: event_obj
+                        .get("country")
+                        .and_then(|v| v.as_str())
+                        .unwrap_or("")
+                        .to_string(),
+                    date: event_obj
+                        .get("date")
+                        .and_then(|v| v.as_str())
+                        .unwrap_or("")
+                        .to_string(),
+                    time: event_obj
+                        .get("time")
+                        .and_then(|v| v.as_str())
+                        .unwrap_or("")
+                        .to_string(),
+                    event: event_obj
+                        .get("event")
+                        .and_then(|v| v.as_str())
+                        .unwrap_or("")
+                        .to_string(),
+                    actual: event_obj
+                        .get("actual")
+                        .and_then(|v| v.as_str())
+                        .unwrap_or("")
+                        .to_string(),
+                    forecast: event_obj
+                        .get("forecast")
+                        .and_then(|v| v.as_str())
+                        .unwrap_or("")
+                        .to_string(),
+                    previous: event_obj
+                        .get("previous")
+                        .and_then(|v| v.as_str())
+                        .unwrap_or("")
+                        .to_string(),
+                    importance: event_obj
+                        .get("importance")
+                        .and_then(|v| v.as_str())
+                        .unwrap_or("")
+                        .to_string(),
+                    description: event_obj
+                        .get("description")
+                        .and_then(|v| v.as_str())
+                        .unwrap_or("")
+                        .to_string(),
                 };
                 events.push(event);
             }
         }
-        println!("Extracted {} events (3 YELLOW stars ONLY) via JavaScript", events.len());
-
-        // Debug: show date range of extracted events
-        if !events.is_empty() {
-            let dates: Vec<&str> = events.iter().map(|e| e.date.as_str()).filter(|d| !d.is_empty()).collect();
-            if !dates.is_empty() {
-                let min_date = dates.iter().min().unwrap_or(&"N/A");
-                let max_date = dates.iter().max().unwrap_or(&"N/A");
-                println!("📅 Extracted date range: {} to {}", min_date, max_date);
-
-                // Show sample of dates for debugging
-                println!("Sample dates:");
-                for (i, date) in dates.iter().take(5).enumerate() {
-                    println!("   {}. {}", i + 1, date);
-                }
-            } else {
-                println!("❌ No valid dates found in extracted events");
-            }
-        }
-
+        println!("Extracted {} events (3 YELLOW stars ONLY)", events.len());
         return Ok(events);
     }
 
     Ok(vec![])
 }
 
-async fn set_date_range(client: &fantoccini::Client, start: &str, end: &str) -> anyhow::Result<()> {
-    let set_dates_script = format!(r#"
+async fn set_date_range(
+    client: &fantoccini::Client,
+    start: &str,
+    end: &str,
+) -> anyhow::Result<()> {
+    let set_dates_script = format!(
+        r#"
    (() => {{
        const fromInput = document.querySelector('#dtTeletraderFromDate');
        const toInput = document.querySelector('#dtTeletraderEndDate');
@@ -203,62 +227,22 @@ async fn set_date_range(client: &fantoccini::Client, start: &str, end: &str) ->
 
        return !!fromInput && !!toInput;
    }})()
-    "#, start, end);
+    "#,
+        start, end
+    );
 
     client.execute(&set_dates_script, vec![]).await?;
-    sleep(Duration::from_millis(1000)).await; // Wait for table to update
+    sleep(Duration::from_millis(1000)).await;
 
-    // Now read the values
-    let from_date_value: String = client.execute(
-        r#"return document.querySelector('#dtTeletraderFromDate')?.value;"#,
-        vec![],
-    ).await?.as_str().unwrap_or_default().to_string();
-
-    let to_date_value: String = client.execute(
-        r#"return document.querySelector('#dtTeletraderEndDate')?.value;"#,
-        vec![],
-    ).await?.as_str().unwrap_or_default().to_string();
-
-    if from_date_value == start && to_date_value == end {
-        println!("   Dates set correctly");
-    } else {
-        println!("   ❌ Date not set correctly. Expected: {}-{}, Got: {}-{}",
-            start, end, from_date_value, to_date_value);
-    }
-
     Ok(())
 }
 
-fn parse_any_date(date: &str) -> Option<NaiveDate> {
-    // Attempt ISO first
-    if let Ok(d) = NaiveDate::parse_from_str(date.trim(), "%Y-%m-%d") {
-        return Some(d);
-    }
-
-    // Convert German → English once
-    let month_map = [
-        ("Januar", "January"), ("Februar", "February"), ("März", "March"),
-        ("April", "April"), ("Mai", "May"), ("Juni", "June"),
-        ("Juli", "July"), ("August", "August"), ("September", "September"),
-        ("Oktober", "October"), ("November", "November"), ("Dezember", "December"),
-    ];
-
-    let mut english = date.to_string();
-    for (de, en) in month_map {
-        english = english.replace(de, en);
-    }
-
-    // Try two formats max
-    NaiveDate::parse_from_str(&english, "%A, %d. %B %Y")
-        .or_else(|_| NaiveDate::parse_from_str(&english, "%d. %B %Y"))
-        .ok()
+fn parse_date(date: &str) -> Option<NaiveDate> {
+    NaiveDate::parse_from_str(date.trim(), "%Y-%m-%d").ok()
 }
 
 fn calculate_next_start_date(events: &[EconomicEvent]) -> anyhow::Result<String> {
-    let mut dates: Vec<_> = events
-        .iter()
-        .filter_map(|e| parse_any_date(&e.date))
-        .collect();
+    let mut dates: Vec<_> = events.iter().filter_map(|e| parse_date(&e.date)).collect();
 
     if dates.is_empty() {
         return Err(anyhow::anyhow!("No parseable dates found"));
@@ -270,242 +254,220 @@ fn calculate_next_start_date(events: &[EconomicEvent]) -> anyhow::Result<String>
     Ok(next.format("%Y-%m-%d").to_string())
 }
 
-fn extract_month(date: &str) -> Option<String> {
-    parse_any_date(date).map(|d| d.format("%B").to_string())
-}
-
-fn count_valid_times(events: &[EconomicEvent]) -> usize {
-    events.iter().filter(|e| {
-        e.time.len() == 5 &&
-        e.time.as_bytes()[2] == b':' &&
-        e.time[..2].chars().all(|c| c.is_ascii_digit()) &&
-        e.time[3..].chars().all(|c| c.is_ascii_digit())
-    }).count()
-}
-
-fn missing_critical(e: &EconomicEvent) -> bool {
-    e.event.trim().is_empty() || e.time.trim().is_empty()
-}
-
-fn is_complete(e: &EconomicEvent) -> bool {
-    !(e.event.trim().is_empty() ||
-      e.time.trim().is_empty() ||
-      e.country.trim().is_empty()) &&
-    (!e.actual.trim().is_empty() ||
-     !e.forecast.trim().is_empty() ||
-     !e.previous.trim().is_empty())
-}
-
-async fn check_data_consistency(events: &[EconomicEvent]) {
-    println!("\n=== DATA CONSISTENCY CHECKS ===");
-
-    println!("⏰ Valid time formats: {}/{}", count_valid_times(events), events.len());
-
-    let missing: Vec<_> = events.iter().enumerate()
-        .filter(|(_, e)| missing_critical(e))
-        .collect();
-
-    if !missing.is_empty() {
-        println!("❌ {} events missing critical fields", missing.len());
-    }
-}
-
-async fn validate_events(events: &[EconomicEvent]) -> anyhow::Result<()> {
-    println!("\n=== EVENT VALIDATION ===");
-
-    if events.is_empty() {
-        println!("❌ ERROR: No events extracted!");
-        return Ok(());
-    }
-
-    println!("📊 Total events: {}", events.len());
-
-    // 1. Description coverage
-    let desc_count = events.iter()
-        .filter(|e| !e.description.trim().is_empty())
-        .count();
-
-    println!("📝 Events with descriptions: {}/{}", desc_count, events.len());
-
-    // 2. Distributions
-    use std::collections::HashMap;
-    let mut country_dist: HashMap<String, usize> = HashMap::new();
-    let mut month_dist: HashMap<String, usize> = HashMap::new();
-
-    for e in events {
-        *country_dist.entry(e.country.clone()).or_insert(0) += 1;
-
-        if let Some(month) = extract_month(&e.date) {
-            *month_dist.entry(month).or_insert(0) += 1;
-        }
-    }
-
-    println!("🌍 Country distribution: {:?}", country_dist);
-    println!("📈 Month distribution: {:?}", month_dist);
-
-    // 3. Sample events (first 5)
-    println!("\n🔍 Sample events (first 5):");
-    for event in events.iter().take(5) {
-        println!(
-            "   • {} {}: {} - {} (Importance: {})",
-            event.date, event.time, event.country, event.event, event.importance
-        );
-    }
-
-    // 4. Completeness check
-    let complete_count = events.iter().filter(|e| is_complete(e)).count();
-    println!(
-        "✅ Complete events: {}/{}",
-        complete_count,
-        events.len()
-    );
-
-    // 5. Date range
-    let (earliest, latest) = calculate_actual_date_range(events);
-    println!("📅 Actual date range: {} to {}", earliest, latest);
-
-    // Final summary
-    println!("\n=== VALIDATION SUMMARY ===");
-    println!("   • Total events: {}", events.len());
-    println!(
-        "   • Events with descriptions [%]: {}",
-        (desc_count * 100) / events.len().max(1)
-    );
-    println!(
-        "   • Complete events [%]: {}",
-        (complete_count * 100) / events.len().max(1)
-    );
-    println!("   • Date range: {} to {}", earliest, latest);
-
-    Ok(())
-}
-
+/// Scan the economic_events directory for existing chunks
+async fn scan_existing_chunks() -> anyhow::Result<Vec<ChunkInfo>> {
+    let events_dir = PathBuf::from("economic_events");
+
+    // Create directory if it doesn't exist
+    if !events_dir.exists() {
+        fs::create_dir_all(&events_dir).await?;
+        println!("📁 Created economic_events directory");
+        return Ok(vec![]);
+    }
+
+    let mut chunks = Vec::new();
+    let mut entries = fs::read_dir(&events_dir).await?;
+
+    while let Some(entry) = entries.next_entry().await? {
+        let path = entry.path();
+        if path.extension().and_then(|s| s.to_str()) == Some("json") {
+            if let Some(filename) = path.file_stem().and_then(|s| s.to_str()) {
+                // Parse filename: chunk_{startdate}_{enddate}.json
+                if let Some(dates) = filename.strip_prefix("chunk_") {
+                    let parts: Vec<&str> = dates.split('_').collect();
+                    if parts.len() == 2 {
+                        // Load and count events
+                        if let Ok(content) = fs::read_to_string(&path).await {
+                            if let Ok(events) = serde_json::from_str::<Vec<EconomicEvent>>(&content) {
+                                chunks.push(ChunkInfo {
+                                    start_date: parts[0].to_string(),
+                                    end_date: parts[1].to_string(),
+                                    path: path.clone(),
+                                    event_count: events.len(),
+                                });
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    chunks.sort_by(|a, b| a.start_date.cmp(&b.start_date));
+
+    if !chunks.is_empty() {
+        println!("\n📊 Found {} existing chunks:", chunks.len());
+        for chunk in &chunks {
+            println!("   • {} to {} ({} events)",
+                chunk.start_date, chunk.end_date, chunk.event_count);
+        }
+    } else {
+        println!("📭 No existing chunks found");
+    }
+
+    Ok(chunks)
+}
+
+/// Calculate target end date: first day of month, 3 months from now
+fn calculate_target_end_date() -> String {
+    let now = chrono::Local::now().naive_local().date();
+    let three_months_ahead = if now.month() + 3 > 12 {
+        NaiveDate::from_ymd_opt(now.year() + 1, (now.month() + 3) % 12, 1)
+    } else {
+        NaiveDate::from_ymd_opt(now.year(), now.month() + 3, 1)
+    }.unwrap();
+
+    three_months_ahead.format("%Y-%m-%d").to_string()
+}
+
+/// Determine what date range needs to be scraped based on existing data
+fn determine_scrape_range(chunks: &[ChunkInfo], target_end: &str) -> Option<(String, String)> {
+    let now = chrono::Local::now().naive_local().date().format("%Y-%m-%d").to_string();
+
+    if chunks.is_empty() {
+        // No data exists, start from beginning
+        println!("📭 No existing data - scraping from 2007-02-13 to {}", target_end);
+        return Some(("2007-02-13".to_string(), target_end.to_string()));
+    }
+
+    // Find the latest date in existing chunks
+    let latest_chunk_date = chunks.iter()
+        .map(|c| &c.end_date)
+        .max()
+        .cloned()
+        .unwrap_or_else(|| "2007-02-13".to_string());
+
+    println!("📊 Latest existing data: {}", latest_chunk_date);
+
+    if latest_chunk_date >= now {
+        // Data is ahead of current date - update from now to target
+        println!("🔄 Data exists beyond today - updating from {} to {}", now, target_end);
+        Some((now, target_end.to_string()))
+    } else {
+        // Data is behind - continue from where it left off
+        let next_start = parse_date(&latest_chunk_date)
+            .and_then(|d| d.succ_opt())
+            .map(|d| d.format("%Y-%m-%d").to_string())
+            .unwrap_or_else(|| latest_chunk_date.clone());
+
+        println!("➡️ Continuing from {} to {}", next_start, target_end);
+        Some((next_start, target_end.to_string()))
+    }
+}
+
+/// Save a chunk to disk
+async fn save_chunk(events: &[EconomicEvent], start: &str, end: &str) -> anyhow::Result<()> {
+    let events_dir = PathBuf::from("economic_events");
+    fs::create_dir_all(&events_dir).await?;
+
+    let filename = format!("chunk_{}_{}.json", start, end);
+    let filepath = events_dir.join(&filename);
+
+    let json = serde_json::to_string_pretty(events)?;
+    fs::write(&filepath, json).await?;
+
+    println!("💾 Saved chunk: {} ({} events)", filename, events.len());
+    Ok(())
+}
+
-fn calculate_actual_date_range(events: &[EconomicEvent]) -> (String, String) {
-    if events.is_empty() {
-        return ("No data".to_string(), "No data".to_string());
-    }
-
-    let mut dates: Vec<NaiveDate> = events
-        .iter()
-        .filter_map(|e| {
-            // Parse German date format "Dienstag, 2. Januar 2024"
-            extract_date_from_german_format(&e.date)
-        })
-        .collect();
-
-    dates.sort();
-
-    let earliest = dates.first().map(|d| d.format("%Y-%m-%d").to_string())
-        .unwrap_or_else(|| "Unknown".to_string());
-    let latest = dates.last().map(|d| d.format("%Y-%m-%d").to_string())
-        .unwrap_or_else(|| "Unknown".to_string());
-
-    (earliest, latest)
-}
-
-fn extract_date_from_german_format(german_date: &str) -> Option<NaiveDate> {
-    // Map German month names to English
-    let month_map = [
-        ("Januar", "January"),
-        ("Februar", "February"),
-        ("März", "March"),
-        ("April", "April"),
-        ("Mai", "May"),
-        ("Juni", "June"),
-        ("Juli", "July"),
-        ("August", "August"),
-        ("September", "September"),
-        ("Oktober", "October"),
-        ("November", "November"),
-        ("Dezember", "December"),
-    ];
-
-    let mut english_date = german_date.to_string();
-    for (de, en) in &month_map {
-        english_date = english_date.replace(de, en);
-    }
-
-    // Parse "Tuesday, 2. January 2024" format
-    NaiveDate::parse_from_str(&english_date, "%A, %d. %B %Y").ok()
-}
-
+/// Load all events from existing chunks
+async fn load_all_events(chunks: &[ChunkInfo]) -> anyhow::Result<Vec<EconomicEvent>> {
+    let mut all_events = Vec::new();
+
+    for chunk in chunks {
+        if let Ok(content) = fs::read_to_string(&chunk.path).await {
+            if let Ok(events) = serde_json::from_str::<Vec<EconomicEvent>>(&content) {
+                all_events.extend(events);
+            }
+        }
+    }
+
+    println!("📥 Loaded {} events from existing chunks", all_events.len());
+    Ok(all_events)
+}
+
-async fn scrape_all_events_with_chunking(
+/// Scrape events for a specific date range and save chunks immediately
+async fn scrape_date_range(
     client: &fantoccini::Client,
-    start_date: &str,
-    end_date: &str
-) -> anyhow::Result<Vec<EconomicEvent>> {
-    let json_export_now = chrono::Local::now().format("%Y%m%d_%H%M%S");
-
-    let mut all_events: Vec<EconomicEvent> = Vec::new();
-    let mut current_start = start_date.to_string();
-    let mut attempts = 0;
-    let max_attempts = 300;
+    start: &str,
+    end: &str,
+) -> anyhow::Result<()> {
+    println!("\n🎯 Scraping range: {} to {}", start, end);
+
+    let mut current_start = start.to_string();
+    let mut chunk_number = 0;
 
     loop {
-        attempts += 1;
-        if attempts > max_attempts {
-            println!("⚠️ Reached maximum attempts ({})", max_attempts);
-            break;
-        }
-
-        println!("🚀 Chunk {}: {} to {}", attempts, current_start, end_date);
-
-        set_date_range(client, &current_start, end_date).await?;
+        set_date_range(client, &current_start, end).await?;
         sleep(Duration::from_secs(3)).await;
 
-        let chunk = extract_all_data_via_js(client).await?;
-        if chunk.is_empty() {
-            println!("✅ No more events found. Completed!");
+        let events = extract_all_data_via_js(client).await?;
+        if events.is_empty() {
+            println!("   ✅ No more events in this range");
             break;
         }
 
-        println!("📊 Chunk {}: {} events (Total: {})",
-            attempts, chunk.len(), all_events.len() + chunk.len());
-
-        all_events.extend(chunk.clone());
+        chunk_number += 1;
+        println!("   📦 Fetched {} events", events.len());
+
+        // Calculate actual date range of this chunk
+        let chunk_start = events.iter()
+            .filter_map(|e| parse_date(&e.date))
+            .min()
+            .map(|d| d.format("%Y-%m-%d").to_string())
+            .unwrap_or_else(|| current_start.clone());
+
+        let chunk_end = events.iter()
+            .filter_map(|e| parse_date(&e.date))
+            .max()
+            .map(|d| d.format("%Y-%m-%d").to_string())
+            .unwrap_or_else(|| end.to_string());
+
+        // Save chunk immediately
+        save_chunk(&events, &chunk_start, &chunk_end).await?;
 
-        let next = match calculate_next_start_date(&chunk) {
+        let next = match calculate_next_start_date(&events) {
             Ok(n) => n,
             Err(_) => {
-                println!("❌ Could not calculate next start date. Stopping.");
+                println!("   ⚠️ Cannot calculate next date, stopping");
                 break;
             }
         };
 
-        if next > end_date.to_string() {
-            println!("✅ Reached end date. Completed!");
+        if next > end.to_string() {
+            println!("   ✅ Reached end of range");
            break;
        }
 
        current_start = next;
-
        sleep(Duration::from_secs(2)).await;
-
-        export_chunk(&chunk, attempts, json_export_now.to_string().clone()).await?;
    }
 
-    // Remove duplicates
-    let initial_count = all_events.len();
-    all_events.sort_by(|a, b| {
-        a.date.cmp(&b.date)
-            .then(a.time.cmp(&b.time))
-            .then(a.event.cmp(&b.event))
-    });
-    all_events.dedup_by(|a, b| {
-        a.date == b.date && a.time == b.time && a.event == b.event
-    });
-
-    println!("🎯 FINAL: Collected {} unique events (removed {} duplicates)",
-        all_events.len(), initial_count - all_events.len());
-
-    Ok(all_events)
+    Ok(())
 }
 
-async fn export_chunk(chunk: &[EconomicEvent], n: usize, ts: String) -> anyhow::Result<()> {
-    let filename = format!("economic_events_{}_chunk_{}.json", ts, n);
-    let json = serde_json::to_string_pretty(chunk)?;
-    tokio::fs::write(&filename, json).await?;
-    println!("   Chunk data exported to: {}", filename);
-    Ok(())
-}
-
+/// Main scraping logic with persistent storage
+async fn scrape_with_persistence(
+    client: &fantoccini::Client,
+) -> anyhow::Result<()> {
+    // Calculate target end date (3 months ahead, 1st of month)
+    let target_end = calculate_target_end_date();
+    println!("🎯 Target end date: {}", target_end);
+
+    // Scan for existing chunks
+    let existing_chunks = scan_existing_chunks().await?;
+
+    // Determine what range needs to be scraped
+    let scrape_range = determine_scrape_range(&existing_chunks, &target_end);
+
+    if let Some((start, end)) = scrape_range {
+        // Scrape the needed range (saves chunks automatically)
+        scrape_date_range(client, &start, &end).await?;
+        println!("\n✅ Scraping complete!");
+    } else {
+        println!("✅ All data is up to date!");
+    }
+
+    Ok(())
+}
@@ -515,20 +477,20 @@ async fn main() -> anyhow::Result<()> {
     let mut chromedriver = start_chromedriver(port);
     sleep(Duration::from_secs(1)).await;
 
-    // Chrome options
     let caps_value = serde_json::json!({
         "goog:chromeOptions": {
             "args": [
                 "--disable-gpu",
                 "--disable-notifications",
                 "--disable-popup-blocking",
-                "--disable-blink-features=AutomationControlled"
+                "--disable-blink-features=AutomationControlled",
             ],
             "excludeSwitches": ["enable-automation"]
         }
     });
 
-    let caps_map: Map<String, Value> = caps_value.as_object()
+    let caps_map: Map<String, Value> = caps_value
+        .as_object()
         .expect("Capabilities should be a JSON object")
         .clone();
 
@@ -537,48 +499,60 @@ async fn main() -> anyhow::Result<()> {
         .connect(&format!("http://localhost:{}", port))
         .await?;
 
-    // Setup graceful shutdown on Ctrl+C
+    // Setup graceful shutdown
     let shutdown_client = client.clone();
-    let shutdown_handle = tokio::spawn(async move {
-        signal::ctrl_c().await.expect("Failed to listen for ctrl+c");
+    tokio::spawn(async move {
+        signal::ctrl_c()
+            .await
+            .expect("Failed to listen for ctrl+c");
         println!("\nCtrl+C received, shutting down...");
         shutdown_client.close().await.ok();
+        chromedriver.kill().ok();
+        std::process::exit(0);
     });
 
-    // Go to page
+    // Navigate to page
     let url = "https://www.finanzen.net/termine/wirtschaftsdaten/";
     client.goto(url).await?;
 
     // Dismiss overlays
     dismiss_overlays(&client).await?;
 
-    // Click the high importance tab
-    if let Ok(tab) = client.find(Locator::Css(r#"div[data-sg-tab-item="teletrader-dates-three-stars"]"#)).await {
+    // Click high importance tab
+    if let Ok(tab) = client
+        .find(Locator::Css(
+            r#"div[data-sg-tab-item="teletrader-dates-three-stars"]"#,
+        ))
+        .await
+    {
         tab.click().await?;
-        println!("High importance tab clicked");
+        println!("✓ High importance tab clicked");
         sleep(Duration::from_secs(2)).await;
     } else {
         println!("High importance tab not found");
     }
 
-    // Use chunking to extract all events across the entire date range
-    let events = scrape_all_events_with_chunking(&client, "2007-02-13", "2025-12-01").await?;
+    // Run persistent scraping
+    scrape_with_persistence(&client).await?;
 
-    // Run validation suite
-    validate_events(&events).await?;
-    check_data_consistency(&events).await;
-
-    // Export for further analysis
-    if let Ok(json) = serde_json::to_string_pretty(&events) {
-        let filename = format!("economic_events_{}_combined.json", chrono::Local::now().format("%Y%m%d_%H%M%S"));
-        tokio::fs::write(&filename, json).await?;
-        println!("   • Combined data exported to: {}", filename);
-    }
+    // Load and display summary
+    let chunks = scan_existing_chunks().await?;
+    let all_events = load_all_events(&chunks).await?;
+
+    println!("\n📊 FINAL SUMMARY:");
+    println!("   • Total chunks: {}", chunks.len());
+    println!("   • Total events: {}", all_events.len());
+
+    if !chunks.is_empty() {
+        let dates: Vec<String> = all_events.iter()
+            .filter_map(|e| parse_date(&e.date))
+            .map(|d| d.format("%Y-%m-%d").to_string())
+            .collect();
+        if !dates.is_empty() {
+            let min = dates.iter().min().unwrap();
+            let max = dates.iter().max().unwrap();
+            println!("   • Date range: {} to {}", min, max);
+        }
+    }
 
-    // Wait for Ctrl+C
-    shutdown_handle.await.ok();
-    client.close().await?;
-    chromedriver.kill()?;
-
     Ok(())
 }
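Note for reviewers: the month rollover in calculate_target_end_date is the subtle part of this change. A standalone sketch of the same arithmetic follows; the helper name target_end and the example dates are illustrative only, not part of the commit.

use chrono::{Datelike, NaiveDate};

/// First day of the month three months ahead, mirroring
/// calculate_target_end_date above (illustrative helper name).
fn target_end(from: NaiveDate) -> NaiveDate {
    let (y, m) = (from.year(), from.month() + 3);
    if m > 12 {
        // Wrap into the next year, e.g. November + 3 -> February.
        NaiveDate::from_ymd_opt(y + 1, m % 12, 1).unwrap()
    } else {
        NaiveDate::from_ymd_opt(y, m, 1).unwrap()
    }
}

fn main() {
    // 2024-11-05 + 3 months -> 2025-02-01 (wraps the year)
    let d = NaiveDate::from_ymd_opt(2024, 11, 5).unwrap();
    assert_eq!(target_end(d), NaiveDate::from_ymd_opt(2025, 2, 1).unwrap());
    // September is the edge that stays in-year: 9 + 3 = 12, not > 12.
    let s = NaiveDate::from_ymd_opt(2024, 9, 1).unwrap();
    assert_eq!(target_end(s), NaiveDate::from_ymd_opt(2024, 12, 1).unwrap());
    println!("{}", target_end(d)); // 2025-02-01
}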