persistent scraping
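Scraped chunks are now written to an economic_events/ directory as soon as they are fetched, and on startup the scraper scans that directory and resumes after the newest saved chunk instead of re-scraping everything from 2007-02-13. For orientation, a sketch of the resulting on-disk layout (the directory name and the chunk_{start}_{end}.json pattern come from save_chunk and scan_existing_chunks in the diff below; the concrete dates are hypothetical):

economic_events/
    chunk_2007-02-13_2007-06-30.json
    chunk_2007-07-01_2007-11-15.json
    ...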
src/main.rs (646 changed lines)

--- a/src/main.rs
+++ b/src/main.rs
@@ -1,11 +1,15 @@
-use chrono::{NaiveDate};
+use chrono::{NaiveDate, Datelike};
 use fantoccini::{ClientBuilder, Locator};
-use serde::Serialize;
+use serde::{Deserialize, Serialize};
 use serde_json::{Map, Value};
-use std::{process::Command};
-use tokio::{time::{Duration, sleep}, signal};
+use std::{path::PathBuf, process::Command};
+use tokio::{
+    fs,
+    signal,
+    time::{sleep, Duration},
+};
 
-#[derive(Debug, Serialize, Clone)]
+#[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq, Hash)]
 struct EconomicEvent {
     country: String,
     date: String,
@@ -18,6 +22,14 @@ struct EconomicEvent {
     description: String,
 }
 
+#[derive(Debug)]
+struct ChunkInfo {
+    start_date: String,
+    end_date: String,
+    path: PathBuf,
+    event_count: usize,
+}
+
 fn start_chromedriver(port: u16) -> std::process::Child {
     Command::new("chromedriver-win64/chromedriver.exe")
         .args(&[format!("--port={}", port)])
@@ -26,10 +38,10 @@ fn start_chromedriver(port: u16) -> std::process::Child {
 }
 
 async fn dismiss_overlays(client: &fantoccini::Client) -> anyhow::Result<()> {
-    // Single strategy: wait for and remove iframe
     for _ in 0..10 {
-        let removed: bool = client.execute(
-            r#"(() => {
+        let removed: bool = client
+            .execute(
+                r#"(() => {
                 const iframe = document.querySelector('iframe[title="Contentpass First Layer"]');
                 if (iframe && iframe.parentNode) {
                     iframe.parentNode.removeChild(iframe);
@@ -37,18 +49,23 @@ async fn dismiss_overlays(client: &fantoccini::Client) -> anyhow::Result<()> {
                 }
                 return false;
             })()"#,
-            vec![]
-        ).await?.as_bool().unwrap_or(false);
+                vec![],
+            )
+            .await?
+            .as_bool()
+            .unwrap_or(false);
 
-        if removed { break; }
+        if removed {
+            break;
+        }
         sleep(Duration::from_millis(500)).await;
     }
     Ok(())
 }
 
-async fn extract_all_data_via_js(client: &fantoccini::Client) -> anyhow::Result<Vec<EconomicEvent>> {
-    println!("Extracting ONLY 3-star events via JavaScript...");
+async fn extract_all_data_via_js(
+    client: &fantoccini::Client,
+) -> anyhow::Result<Vec<EconomicEvent>> {
     let extraction_script = r#"
         const events = [];
         let currentDate = '';
@@ -60,29 +77,23 @@ async fn extract_all_data_via_js(client: &fantoccini::Client) -> anyhow::Result<
             const cells = row.querySelectorAll('td');
 
             if (cells.length === 1 && cells[0].colSpan === 9) {
-                // This is a date header row - extract and parse the date
                 const dateText = cells[0].textContent.trim();
-                console.log('Found date header:', dateText);
-
-                // Convert German date to ISO format (YYYY-MM-DD)
                 const monthMap = {
                     'Januar': '01', 'Februar': '02', 'März': '03', 'April': '04',
                     'Mai': '05', 'Juni': '06', 'Juli': '07', 'August': '08',
                     'September': '09', 'Oktober': '10', 'November': '11', 'Dezember': '12'
                 };
 
-                // Extract date parts from German format "Montag, 30. April 2007"
-                const dateParts = dateText.match(/(\d{1,2})\.\s+([a-zA-Zäöüß]+)\s+(\d{4})/);
+                const dateParts = dateText.match(/(\d{1,2})\.\s+([a-zA-ZäöüßÄÖÜ]+)\s+(\d{4})/);
                 if (dateParts) {
                     const day = dateParts[1].padStart(2, '0');
                     const germanMonth = dateParts[2];
                     const year = dateParts[3];
                     const month = monthMap[germanMonth] || '01';
                     currentDate = `${year}-${month}-${day}`;
-                    console.log('Converted date:', currentDate, 'from:', dateText);
                 } else {
-                    console.log('Failed to parse date:', dateText);
-                    currentDate = ''; // Reset if parsing fails
+                    currentDate = '';
                 }
                 continue;
             }
@@ -94,11 +105,9 @@ async fn extract_all_data_via_js(
 
             if (!time || !country || !eventName) continue;
 
-            // Count ONLY YELLOW stars (high importance)
             const importanceCell = cells[3];
             const yellowStarCount = importanceCell?.querySelectorAll('.icon--star.font-color-yellow').length || 0;
 
-            // STRICT FILTER: Only include events with EXACTLY 3 YELLOW stars
            if (yellowStarCount === 3) {
                let description = '';
                if (i + 1 < rows.length) {
@@ -114,7 +123,7 @@ async fn extract_all_data_via_js(
 
                 events.push({
                     country: country,
-                    date: currentDate, // Now using ISO format date
+                    date: currentDate,
                     time: time,
                     event: eventName,
                     actual: cells[7]?.textContent?.trim() || '',
@@ -127,64 +136,79 @@ async fn extract_all_data_via_js(
                 }
             }
         }
 
-        console.log('Total events extracted:', events.length);
-        if (events.length > 0) {
-            console.log('First event date:', events[0].date);
-            console.log('Last event date:', events[events.length - 1].date);
-        }
-
         return events;
     "#;
 
     let result = client.execute(extraction_script, vec![]).await?;
 
-    // Parse the JSON result into EconomicEvent structs
     if let Some(events_array) = result.as_array() {
         let mut events = Vec::new();
         for event_value in events_array {
             if let Some(event_obj) = event_value.as_object() {
                 let event = EconomicEvent {
-                    country: event_obj.get("country").and_then(|v| v.as_str()).unwrap_or("").to_string(),
-                    date: event_obj.get("date").and_then(|v| v.as_str()).unwrap_or("").to_string(),
-                    time: event_obj.get("time").and_then(|v| v.as_str()).unwrap_or("").to_string(),
-                    event: event_obj.get("event").and_then(|v| v.as_str()).unwrap_or("").to_string(),
-                    actual: event_obj.get("actual").and_then(|v| v.as_str()).unwrap_or("").to_string(),
-                    forecast: event_obj.get("forecast").and_then(|v| v.as_str()).unwrap_or("").to_string(),
-                    previous: event_obj.get("previous").and_then(|v| v.as_str()).unwrap_or("").to_string(),
-                    importance: event_obj.get("importance").and_then(|v| v.as_str()).unwrap_or("").to_string(),
-                    description: event_obj.get("description").and_then(|v| v.as_str()).unwrap_or("").to_string(),
+                    country: event_obj
+                        .get("country")
+                        .and_then(|v| v.as_str())
+                        .unwrap_or("")
+                        .to_string(),
+                    date: event_obj
+                        .get("date")
+                        .and_then(|v| v.as_str())
+                        .unwrap_or("")
+                        .to_string(),
+                    time: event_obj
+                        .get("time")
+                        .and_then(|v| v.as_str())
+                        .unwrap_or("")
+                        .to_string(),
+                    event: event_obj
+                        .get("event")
+                        .and_then(|v| v.as_str())
+                        .unwrap_or("")
+                        .to_string(),
+                    actual: event_obj
+                        .get("actual")
+                        .and_then(|v| v.as_str())
+                        .unwrap_or("")
+                        .to_string(),
+                    forecast: event_obj
+                        .get("forecast")
+                        .and_then(|v| v.as_str())
+                        .unwrap_or("")
+                        .to_string(),
+                    previous: event_obj
+                        .get("previous")
+                        .and_then(|v| v.as_str())
+                        .unwrap_or("")
+                        .to_string(),
+                    importance: event_obj
+                        .get("importance")
+                        .and_then(|v| v.as_str())
+                        .unwrap_or("")
+                        .to_string(),
+                    description: event_obj
+                        .get("description")
+                        .and_then(|v| v.as_str())
+                        .unwrap_or("")
+                        .to_string(),
                 };
                 events.push(event);
             }
         }
-        println!("Extracted {} events (3 YELLOW stars ONLY) via JavaScript", events.len());
+        println!("Extracted {} events (3 YELLOW stars ONLY)", events.len());
 
-        // Debug: show date range of extracted events
-        if !events.is_empty() {
-            let dates: Vec<&str> = events.iter().map(|e| e.date.as_str()).filter(|d| !d.is_empty()).collect();
-            if !dates.is_empty() {
-                let min_date = dates.iter().min().unwrap_or(&"N/A");
-                let max_date = dates.iter().max().unwrap_or(&"N/A");
-                println!("📅 Extracted date range: {} to {}", min_date, max_date);
-
-                // Show sample of dates for debugging
-                println!("Sample dates:");
-                for (i, date) in dates.iter().take(5).enumerate() {
-                    println!("  {}. {}", i + 1, date);
-                }
-            } else {
-                println!("❌ No valid dates found in extracted events");
-            }
-        }
-
         return Ok(events);
     }
 
     Ok(vec![])
 }
 
-async fn set_date_range(client: &fantoccini::Client, start: &str, end: &str) -> anyhow::Result<()> {
-    let set_dates_script = format!(r#"
+async fn set_date_range(
+    client: &fantoccini::Client,
+    start: &str,
+    end: &str,
+) -> anyhow::Result<()> {
+    let set_dates_script = format!(
+        r#"
         (() => {{
             const fromInput = document.querySelector('#dtTeletraderFromDate');
             const toInput = document.querySelector('#dtTeletraderEndDate');
@@ -203,62 +227,22 @@ async fn set_date_range(
 
             return !!fromInput && !!toInput;
         }})()
-    "#, start, end);
+    "#,
+        start, end
+    );
 
     client.execute(&set_dates_script, vec![]).await?;
-    sleep(Duration::from_millis(1000)).await; // Wait for table to update
-
-    // Now read the values
-    let from_date_value: String = client.execute(
-        r#"return document.querySelector('#dtTeletraderFromDate')?.value;"#,
-        vec![],
-    ).await?.as_str().unwrap_or_default().to_string();
-
-    let to_date_value: String = client.execute(
-        r#"return document.querySelector('#dtTeletraderEndDate')?.value;"#,
-        vec![],
-    ).await?.as_str().unwrap_or_default().to_string();
-
-    if from_date_value == start && to_date_value == end {
-        println!("  Dates set correctly");
-    } else {
-        println!("  ❌ Date not set correctly. Expected: {}-{}, Got: {}-{}",
-            start, end, from_date_value, to_date_value);
-    }
+    sleep(Duration::from_millis(1000)).await;
 
     Ok(())
 }
 
-fn parse_any_date(date: &str) -> Option<NaiveDate> {
-    // Attempt ISO first
-    if let Ok(d) = NaiveDate::parse_from_str(date.trim(), "%Y-%m-%d") {
-        return Some(d);
-    }
-
-    // Convert German → English once
-    let month_map = [
-        ("Januar", "January"), ("Februar", "February"), ("März", "March"),
-        ("April", "April"), ("Mai", "May"), ("Juni", "June"),
-        ("Juli", "July"), ("August", "August"), ("September", "September"),
-        ("Oktober", "October"), ("November", "November"), ("Dezember", "December"),
-    ];
-
-    let mut english = date.to_string();
-    for (de, en) in month_map {
-        english = english.replace(de, en);
-    }
-
-    // Try two formats max
-    NaiveDate::parse_from_str(&english, "%A, %d. %B %Y")
-        .or_else(|_| NaiveDate::parse_from_str(&english, "%d. %B %Y"))
-        .ok()
+fn parse_date(date: &str) -> Option<NaiveDate> {
+    NaiveDate::parse_from_str(date.trim(), "%Y-%m-%d").ok()
 }
 
 fn calculate_next_start_date(events: &[EconomicEvent]) -> anyhow::Result<String> {
-    let mut dates: Vec<_> = events
-        .iter()
-        .filter_map(|e| parse_any_date(&e.date))
-        .collect();
+    let mut dates: Vec<_> = events.iter().filter_map(|e| parse_date(&e.date)).collect();
 
     if dates.is_empty() {
         return Err(anyhow::anyhow!("No parseable dates found"));
@@ -270,242 +254,220 @@ fn calculate_next_start_date(events: &[EconomicEvent]) -> anyhow::Result<String>
     Ok(next.format("%Y-%m-%d").to_string())
 }
 
-fn extract_month(date: &str) -> Option<String> {
-    parse_any_date(date).map(|d| d.format("%B").to_string())
-}
-
-fn count_valid_times(events: &[EconomicEvent]) -> usize {
-    events.iter().filter(|e| {
-        e.time.len() == 5 &&
-        e.time.as_bytes()[2] == b':' &&
-        e.time[..2].chars().all(|c| c.is_ascii_digit()) &&
-        e.time[3..].chars().all(|c| c.is_ascii_digit())
-    }).count()
-}
-
-fn missing_critical(e: &EconomicEvent) -> bool {
-    e.event.trim().is_empty() || e.time.trim().is_empty()
-}
-
-fn is_complete(e: &EconomicEvent) -> bool {
-    !(e.event.trim().is_empty() ||
-      e.time.trim().is_empty() ||
-      e.country.trim().is_empty()) &&
-    (!e.actual.trim().is_empty() ||
-     !e.forecast.trim().is_empty() ||
-     !e.previous.trim().is_empty())
-}
-
-async fn check_data_consistency(events: &[EconomicEvent]) {
-    println!("\n=== DATA CONSISTENCY CHECKS ===");
-
-    println!("⏰ Valid time formats: {}/{}", count_valid_times(events), events.len());
-
-    let missing: Vec<_> = events.iter().enumerate()
-        .filter(|(_, e)| missing_critical(e))
-        .collect();
-
-    if !missing.is_empty() {
-        println!("❌ {} events missing critical fields", missing.len());
-    }
-}
-
-async fn validate_events(events: &[EconomicEvent]) -> anyhow::Result<()> {
-    println!("\n=== EVENT VALIDATION ===");
-
-    if events.is_empty() {
-        println!("❌ ERROR: No events extracted!");
-        return Ok(());
+/// Scan the economic_events directory for existing chunks
+async fn scan_existing_chunks() -> anyhow::Result<Vec<ChunkInfo>> {
+    let events_dir = PathBuf::from("economic_events");
+
+    // Create directory if it doesn't exist
+    if !events_dir.exists() {
+        fs::create_dir_all(&events_dir).await?;
+        println!("📁 Created economic_events directory");
+        return Ok(vec![]);
     }
 
-    println!("📊 Total events: {}", events.len());
-
-    // 1. Description coverage
-    let desc_count = events.iter()
-        .filter(|e| !e.description.trim().is_empty())
-        .count();
-
-    println!("📝 Events with descriptions: {}/{}", desc_count, events.len());
-
-    // 2. Distributions
-    use std::collections::HashMap;
-    let mut country_dist: HashMap<String, usize> = HashMap::new();
-    let mut month_dist: HashMap<String, usize> = HashMap::new();
-
-    for e in events {
-        *country_dist.entry(e.country.clone()).or_insert(0) += 1;
-
-        if let Some(month) = extract_month(&e.date) {
-            *month_dist.entry(month).or_insert(0) += 1;
+    let mut chunks = Vec::new();
+    let mut entries = fs::read_dir(&events_dir).await?;
+
+    while let Some(entry) = entries.next_entry().await? {
+        let path = entry.path();
+        if path.extension().and_then(|s| s.to_str()) == Some("json") {
+            if let Some(filename) = path.file_stem().and_then(|s| s.to_str()) {
+                // Parse filename: chunk_{startdate}_{enddate}.json
+                if let Some(dates) = filename.strip_prefix("chunk_") {
+                    let parts: Vec<&str> = dates.split('_').collect();
+                    if parts.len() == 2 {
+                        // Load and count events
+                        if let Ok(content) = fs::read_to_string(&path).await {
+                            if let Ok(events) = serde_json::from_str::<Vec<EconomicEvent>>(&content) {
+                                chunks.push(ChunkInfo {
+                                    start_date: parts[0].to_string(),
+                                    end_date: parts[1].to_string(),
+                                    path: path.clone(),
+                                    event_count: events.len(),
+                                });
+                            }
+                        }
+                    }
+                }
+            }
         }
     }
 
-    println!("🌍 Country distribution: {:?}", country_dist);
-    println!("📈 Month distribution: {:?}", month_dist);
-
-    // 3. Sample events (first 5)
-    println!("\n🔍 Sample events (first 5):");
-    for event in events.iter().take(5) {
-        println!(
-            "  • {} {}: {} - {} (Importance: {})",
-            event.date, event.time, event.country, event.event, event.importance
-        );
+    chunks.sort_by(|a, b| a.start_date.cmp(&b.start_date));
+
+    if !chunks.is_empty() {
+        println!("\n📊 Found {} existing chunks:", chunks.len());
+        for chunk in &chunks {
+            println!("  • {} to {} ({} events)",
+                chunk.start_date, chunk.end_date, chunk.event_count);
+        }
+    } else {
+        println!("📭 No existing chunks found");
     }
 
-    // 4. Completeness check
-    let complete_count = events.iter().filter(|e| is_complete(e)).count();
-    println!(
-        "✅ Complete events: {}/{}",
-        complete_count,
-        events.len()
-    );
-
-    // 5. Date range
-    let (earliest, latest) = calculate_actual_date_range(events);
-    println!("📅 Actual date range: {} to {}", earliest, latest);
-
-    // Final summary
-    println!("\n=== VALIDATION SUMMARY ===");
-    println!("  • Total events: {}", events.len());
-    println!(
-        "  • Events with descriptions [%]: {}",
-        (desc_count * 100) / events.len().max(1)
-    );
-    println!(
-        "  • Complete events [%]: {}",
-        (complete_count * 100) / events.len().max(1)
-    );
-    println!("  • Date range: {} to {}", earliest, latest);
-    Ok(())
-}
-
-fn calculate_actual_date_range(events: &[EconomicEvent]) -> (String, String) {
-    if events.is_empty() {
-        return ("No data".to_string(), "No data".to_string());
-    }
-
-    let mut dates: Vec<NaiveDate> = events
-        .iter()
-        .filter_map(|e| {
-            // Parse German date format "Dienstag, 2. Januar 2024"
-            extract_date_from_german_format(&e.date)
-        })
-        .collect();
-
-    dates.sort();
-
-    let earliest = dates.first().map(|d| d.format("%Y-%m-%d").to_string())
-        .unwrap_or_else(|| "Unknown".to_string());
-    let latest = dates.last().map(|d| d.format("%Y-%m-%d").to_string())
-        .unwrap_or_else(|| "Unknown".to_string());
-
-    (earliest, latest)
-}
-
-fn extract_date_from_german_format(german_date: &str) -> Option<NaiveDate> {
-    // Map German month names to English
-    let month_map = [
-        ("Januar", "January"),
-        ("Februar", "February"),
-        ("März", "March"),
-        ("April", "April"),
-        ("Mai", "May"),
-        ("Juni", "June"),
-        ("Juli", "July"),
-        ("August", "August"),
-        ("September", "September"),
-        ("Oktober", "October"),
-        ("November", "November"),
-        ("Dezember", "December"),
-    ];
-
-    let mut english_date = german_date.to_string();
-    for (de, en) in &month_map {
-        english_date = english_date.replace(de, en);
-    }
-
-    // Parse "Tuesday, 2. January 2024" format
-    NaiveDate::parse_from_str(&english_date, "%A, %d. %B %Y").ok()
-}
-
-async fn scrape_all_events_with_chunking(
-    client: &fantoccini::Client,
-    start_date: &str,
-    end_date: &str
-) -> anyhow::Result<Vec<EconomicEvent>> {
-    let json_export_now = chrono::Local::now().format("%Y%m%d_%H%M%S");
-
-    let mut all_events: Vec<EconomicEvent> = Vec::new();
-    let mut current_start = start_date.to_string();
-    let mut attempts = 0;
-    let max_attempts = 300;
+    Ok(chunks)
+}
+
+/// Calculate target end date: first day of month, 3 months from now
+fn calculate_target_end_date() -> String {
+    let now = chrono::Local::now().naive_local().date();
+    let three_months_ahead = if now.month() + 3 > 12 {
+        NaiveDate::from_ymd_opt(now.year() + 1, (now.month() + 3) % 12, 1)
+    } else {
+        NaiveDate::from_ymd_opt(now.year(), now.month() + 3, 1)
+    }.unwrap();
+
+    three_months_ahead.format("%Y-%m-%d").to_string()
+}
+
+/// Determine what date range needs to be scraped based on existing data
+fn determine_scrape_range(chunks: &[ChunkInfo], target_end: &str) -> Option<(String, String)> {
+    let now = chrono::Local::now().naive_local().date().format("%Y-%m-%d").to_string();
+
+    if chunks.is_empty() {
+        // No data exists, start from beginning
+        println!("📭 No existing data - scraping from 2007-02-13 to {}", target_end);
+        return Some(("2007-02-13".to_string(), target_end.to_string()));
+    }
+
+    // Find the latest date in existing chunks
+    let latest_chunk_date = chunks.iter()
+        .map(|c| &c.end_date)
+        .max()
+        .cloned()
+        .unwrap_or_else(|| "2007-02-13".to_string());
+
+    println!("📊 Latest existing data: {}", latest_chunk_date);
+
+    if latest_chunk_date >= now {
+        // Data is ahead of current date - update from now to target
+        println!("🔄 Data exists beyond today - updating from {} to {}", now, target_end);
+        Some((now, target_end.to_string()))
+    } else {
+        // Data is behind - continue from where it left off
+        let next_start = parse_date(&latest_chunk_date)
+            .and_then(|d| d.succ_opt())
+            .map(|d| d.format("%Y-%m-%d").to_string())
+            .unwrap_or_else(|| latest_chunk_date.clone());
+
+        println!("➡️ Continuing from {} to {}", next_start, target_end);
+        Some((next_start, target_end.to_string()))
+    }
+}
+
+/// Save a chunk to disk
+async fn save_chunk(events: &[EconomicEvent], start: &str, end: &str) -> anyhow::Result<()> {
+    let events_dir = PathBuf::from("economic_events");
+    fs::create_dir_all(&events_dir).await?;
+
+    let filename = format!("chunk_{}_{}.json", start, end);
+    let filepath = events_dir.join(&filename);
+
+    let json = serde_json::to_string_pretty(events)?;
+    fs::write(&filepath, json).await?;
+
+    println!("💾 Saved chunk: {} ({} events)", filename, events.len());
+    Ok(())
+}
+
+/// Load all events from existing chunks
+async fn load_all_events(chunks: &[ChunkInfo]) -> anyhow::Result<Vec<EconomicEvent>> {
+    let mut all_events = Vec::new();
+
+    for chunk in chunks {
+        if let Ok(content) = fs::read_to_string(&chunk.path).await {
+            if let Ok(events) = serde_json::from_str::<Vec<EconomicEvent>>(&content) {
+                all_events.extend(events);
+            }
+        }
+    }
+
+    println!("📥 Loaded {} events from existing chunks", all_events.len());
+    Ok(all_events)
+}
+
+/// Scrape events for a specific date range and save chunks immediately
+async fn scrape_date_range(
+    client: &fantoccini::Client,
+    start: &str,
+    end: &str,
+) -> anyhow::Result<()> {
+    println!("\n🎯 Scraping range: {} to {}", start, end);
+
+    let mut current_start = start.to_string();
+    let mut chunk_number = 0;
 
     loop {
-        attempts += 1;
-        if attempts > max_attempts {
-            println!("⚠️ Reached maximum attempts ({})", max_attempts);
-            break;
-        }
-
-        println!("🚀 Chunk {}: {} to {}", attempts, current_start, end_date);
-
-        set_date_range(client, &current_start, end_date).await?;
+        set_date_range(client, &current_start, end).await?;
         sleep(Duration::from_secs(3)).await;
 
-        let chunk = extract_all_data_via_js(client).await?;
-        if chunk.is_empty() {
-            println!("✅ No more events found. Completed!");
+        let events = extract_all_data_via_js(client).await?;
+        if events.is_empty() {
+            println!("  ✅ No more events in this range");
             break;
         }
 
-        println!("📊 Chunk {}: {} events (Total: {})",
-            attempts, chunk.len(), all_events.len() + chunk.len());
-
-        all_events.extend(chunk.clone());
+        chunk_number += 1;
+        println!("  📦 Fetched {} events", events.len());
+
+        // Calculate actual date range of this chunk
+        let chunk_start = events.iter()
+            .filter_map(|e| parse_date(&e.date))
+            .min()
+            .map(|d| d.format("%Y-%m-%d").to_string())
+            .unwrap_or_else(|| current_start.clone());
+
+        let chunk_end = events.iter()
+            .filter_map(|e| parse_date(&e.date))
+            .max()
+            .map(|d| d.format("%Y-%m-%d").to_string())
+            .unwrap_or_else(|| end.to_string());
+
+        // Save chunk immediately
+        save_chunk(&events, &chunk_start, &chunk_end).await?;
 
-        let next = match calculate_next_start_date(&chunk) {
+        let next = match calculate_next_start_date(&events) {
             Ok(n) => n,
             Err(_) => {
-                println!("❌ Could not calculate next start date. Stopping.");
+                println!("  ⚠️ Cannot calculate next date, stopping");
                 break;
             }
         };
 
-        if next > end_date.to_string() {
-            println!("✅ Reached end date. Completed!");
+        if next > end.to_string() {
+            println!("  ✅ Reached end of range");
            break;
        }
 
         current_start = next;
 
         sleep(Duration::from_secs(2)).await;
-
-        export_chunk(&chunk, attempts, json_export_now.to_string().clone()).await?;
     }
 
-    // Remove duplicates
-    let initial_count = all_events.len();
-    all_events.sort_by(|a, b| {
-        a.date.cmp(&b.date)
-            .then(a.time.cmp(&b.time))
-            .then(a.event.cmp(&b.event))
-    });
-    all_events.dedup_by(|a, b| {
-        a.date == b.date && a.time == b.time && a.event == b.event
-    });
-
-    println!("🎯 FINAL: Collected {} unique events (removed {} duplicates)",
-        all_events.len(), initial_count - all_events.len());
-
-    Ok(all_events)
-}
-
-async fn export_chunk(chunk: &[EconomicEvent], n: usize, ts: String) -> anyhow::Result<()> {
-    let filename = format!("economic_events_{}_chunk_{}.json", ts, n);
-    let json = serde_json::to_string_pretty(chunk)?;
-    tokio::fs::write(&filename, json).await?;
-    println!("  Chunk data exported to: {}", filename);
+    Ok(())
+}
+
+/// Main scraping logic with persistent storage
+async fn scrape_with_persistence(
+    client: &fantoccini::Client,
+) -> anyhow::Result<()> {
+    // Calculate target end date (3 months ahead, 1st of month)
+    let target_end = calculate_target_end_date();
+    println!("🎯 Target end date: {}", target_end);
+
+    // Scan for existing chunks
+    let existing_chunks = scan_existing_chunks().await?;
+
+    // Determine what range needs to be scraped
+    let scrape_range = determine_scrape_range(&existing_chunks, &target_end);
+
+    if let Some((start, end)) = scrape_range {
+        // Scrape the needed range (saves chunks automatically)
+        scrape_date_range(client, &start, &end).await?;
+        println!("\n✅ Scraping complete!");
+    } else {
+        println!("✅ All data is up to date!");
+    }
+
     Ok(())
 }
 
@@ -515,20 +477,20 @@ async fn main() -> anyhow::Result<()> {
     let mut chromedriver = start_chromedriver(port);
     sleep(Duration::from_secs(1)).await;
 
-    // Chrome options
     let caps_value = serde_json::json!({
         "goog:chromeOptions": {
             "args": [
                 "--disable-gpu",
                 "--disable-notifications",
                 "--disable-popup-blocking",
-                "--disable-blink-features=AutomationControlled"
+                "--disable-blink-features=AutomationControlled",
             ],
             "excludeSwitches": ["enable-automation"]
         }
     });
 
-    let caps_map: Map<String, Value> = caps_value.as_object()
+    let caps_map: Map<String, Value> = caps_value
+        .as_object()
         .expect("Capabilities should be a JSON object")
         .clone();
 
@@ -537,48 +499,60 @@ async fn main() -> anyhow::Result<()> {
         .connect(&format!("http://localhost:{}", port))
         .await?;
 
-    // Setup graceful shutdown on Ctrl+C
+    // Setup graceful shutdown
     let shutdown_client = client.clone();
-    let shutdown_handle = tokio::spawn(async move {
-        signal::ctrl_c().await.expect("Failed to listen for ctrl+c");
+    tokio::spawn(async move {
+        signal::ctrl_c()
+            .await
+            .expect("Failed to listen for ctrl+c");
         println!("\nCtrl+C received, shutting down...");
         shutdown_client.close().await.ok();
-        chromedriver.kill().ok();
         std::process::exit(0);
     });
 
-    // Go to page
+    // Navigate to page
     let url = "https://www.finanzen.net/termine/wirtschaftsdaten/";
     client.goto(url).await?;
 
-    // Dismiss overlays
     dismiss_overlays(&client).await?;
 
-    // Click the high importance tab
-    if let Ok(tab) = client.find(Locator::Css(r#"div[data-sg-tab-item="teletrader-dates-three-stars"]"#)).await {
+    // Click high importance tab
+    if let Ok(tab) = client
+        .find(Locator::Css(
+            r#"div[data-sg-tab-item="teletrader-dates-three-stars"]"#,
+        ))
+        .await
+    {
         tab.click().await?;
-        println!("High importance tab clicked");
+        println!("✓ High importance tab clicked");
         sleep(Duration::from_secs(2)).await;
-    } else {
-        println!("High importance tab not found");
     }
 
-    // Use chunking to extract all events across the entire date range
-    let events = scrape_all_events_with_chunking(&client, "2007-02-13", "2025-12-01").await?;
+    // Run persistent scraping
+    scrape_with_persistence(&client).await?;
 
-    // Run validation suite
-    validate_events(&events).await?;
-    check_data_consistency(&events).await;
+    // Load and display summary
+    let chunks = scan_existing_chunks().await?;
+    let all_events = load_all_events(&chunks).await?;
 
-    // Export for further analysis
-    if let Ok(json) = serde_json::to_string_pretty(&events) {
-        let filename = format!("economic_events_{}_combined.json", chrono::Local::now().format("%Y%m%d_%H%M%S"));
-        tokio::fs::write(&filename, json).await?;
-        println!("  • Combined data exported to: {}", filename);
+    println!("\n📊 FINAL SUMMARY:");
+    println!("  • Total chunks: {}", chunks.len());
+    println!("  • Total events: {}", all_events.len());
+
+    if !chunks.is_empty() {
+        let dates: Vec<String> = all_events.iter()
+            .filter_map(|e| parse_date(&e.date))
+            .map(|d| d.format("%Y-%m-%d").to_string())
+            .collect();
+        if !dates.is_empty() {
+            let min = dates.iter().min().unwrap();
+            let max = dates.iter().max().unwrap();
+            println!("  • Date range: {} to {}", min, max);
+        }
    }
 
-    // Wait for Ctrl+C
-    shutdown_handle.await.ok();
+    client.close().await?;
+    chromedriver.kill()?;
 
     Ok(())
 }
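A standalone sketch of the resume rule that determine_scrape_range implements above, assuming the same ISO-8601 date strings (which order correctly under plain string comparison). resume_start is a hypothetical helper for illustration, not part of the commit:

use chrono::NaiveDate;

// If saved data already reaches past today, refresh from today onward;
// otherwise continue from the day after the newest saved chunk.
fn resume_start(latest_chunk_end: &str, today: &str) -> String {
    if latest_chunk_end >= today {
        today.to_string()
    } else {
        NaiveDate::parse_from_str(latest_chunk_end, "%Y-%m-%d")
            .ok()
            .and_then(|d| d.succ_opt())
            .map(|d| d.format("%Y-%m-%d").to_string())
            .unwrap_or_else(|| latest_chunk_end.to_string())
    }
}

fn main() {
    // Far-behind data resumes the day after the last chunk...
    assert_eq!(resume_start("2007-06-30", "2025-01-15"), "2007-07-01");
    // ...while data that already extends past today is refreshed from today.
    assert_eq!(resume_start("2025-03-01", "2025-01-15"), "2025-01-15");
}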