cleaned up main

This commit is contained in:
2025-11-16 19:18:20 +01:00
parent 67ecc1e89a
commit 3df871f69f
2 changed files with 3777 additions and 114 deletions

File diff suppressed because it is too large Load Diff

View File

@@ -1,3 +1,4 @@
use chrono::{NaiveDate, Duration as ChronoDuration};
use fantoccini::{ClientBuilder, Locator};
use serde::Serialize;
use serde_json::{Map, Value};
@@ -328,6 +329,173 @@ fn extract_month(date_str: &str) -> Option<String> {
None
}
/// Reports the earliest and latest event dates as `(earliest, latest)` strings.
///
/// Each event's `date` field is run through `extract_date_from_german_format`;
/// events whose date does not parse are ignored. Dates are rendered as `%Y-%m-%d`.
///
/// Returns `("No data", "No data")` for an empty slice and
/// `("Unknown", "Unknown")` when no event carries a parseable date.
fn calculate_actual_date_range(events: &[EconomicEvent]) -> (String, String) {
    if events.is_empty() {
        return ("No data".to_string(), "No data".to_string());
    }

    let render = |day: NaiveDate| day.format("%Y-%m-%d").to_string();

    // Track the running minimum and maximum in one pass instead of sorting.
    let bounds = events
        .iter()
        .filter_map(|event| extract_date_from_german_format(&event.date))
        .fold(None, |acc: Option<(NaiveDate, NaiveDate)>, day| match acc {
            None => Some((day, day)),
            Some((lo, hi)) => Some((lo.min(day), hi.max(day))),
        });

    match bounds {
        Some((lo, hi)) => (render(lo), render(hi)),
        None => ("Unknown".to_string(), "Unknown".to_string()),
    }
}
/// Parses a German long-form date such as `"Dienstag, 2. Januar 2024"`.
///
/// A leading weekday (everything up to and including the first comma) is
/// discarded instead of being parsed: chrono's `%A` specifier only matches
/// English weekday names, so a German weekday would make every parse fail.
/// The German month name is translated to English before parsing, so input
/// that already uses English month names is accepted as well.
///
/// Returns `None` when the remaining text does not match
/// `"<day>. <month name> <year>"`.
fn extract_date_from_german_format(german_date: &str) -> Option<NaiveDate> {
    // German month name -> English month name understood by chrono's `%B`.
    const MONTH_MAP: [(&str, &str); 12] = [
        ("Januar", "January"),
        ("Februar", "February"),
        ("März", "March"),
        ("April", "April"),
        ("Mai", "May"),
        ("Juni", "June"),
        ("Juli", "July"),
        ("August", "August"),
        ("September", "September"),
        ("Oktober", "October"),
        ("November", "November"),
        ("Dezember", "December"),
    ];

    // Drop the "<weekday>," prefix if there is one; the weekday carries no
    // information that is needed to build the date.
    let date_part = german_date
        .split_once(',')
        .map_or(german_date, |(_weekday, rest)| rest)
        .trim();

    // A date contains exactly one month name, so stop at the first hit.
    let mut english_date = date_part.to_string();
    for &(de, en) in MONTH_MAP.iter() {
        if english_date.contains(de) {
            english_date = english_date.replacen(de, en, 1);
            break;
        }
    }

    // Parse "2. January 2024".
    NaiveDate::parse_from_str(&english_date, "%d. %B %Y").ok()
}
/// Returns the start date for the next scrape as a `%Y-%m-%d` string.
///
/// The result is the day after the latest event date that
/// `extract_date_from_german_format` can parse. When no event date parses
/// (including an empty slice), the current local date is returned instead.
fn calculate_next_start_date(events: &[EconomicEvent]) -> String {
    let newest = events
        .iter()
        .filter_map(|event| extract_date_from_german_format(&event.date))
        .max();

    match newest {
        // Resume on the day following the last one already covered.
        Some(day) => (day + ChronoDuration::days(1)).format("%Y-%m-%d").to_string(),
        // Nothing usable in the data: fall back to today.
        None => chrono::Local::now().format("%Y-%m-%d").to_string(),
    }
}
/// Scrapes all events between `start_date` and `end_date` in several chunks.
///
/// Each round sets the date range on the page, extracts the events and, if the
/// chunk looks truncated (it holds at least `PAGE_EVENT_LIMIT` events),
/// continues from the date returned by `calculate_next_start_date`.
/// The loop stops when a chunk is empty or short, when the next start date lies
/// after `end_date`, when the next start date does not advance, or after
/// `MAX_CHUNKS` rounds.
///
/// `start_date` and `end_date` are compared as strings against `%Y-%m-%d`
/// values, so they are expected in that format.
///
/// Returns the collected events with duplicates (same date, time and event
/// name) removed; the first occurrence is kept and order is preserved.
///
/// # Errors
/// Propagates any error from `set_date_range` or `extract_all_data_via_js`.
async fn scrape_all_events_with_chunking(
    client: &fantoccini::Client,
    start_date: &str,
    end_date: &str
) -> anyhow::Result<Vec<EconomicEvent>> {
    /// Upper bound on the number of chunks, to prevent infinite loops.
    const MAX_CHUNKS: usize = 50;
    /// A chunk with at least this many events is treated as truncated.
    const PAGE_EVENT_LIMIT: usize = 240;

    let mut all_events: Vec<EconomicEvent> = Vec::new();
    let mut current_start = start_date.to_string();

    for attempt in 0..MAX_CHUNKS {
        println!("🚀 Chunk {}: {} to {}", attempt + 1, current_start, end_date);

        // Set dates for current chunk
        set_date_range(client, &current_start, end_date).await?;

        // Extract events
        let chunk_events = extract_all_data_via_js(client).await?;
        if chunk_events.is_empty() {
            println!("✅ No more events found. Completed!");
            break;
        }

        // Everything needed from the chunk is computed up front so the events
        // can be moved into the result instead of cloned.
        let chunk_len = chunk_events.len();
        let next_start = calculate_next_start_date(&chunk_events);

        all_events.extend(chunk_events);
        println!("📊 Chunk {}: {} events (Total: {})",
            attempt + 1, chunk_len, all_events.len());

        // Check if we hit the limit and need to continue
        if chunk_len < PAGE_EVENT_LIMIT {
            println!("✅ Reached end of data. Completed!");
            break;
        }

        // `%Y-%m-%d` strings order lexicographically like the dates they represent.
        if next_start.as_str() > end_date {
            println!("✅ Reached end date. Completed!");
            break;
        }

        // Without progress every further round would request the same range
        // and only pile up duplicates.
        if next_start <= current_start {
            println!("⚠️ Next start date {} does not advance past {}. Stopping.",
                next_start, current_start);
            break;
        }
        current_start = next_start;

        // Small delay between requests to be polite
        sleep(Duration::from_secs(2)).await;
    }

    // `Vec::dedup_by` only collapses adjacent equal elements, so repeats coming
    // from different chunks would survive it. Remember every (date, time, event)
    // key seen so far and keep only the first occurrence.
    let mut seen = std::collections::HashSet::new();
    all_events.retain(|event| {
        seen.insert((event.date.clone(), event.time.clone(), event.event.clone()))
    });

    println!("🎯 FINAL: Collected {} unique events", all_events.len());
    Ok(all_events)
}
/// Writes `start` and `end` into the page's date inputs and prints whether they stuck.
///
/// The values are injected via JavaScript into `#dtTeletraderFromDate` and
/// `#dtTeletraderEndDate`, each followed by bubbling `input` and `change`
/// events. After a one-second pause both inputs are read back and compared with
/// the requested values. The outcome is only printed; a mismatch is not
/// reported as an error.
///
/// # Errors
/// Returns an error if any of the script executions fails.
async fn set_date_range(client: &fantoccini::Client, start: &str, end: &str) -> anyhow::Result<()> {
    /// Runs `script` in the page and returns its result as a string
    /// (empty when the result is not a JSON string).
    async fn read_string(client: &fantoccini::Client, script: &str) -> anyhow::Result<String> {
        let result = client.execute(script, vec![]).await?;
        Ok(result.as_str().unwrap_or_default().to_string())
    }

    let apply_script = format!(r#"
(() => {{
const fromInput = document.querySelector('#dtTeletraderFromDate');
const toInput = document.querySelector('#dtTeletraderEndDate');
if (fromInput) {{
fromInput.value = '{}';
fromInput.dispatchEvent(new Event('input', {{ bubbles: true }}));
fromInput.dispatchEvent(new Event('change', {{ bubbles: true }}));
}}
if (toInput) {{
toInput.value = '{}';
toInput.dispatchEvent(new Event('input', {{ bubbles: true }}));
toInput.dispatchEvent(new Event('change', {{ bubbles: true }}));
}}
return !!fromInput && !!toInput;
}})()
"#, start, end);
    client.execute(&apply_script, vec![]).await?;

    // Give the page a moment to react before reading the inputs back.
    sleep(Duration::from_millis(1000)).await;

    let shown_from = read_string(
        client,
        r#"return document.querySelector('#dtTeletraderFromDate')?.value;"#,
    )
    .await?;
    let shown_to = read_string(
        client,
        r#"return document.querySelector('#dtTeletraderEndDate')?.value;"#,
    )
    .await?;

    let applied = shown_from == start && shown_to == end;
    println!("{}", if applied { "Dates set correctly" } else { "Date not set correctly" });

    Ok(())
}
#[tokio::main]
async fn main() -> anyhow::Result<()> {
let port = 9515; // pick a port you like
@@ -371,130 +539,37 @@ async fn main() -> anyhow::Result<()> {
let url = "https://www.finanzen.net/termine/wirtschaftsdaten/";
client.goto(url).await?;
// Set start and end dates
let start_date = "2024-01-01";
let end_date = "2025-01-01";
let set_dates_script = format!(r#"
(() => {{
const fromInput = document.querySelector('#dtTeletraderFromDate');
const toInput = document.querySelector('#dtTeletraderEndDate');
if (fromInput) {{
fromInput.value = '{}';
fromInput.dispatchEvent(new Event('input', {{ bubbles: true }}));
fromInput.dispatchEvent(new Event('change', {{ bubbles: true }}));
}}
if (toInput) {{
toInput.value = '{}';
toInput.dispatchEvent(new Event('input', {{ bubbles: true }}));
toInput.dispatchEvent(new Event('change', {{ bubbles: true }}));
}}
return !!fromInput && !!toInput;
}})()
"#, start_date, end_date);
// Execute JS to set dates and get the raw response
let _ = client.execute(&set_dates_script, vec![]).await;
// Give React time to process
sleep(Duration::from_millis(500)).await;
// Now read the values
let from_date_value: String = client.execute(
r#"return document.querySelector('#dtTeletraderFromDate')?.value;"#,
vec![],
).await?.as_str().unwrap_or_default().to_string();
let to_date_value: String = client.execute(
r#"return document.querySelector('#dtTeletraderEndDate')?.value;"#,
vec![],
).await?.as_str().unwrap_or_default().to_string();
println!("From Date: {}", from_date_value);
println!("To Date: {}", to_date_value);
if from_date_value == start_date && to_date_value == end_date {
println!("Dates set correctly");
} else {
println!("Date not set correctly");
}
// Find all table rows
let rows = client.find_all(Locator::Css(
"#TeletraderForm table.table tbody tr"
)).await?;
println!("Found {} table rows", rows.len());
// HashMap to store "Termin" -> description
let mut event_type_map: HashMap<String, String> = HashMap::new();
let mut i = 0;
while i < rows.len() {
let row = &rows[i];
// Extract all cells
let cells = row.find_all(Locator::Css("td")).await?;
if cells.len() >= 5 {
// Get Termin column text
let termin_text = cells[4].text().await.unwrap_or_default();
// Check if next row is a hidden description row
if i + 1 < rows.len() {
let next_row = &rows[i + 1];
let class = next_row.attr("class").await.unwrap_or(None).unwrap_or_default();
if class.starts_with("table__td teletrader") {
// Get the hidden description
let desc_cell = next_row.find(Locator::Css("td")).await?;
let desc_text = desc_cell.text().await.unwrap_or_default();
event_type_map.insert(termin_text.clone(), desc_text);
i += 1; // skip next row since it's the hidden description
} else {
event_type_map.insert(termin_text.clone(), "".to_string());
}
} else {
event_type_map.insert(termin_text.clone(), "".to_string());
}
}
i += 1;
}
// Extract using JavaScript
let events = extract_all_data_via_js(&client).await?;
// Extract descriptions using JavaScript
let event_type_map = extract_event_descriptions_via_js(&client).await?;
// Merge descriptions with events
let events_with_descriptions: Vec<EconomicEvent> = events.clone().into_iter()
.map(|mut event| {
if let Some(description) = event_type_map.get(&event.event) {
event.description = description.clone();
}
event
})
.collect();
// Use chunking to extract all events across the entire date range
let events = scrape_all_events_with_chunking(&client, "2007-02-13", "2025-12-01").await?;
// Run validation suite
validate_events(&events).await?;
check_data_consistency(&events).await;
// Calculate actual date range from extracted data
let actual_date_range = calculate_actual_date_range(&events);
let current_date = chrono::Local::now().format("%Y-%m-%d").to_string();
// Final summary
println!("\n🎯 EXTRACTION SUMMARY:");
println!(" • Total high-importance events: {}", events.len());
println!("Date range: 2024-01-01 to 2025-01-01");
println!("Requested range: 2024-01-01 to 2025-01-01");
println!(" • Actual extracted range: {} to {}", actual_date_range.0, actual_date_range.1);
println!(" • Data extracted until: {}", current_date);
println!(" • Data quality: {}% complete",
(events.iter().filter(|e| !e.event.trim().is_empty()).count() * 100) / events.len().max(1));
// Check if we need more runs
if events.len() >= 240 {
println!("⚠️ WARNING: Hit maximum events limit (240). Need multiple runs to get all data.");
println!(" • Next run should start from: {}", calculate_next_start_date(&events));
}
// Export for further analysis
if let Ok(json) = serde_json::to_string_pretty(&events) {
tokio::fs::write("economic_events.json", json).await?;
println!(" • Data exported to: economic_events.json");
let filename = format!("economic_events_{}.json", chrono::Local::now().format("%Y%m%d_%H%M%S"));
tokio::fs::write(&filename, json).await?;
println!(" • Data exported to: {}", filename);
}
// Wait for Ctrl+C