cleaned up main
This commit is contained in:
3588
economic_events_20251116_185735.json
Normal file
3588
economic_events_20251116_185735.json
Normal file
File diff suppressed because it is too large
Load Diff
297
src/main.rs
297
src/main.rs
@@ -1,3 +1,4 @@
|
|||||||
|
use chrono::{NaiveDate, Duration as ChronoDuration};
|
||||||
use fantoccini::{ClientBuilder, Locator};
|
use fantoccini::{ClientBuilder, Locator};
|
||||||
use serde::Serialize;
|
use serde::Serialize;
|
||||||
use serde_json::{Map, Value};
|
use serde_json::{Map, Value};
|
||||||
@@ -328,6 +329,173 @@ fn extract_month(date_str: &str) -> Option<String> {
|
|||||||
None
|
None
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn calculate_actual_date_range(events: &[EconomicEvent]) -> (String, String) {
|
||||||
|
if events.is_empty() {
|
||||||
|
return ("No data".to_string(), "No data".to_string());
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut dates: Vec<NaiveDate> = events
|
||||||
|
.iter()
|
||||||
|
.filter_map(|e| {
|
||||||
|
// Parse German date format "Dienstag, 2. Januar 2024"
|
||||||
|
extract_date_from_german_format(&e.date)
|
||||||
|
})
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
dates.sort();
|
||||||
|
|
||||||
|
let earliest = dates.first().map(|d| d.format("%Y-%m-%d").to_string())
|
||||||
|
.unwrap_or_else(|| "Unknown".to_string());
|
||||||
|
let latest = dates.last().map(|d| d.format("%Y-%m-%d").to_string())
|
||||||
|
.unwrap_or_else(|| "Unknown".to_string());
|
||||||
|
|
||||||
|
(earliest, latest)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn extract_date_from_german_format(german_date: &str) -> Option<NaiveDate> {
|
||||||
|
// Map German month names to English
|
||||||
|
let month_map = [
|
||||||
|
("Januar", "January"),
|
||||||
|
("Februar", "February"),
|
||||||
|
("März", "March"),
|
||||||
|
("April", "April"),
|
||||||
|
("Mai", "May"),
|
||||||
|
("Juni", "June"),
|
||||||
|
("Juli", "July"),
|
||||||
|
("August", "August"),
|
||||||
|
("September", "September"),
|
||||||
|
("Oktober", "October"),
|
||||||
|
("November", "November"),
|
||||||
|
("Dezember", "December"),
|
||||||
|
];
|
||||||
|
|
||||||
|
let mut english_date = german_date.to_string();
|
||||||
|
for (de, en) in &month_map {
|
||||||
|
english_date = english_date.replace(de, en);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Parse "Tuesday, 2. January 2024" format
|
||||||
|
NaiveDate::parse_from_str(&english_date, "%A, %d. %B %Y").ok()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn calculate_next_start_date(events: &[EconomicEvent]) -> String {
|
||||||
|
// Find the latest date in the extracted events
|
||||||
|
let latest_date = events
|
||||||
|
.iter()
|
||||||
|
.filter_map(|e| extract_date_from_german_format(&e.date))
|
||||||
|
.max();
|
||||||
|
|
||||||
|
if let Some(latest) = latest_date {
|
||||||
|
// Start from the day after the latest extracted date
|
||||||
|
(latest + ChronoDuration::days(1)).format("%Y-%m-%d").to_string()
|
||||||
|
} else {
|
||||||
|
// Fallback: use current date
|
||||||
|
chrono::Local::now().format("%Y-%m-%d").to_string()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn scrape_all_events_with_chunking(
|
||||||
|
client: &fantoccini::Client,
|
||||||
|
start_date: &str,
|
||||||
|
end_date: &str
|
||||||
|
) -> anyhow::Result<Vec<EconomicEvent>> {
|
||||||
|
let mut all_events = Vec::new();
|
||||||
|
let mut current_start = start_date.to_string();
|
||||||
|
let max_attempts = 50; // Prevent infinite loops
|
||||||
|
|
||||||
|
for attempt in 0..max_attempts {
|
||||||
|
println!("🚀 Chunk {}: {} to {}", attempt + 1, current_start, end_date);
|
||||||
|
|
||||||
|
// Set dates for current chunk
|
||||||
|
set_date_range(client, ¤t_start, end_date).await?;
|
||||||
|
|
||||||
|
// Extract events
|
||||||
|
let chunk_events = extract_all_data_via_js(client).await?;
|
||||||
|
|
||||||
|
if chunk_events.is_empty() {
|
||||||
|
println!("✅ No more events found. Completed!");
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Add to total
|
||||||
|
all_events.extend(chunk_events.clone());
|
||||||
|
|
||||||
|
println!("📊 Chunk {}: {} events (Total: {})",
|
||||||
|
attempt + 1, chunk_events.len(), all_events.len());
|
||||||
|
|
||||||
|
// Check if we hit the limit and need to continue
|
||||||
|
if chunk_events.len() < 240 {
|
||||||
|
println!("✅ Reached end of data. Completed!");
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Calculate next start date
|
||||||
|
let next_start = calculate_next_start_date(&chunk_events);
|
||||||
|
if next_start > end_date.to_string() {
|
||||||
|
println!("✅ Reached end date. Completed!");
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
current_start = next_start;
|
||||||
|
|
||||||
|
// Small delay between requests to be polite
|
||||||
|
sleep(Duration::from_secs(2)).await;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Remove duplicates (in case of overlapping chunks)
|
||||||
|
all_events.dedup_by(|a, b| {
|
||||||
|
a.date == b.date && a.time == b.time && a.event == b.event
|
||||||
|
});
|
||||||
|
|
||||||
|
println!("🎯 FINAL: Collected {} unique events", all_events.len());
|
||||||
|
Ok(all_events)
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn set_date_range(client: &fantoccini::Client, start: &str, end: &str) -> anyhow::Result<()> {
|
||||||
|
let set_dates_script = format!(r#"
|
||||||
|
(() => {{
|
||||||
|
const fromInput = document.querySelector('#dtTeletraderFromDate');
|
||||||
|
const toInput = document.querySelector('#dtTeletraderEndDate');
|
||||||
|
|
||||||
|
if (fromInput) {{
|
||||||
|
fromInput.value = '{}';
|
||||||
|
fromInput.dispatchEvent(new Event('input', {{ bubbles: true }}));
|
||||||
|
fromInput.dispatchEvent(new Event('change', {{ bubbles: true }}));
|
||||||
|
}}
|
||||||
|
|
||||||
|
if (toInput) {{
|
||||||
|
toInput.value = '{}';
|
||||||
|
toInput.dispatchEvent(new Event('input', {{ bubbles: true }}));
|
||||||
|
toInput.dispatchEvent(new Event('change', {{ bubbles: true }}));
|
||||||
|
}}
|
||||||
|
|
||||||
|
return !!fromInput && !!toInput;
|
||||||
|
}})()
|
||||||
|
"#, start, end);
|
||||||
|
|
||||||
|
client.execute(&set_dates_script, vec![]).await?;
|
||||||
|
sleep(Duration::from_millis(1000)).await; // Wait for table to update
|
||||||
|
|
||||||
|
// Now read the values
|
||||||
|
let from_date_value: String = client.execute(
|
||||||
|
r#"return document.querySelector('#dtTeletraderFromDate')?.value;"#,
|
||||||
|
vec![],
|
||||||
|
).await?.as_str().unwrap_or_default().to_string();
|
||||||
|
|
||||||
|
let to_date_value: String = client.execute(
|
||||||
|
r#"return document.querySelector('#dtTeletraderEndDate')?.value;"#,
|
||||||
|
vec![],
|
||||||
|
).await?.as_str().unwrap_or_default().to_string();
|
||||||
|
|
||||||
|
if from_date_value == start && to_date_value == end {
|
||||||
|
println!("Dates set correctly");
|
||||||
|
} else {
|
||||||
|
println!("Date not set correctly");
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
#[tokio::main]
|
#[tokio::main]
|
||||||
async fn main() -> anyhow::Result<()> {
|
async fn main() -> anyhow::Result<()> {
|
||||||
let port = 9515; // pick a port you like
|
let port = 9515; // pick a port you like
|
||||||
@@ -371,130 +539,37 @@ async fn main() -> anyhow::Result<()> {
|
|||||||
let url = "https://www.finanzen.net/termine/wirtschaftsdaten/";
|
let url = "https://www.finanzen.net/termine/wirtschaftsdaten/";
|
||||||
client.goto(url).await?;
|
client.goto(url).await?;
|
||||||
|
|
||||||
// Set start and end dates
|
// Use chunking to extract all events across the entire date range
|
||||||
let start_date = "2024-01-01";
|
let events = scrape_all_events_with_chunking(&client, "2007-02-13", "2025-12-01").await?;
|
||||||
let end_date = "2025-01-01";
|
|
||||||
|
|
||||||
let set_dates_script = format!(r#"
|
|
||||||
(() => {{
|
|
||||||
const fromInput = document.querySelector('#dtTeletraderFromDate');
|
|
||||||
const toInput = document.querySelector('#dtTeletraderEndDate');
|
|
||||||
|
|
||||||
if (fromInput) {{
|
|
||||||
fromInput.value = '{}';
|
|
||||||
fromInput.dispatchEvent(new Event('input', {{ bubbles: true }}));
|
|
||||||
fromInput.dispatchEvent(new Event('change', {{ bubbles: true }}));
|
|
||||||
}}
|
|
||||||
|
|
||||||
if (toInput) {{
|
|
||||||
toInput.value = '{}';
|
|
||||||
toInput.dispatchEvent(new Event('input', {{ bubbles: true }}));
|
|
||||||
toInput.dispatchEvent(new Event('change', {{ bubbles: true }}));
|
|
||||||
}}
|
|
||||||
|
|
||||||
return !!fromInput && !!toInput;
|
|
||||||
}})()
|
|
||||||
"#, start_date, end_date);
|
|
||||||
|
|
||||||
// Execute JS to set dates and get the raw response
|
|
||||||
let _ = client.execute(&set_dates_script, vec![]).await;
|
|
||||||
|
|
||||||
// Give React time to process
|
|
||||||
sleep(Duration::from_millis(500)).await;
|
|
||||||
|
|
||||||
// Now read the values
|
|
||||||
let from_date_value: String = client.execute(
|
|
||||||
r#"return document.querySelector('#dtTeletraderFromDate')?.value;"#,
|
|
||||||
vec![],
|
|
||||||
).await?.as_str().unwrap_or_default().to_string();
|
|
||||||
|
|
||||||
let to_date_value: String = client.execute(
|
|
||||||
r#"return document.querySelector('#dtTeletraderEndDate')?.value;"#,
|
|
||||||
vec![],
|
|
||||||
).await?.as_str().unwrap_or_default().to_string();
|
|
||||||
|
|
||||||
println!("From Date: {}", from_date_value);
|
|
||||||
println!("To Date: {}", to_date_value);
|
|
||||||
|
|
||||||
if from_date_value == start_date && to_date_value == end_date {
|
|
||||||
println!("Dates set correctly");
|
|
||||||
} else {
|
|
||||||
println!("Date not set correctly");
|
|
||||||
}
|
|
||||||
|
|
||||||
// Find all table rows
|
|
||||||
let rows = client.find_all(Locator::Css(
|
|
||||||
"#TeletraderForm table.table tbody tr"
|
|
||||||
)).await?;
|
|
||||||
|
|
||||||
println!("Found {} table rows", rows.len());
|
|
||||||
|
|
||||||
// HashMap to store "Termin" -> description
|
|
||||||
let mut event_type_map: HashMap<String, String> = HashMap::new();
|
|
||||||
|
|
||||||
let mut i = 0;
|
|
||||||
while i < rows.len() {
|
|
||||||
let row = &rows[i];
|
|
||||||
|
|
||||||
// Extract all cells
|
|
||||||
let cells = row.find_all(Locator::Css("td")).await?;
|
|
||||||
|
|
||||||
if cells.len() >= 5 {
|
|
||||||
// Get Termin column text
|
|
||||||
let termin_text = cells[4].text().await.unwrap_or_default();
|
|
||||||
|
|
||||||
// Check if next row is a hidden description row
|
|
||||||
if i + 1 < rows.len() {
|
|
||||||
let next_row = &rows[i + 1];
|
|
||||||
let class = next_row.attr("class").await.unwrap_or(None).unwrap_or_default();
|
|
||||||
if class.starts_with("table__td teletrader") {
|
|
||||||
// Get the hidden description
|
|
||||||
let desc_cell = next_row.find(Locator::Css("td")).await?;
|
|
||||||
let desc_text = desc_cell.text().await.unwrap_or_default();
|
|
||||||
event_type_map.insert(termin_text.clone(), desc_text);
|
|
||||||
i += 1; // skip next row since it's the hidden description
|
|
||||||
} else {
|
|
||||||
event_type_map.insert(termin_text.clone(), "".to_string());
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
event_type_map.insert(termin_text.clone(), "".to_string());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
i += 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Extract using JavaScript
|
|
||||||
let events = extract_all_data_via_js(&client).await?;
|
|
||||||
|
|
||||||
// Extract descriptions using JavaScript
|
|
||||||
let event_type_map = extract_event_descriptions_via_js(&client).await?;
|
|
||||||
|
|
||||||
// Merge descriptions with events
|
|
||||||
let events_with_descriptions: Vec<EconomicEvent> = events.clone().into_iter()
|
|
||||||
.map(|mut event| {
|
|
||||||
if let Some(description) = event_type_map.get(&event.event) {
|
|
||||||
event.description = description.clone();
|
|
||||||
}
|
|
||||||
event
|
|
||||||
})
|
|
||||||
.collect();
|
|
||||||
|
|
||||||
// Run validation suite
|
// Run validation suite
|
||||||
validate_events(&events).await?;
|
validate_events(&events).await?;
|
||||||
check_data_consistency(&events).await;
|
check_data_consistency(&events).await;
|
||||||
|
|
||||||
|
// Calculate actual date range from extracted data
|
||||||
|
let actual_date_range = calculate_actual_date_range(&events);
|
||||||
|
let current_date = chrono::Local::now().format("%Y-%m-%d").to_string();
|
||||||
|
|
||||||
// Final summary
|
// Final summary
|
||||||
println!("\n🎯 EXTRACTION SUMMARY:");
|
println!("\n🎯 EXTRACTION SUMMARY:");
|
||||||
println!(" • Total high-importance events: {}", events.len());
|
println!(" • Total high-importance events: {}", events.len());
|
||||||
println!(" • Date range: 2024-01-01 to 2025-01-01");
|
println!(" • Requested range: 2024-01-01 to 2025-01-01");
|
||||||
|
println!(" • Actual extracted range: {} to {}", actual_date_range.0, actual_date_range.1);
|
||||||
|
println!(" • Data extracted until: {}", current_date);
|
||||||
println!(" • Data quality: {}% complete",
|
println!(" • Data quality: {}% complete",
|
||||||
(events.iter().filter(|e| !e.event.trim().is_empty()).count() * 100) / events.len().max(1));
|
(events.iter().filter(|e| !e.event.trim().is_empty()).count() * 100) / events.len().max(1));
|
||||||
|
|
||||||
|
// Check if we need more runs
|
||||||
|
if events.len() >= 240 {
|
||||||
|
println!("⚠️ WARNING: Hit maximum events limit (240). Need multiple runs to get all data.");
|
||||||
|
println!(" • Next run should start from: {}", calculate_next_start_date(&events));
|
||||||
|
}
|
||||||
|
|
||||||
// Export for further analysis
|
// Export for further analysis
|
||||||
if let Ok(json) = serde_json::to_string_pretty(&events) {
|
if let Ok(json) = serde_json::to_string_pretty(&events) {
|
||||||
tokio::fs::write("economic_events.json", json).await?;
|
let filename = format!("economic_events_{}.json", chrono::Local::now().format("%Y%m%d_%H%M%S"));
|
||||||
println!(" • Data exported to: economic_events.json");
|
tokio::fs::write(&filename, json).await?;
|
||||||
|
println!(" • Data exported to: {}", filename);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Wait for Ctrl+C
|
// Wait for Ctrl+C
|
||||||
|
|||||||
Reference in New Issue
Block a user