Compare commits
4 Commits
67ecc1e89a
...
a44e22df0b
| Author | SHA1 | Date | |
|---|---|---|---|
| a44e22df0b | |||
| b8c98163da | |||
| 6302c8749a | |||
| 3df871f69f |
2
.gitignore
vendored
2
.gitignore
vendored
@@ -23,4 +23,4 @@ target/
|
||||
/target
|
||||
|
||||
/chromedriver-win64/*
|
||||
/economic_events.json
|
||||
/economic_events*
|
||||
39
Cargo.lock
generated
39
Cargo.lock
generated
@@ -10,11 +10,21 @@ dependencies = [
|
||||
"chrono",
|
||||
"fantoccini",
|
||||
"futures",
|
||||
"regex",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"tokio",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "aho-corasick"
|
||||
version = "1.1.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301"
|
||||
dependencies = [
|
||||
"memchr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "android_system_properties"
|
||||
version = "0.1.5"
|
||||
@@ -819,6 +829,35 @@ dependencies = [
|
||||
"bitflags",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "regex"
|
||||
version = "1.12.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "843bc0191f75f3e22651ae5f1e72939ab2f72a4bc30fa80a066bd66edefc24d4"
|
||||
dependencies = [
|
||||
"aho-corasick",
|
||||
"memchr",
|
||||
"regex-automata",
|
||||
"regex-syntax",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "regex-automata"
|
||||
version = "0.4.13"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5276caf25ac86c8d810222b3dbb938e512c55c6831a10f3e6ed1c93b84041f1c"
|
||||
dependencies = [
|
||||
"aho-corasick",
|
||||
"memchr",
|
||||
"regex-syntax",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "regex-syntax"
|
||||
version = "0.8.8"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7a2d987857b319362043e95f5353c0535c1f58eec5336fdfcf626430af7def58"
|
||||
|
||||
[[package]]
|
||||
name = "rustix"
|
||||
version = "1.1.2"
|
||||
|
||||
@@ -10,4 +10,5 @@ serde = { version = "1", features = ["derive"] }
|
||||
anyhow = "1.0"
|
||||
futures = "0.3"
|
||||
serde_json = "1.0"
|
||||
chrono = "0.4.42"
|
||||
chrono = "0.4.42"
|
||||
regex = "1.0"
|
||||
247
README.md
247
README.md
@@ -1,4 +1,249 @@
|
||||
# WebScraper
|
||||
# WebScraper — Wirtschaftskalender Datenextraktion
|
||||
|
||||
Ein leistungsstarker Web-Scraper in **Rust**, der hochwichtige Wirtschaftsereignisse von **finanzen.net** extrahiert und analysiert.
|
||||
|
||||
---
|
||||
|
||||
## 📋 Projektübersicht
|
||||
|
||||
Dieses Tool automatisiert die Extraktion von Wirtschaftsdaten aus dem Finanzen.net Wirtschaftskalender, mit besonderem Fokus auf hochwichtige Ereignisse (3 gelbe Sterne). Die extrahierten Daten werden in strukturiertem JSON-Format gespeichert und umfassen umfangreiche Metadaten für weitere Analysen.
|
||||
|
||||
---
|
||||
|
||||
## ✨ Hauptfunktionen
|
||||
|
||||
* **Selektive Extraktion:** Fokussiert ausschließlich auf hochwichtige Wirtschaftsereignisse (3 gelbe Sterne).
|
||||
* **Intelligentes Chunking:** Automatische Aufteilung großer Datumsbereiche in handhabbare Blöcke.
|
||||
* **Robuste Datumsverarbeitung:** Unterstützung für deutsche und internationale Datumsformate.
|
||||
* **Datenkonsistenzprüfung:** Umfassende Validierung der extrahierten Daten.
|
||||
* **Duplikaterkennung:** Automatische Erkennung und Entfernung doppelter Einträge.
|
||||
* **Graceful Shutdown:** Elegante Behandlung von Abbruchsignalen (Ctrl+C).
|
||||
* **Echtzeit-Export:** Parallele Speicherung von Zwischen- und Endergebnissen.
|
||||
|
||||
---
|
||||
|
||||
## 🛠 Technischer Stack
|
||||
|
||||
* **Programmiersprache:** Rust
|
||||
* **Web Automation:** Fantoccini (WebDriver Client)
|
||||
* **Datum/Zeit:** Chrono
|
||||
* **JSON-Verarbeitung:** Serde, serde_json
|
||||
* **Asynchrone Verarbeitung:** Tokio
|
||||
* **Browser-Treiber:** ChromeDriver
|
||||
|
||||
---
|
||||
|
||||
## 📁 Projektstruktur
|
||||
|
||||
```
|
||||
WebScraper/
|
||||
├── src/
|
||||
│ └── main.rs # Hauptanwendungslogik
|
||||
├── chromedriver-win64/ # ChromeDriver Binary
|
||||
├── Cargo.toml # Rust Abhängigkeiten
|
||||
├── Cargo.lock # Versionssperren
|
||||
├── countries.json # Länderreferenzdaten
|
||||
├── continents.json # Kontinentreferenzdaten
|
||||
└── README.md # Diese Datei
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 📊 Datenmodell
|
||||
|
||||
Extrahiert werden `EconomicEvent`-Strukturen mit folgenden Feldern:
|
||||
|
||||
```rust
|
||||
struct EconomicEvent {
|
||||
country: String, // Herkunftsland
|
||||
date: String, // Datum (ISO-Format)
|
||||
time: String, // Uhrzeit
|
||||
event: String, // Ereignisname
|
||||
actual: String, // Tatsächlicher Wert
|
||||
forecast: String, // Prognosewert
|
||||
previous: String, // Vorheriger Wert
|
||||
importance: String, // Wichtigkeit (z. B. "High")
|
||||
description: String // Beschreibung
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 🚀 Installation & Einrichtung
|
||||
|
||||
### Voraussetzungen
|
||||
|
||||
* **Rust Toolchain** installieren:
|
||||
|
||||
```bash
|
||||
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs/ | sh
|
||||
```
|
||||
|
||||
* **ChromeDriver** herunterladen:
|
||||
|
||||
* Webseite: `https://chromedriver.storage.googleapis.com/index.html`
|
||||
* Oder: `https://googlechromelabs.github.io/chrome-for-testing/`
|
||||
* Entpacke in `chromedriver-win64/` Verzeichnis
|
||||
|
||||
* **Chrome Browser** muss installiert sein.
|
||||
|
||||
### Build & Ausführung
|
||||
|
||||
```bash
|
||||
# Projekt klonen/erstellen
|
||||
git clone <repository-url>
|
||||
cd WebScraper
|
||||
|
||||
# Abhängigkeiten herunterladen
|
||||
cargo fetch
|
||||
|
||||
# Projekt kompilieren und ausführen
|
||||
cargo run --release
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## ⚙️ Konfiguration
|
||||
|
||||
### Datumsbereich
|
||||
|
||||
Standardmäßig extrahiert der Scraper Daten zwischen konfigurierbaren Grenzen. Beispiel-Aufruf in `main()`:
|
||||
|
||||
```rust
|
||||
let events = scrape_all_events_with_chunking(&client, "2007-02-13", "2025-12-01").await?;
|
||||
```
|
||||
|
||||
### Chrome-Optionen
|
||||
|
||||
Chrome-Verhalten kann in den Capabilities angepasst werden, z. B.:
|
||||
|
||||
```json
|
||||
"args": [
|
||||
"--disable-gpu",
|
||||
"--disable-notifications",
|
||||
"--disable-popup-blocking",
|
||||
"--disable-blink-features=AutomationControlled"
|
||||
]
|
||||
```
|
||||
|
||||
> Hinweis: Für Headless- oder Headful-Ausführung kann das `--headless`-Flag je nach Use Case hinzugefügt oder entfernt werden.
|
||||
|
||||
---
|
||||
|
||||
## 📈 Ausführungsablauf
|
||||
|
||||
1. **Initialisierung:** ChromeDriver starten, Browser-Session aufbauen
|
||||
2. **Navigation:** Zielseite (`https://www.finanzen.net/termine/wirtschaftsdaten/`) aufrufen
|
||||
3. **Overlay-Handling:** Störende Elemente (Cookie/Consent) entfernen oder umgehen
|
||||
4. **Tab-Auswahl:** Hochwichtige Ereignisse (3 Sterne) auswählen
|
||||
5. **Chunked Extraction:**
|
||||
|
||||
* Datumsbereich in Blöcke aufteilen
|
||||
* JavaScript-basierte Datenextraktion
|
||||
* Automatische Paginierung / "Load more"
|
||||
6. **Datenvalidierung:** Konsistenz- und Qualitätsprüfungen
|
||||
7. **Export:** JSON-Dateien mit Zeitstempel generieren
|
||||
|
||||
---
|
||||
|
||||
## 🔍 Datenqualitätsprüfungen
|
||||
|
||||
Der Scraper führt folgende Prüfungen durch:
|
||||
|
||||
* **Duplikaterkennung:** Identische Events werden entfernt
|
||||
* **Zeitformat-Validierung:** Korrekte `HH:MM` Formatierung
|
||||
* **Datumsbereichsprüfung:** Extrahierte Events liegen im Zielzeitraum
|
||||
* **Vollständigkeitscheck:** Kritische Felder müssen vorhanden sein
|
||||
* **Beschreibungsabdeckung:** Prüft, ob Beschreibungen für Events vorhanden sind
|
||||
* **Länder-/Monatsverteilung:** Statistische Auswertung
|
||||
|
||||
---
|
||||
|
||||
## 📤 Ausgabeformate
|
||||
|
||||
**Hauptexport**
|
||||
|
||||
* `economic_events_YYYYMMDD_HHMMSS_combined.json` — Vollständiger Datensatz
|
||||
|
||||
**Chunk-Exporte**
|
||||
|
||||
* `economic_events_YYYYMMDD_HHMMSS_chunk_X.json` — Zwischenstände pro Block
|
||||
|
||||
### Beispiel-Eintrag (JSON)
|
||||
|
||||
```json
|
||||
{
|
||||
"country": "USA",
|
||||
"date": "2024-01-15",
|
||||
"time": "14:30",
|
||||
"event": "Verbraucherpreisindex (CPI)",
|
||||
"actual": "3.4%",
|
||||
"forecast": "3.2%",
|
||||
"previous": "3.1%",
|
||||
"importance": "High",
|
||||
"description": "Monatliche Inflationsdaten für die USA"
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 🛡️ Fehlerbehandlung
|
||||
|
||||
* **Automatische Wiederholung:** Bei fehlgeschlagenen Extraktionen
|
||||
* **Graceful Degradation:** Fallback-Logiken für Datumsparsing
|
||||
* **Timeout-Management:** Angemessene Wartezeiten zwischen Interaktionen
|
||||
* **Ressourcenbereinigung:** Korrektes Schließen von Browser und Treiber
|
||||
|
||||
---
|
||||
|
||||
## 📊 Leistungsmerkmale
|
||||
|
||||
* **Parallelverarbeitung:** Asynchrone Operationen mit Tokio
|
||||
* **Speichereffizienz:** Chunk-basierte Verarbeitung großer Datensätze
|
||||
* **Netzwerkoptimierung:** Intelligente Delays zwischen Requests
|
||||
* **Robustheit:** Widerstandsfähig gegen Seitenänderungen
|
||||
|
||||
---
|
||||
|
||||
## 🔧 Entwicklung
|
||||
|
||||
**Abhängigkeiten hinzufügen**
|
||||
|
||||
```bash
|
||||
cargo add <crate-name>
|
||||
```
|
||||
|
||||
**Debug-Modus**
|
||||
|
||||
```bash
|
||||
cargo run
|
||||
```
|
||||
|
||||
**Release-Build**
|
||||
|
||||
```bash
|
||||
cargo build --release
|
||||
```
|
||||
|
||||
**Tests ausführen**
|
||||
|
||||
```bash
|
||||
cargo test
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 🌐 Länderabdeckung
|
||||
|
||||
Der Scraper unterstützt 52 Länder und Regionen (siehe `countries.json`), darunter:
|
||||
|
||||
* USA, China, Deutschland, Japan, UK
|
||||
* Eurozone, Schweiz, Kanada, Australien
|
||||
* und viele weitere wichtige Volkswirtschaften
|
||||
|
||||
---
|
||||
|
||||
## chromedriver Download
|
||||
|
||||
https://chromedriver.storage.googleapis.com/index.html
|
||||
https://googlechromelabs.github.io/chrome-for-testing/
|
||||
509
src/main.rs
509
src/main.rs
@@ -1,3 +1,4 @@
|
||||
use chrono::{NaiveDate, Duration as ChronoDuration};
|
||||
use fantoccini::{ClientBuilder, Locator};
|
||||
use serde::Serialize;
|
||||
use serde_json::{Map, Value};
|
||||
@@ -59,7 +60,30 @@ async fn extract_all_data_via_js(client: &fantoccini::Client) -> anyhow::Result<
|
||||
const cells = row.querySelectorAll('td');
|
||||
|
||||
if (cells.length === 1 && cells[0].colSpan === 9) {
|
||||
currentDate = cells[0].textContent.trim();
|
||||
// This is a date header row - extract and parse the date
|
||||
const dateText = cells[0].textContent.trim();
|
||||
console.log('Found date header:', dateText);
|
||||
|
||||
// Convert German date to ISO format (YYYY-MM-DD)
|
||||
const monthMap = {
|
||||
'Januar': '01', 'Februar': '02', 'März': '03', 'April': '04',
|
||||
'Mai': '05', 'Juni': '06', 'Juli': '07', 'August': '08',
|
||||
'September': '09', 'Oktober': '10', 'November': '11', 'Dezember': '12'
|
||||
};
|
||||
|
||||
// Extract date parts from German format "Montag, 30. April 2007"
|
||||
const dateParts = dateText.match(/(\d{1,2})\.\s+([a-zA-Zäöüß]+)\s+(\d{4})/);
|
||||
if (dateParts) {
|
||||
const day = dateParts[1].padStart(2, '0');
|
||||
const germanMonth = dateParts[2];
|
||||
const year = dateParts[3];
|
||||
const month = monthMap[germanMonth] || '01';
|
||||
currentDate = `${year}-${month}-${day}`;
|
||||
console.log('Converted date:', currentDate, 'from:', dateText);
|
||||
} else {
|
||||
console.log('Failed to parse date:', dateText);
|
||||
currentDate = ''; // Reset if parsing fails
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
@@ -90,7 +114,7 @@ async fn extract_all_data_via_js(client: &fantoccini::Client) -> anyhow::Result<
|
||||
|
||||
events.push({
|
||||
country: country,
|
||||
date: currentDate,
|
||||
date: currentDate, // Now using ISO format date
|
||||
time: time,
|
||||
event: eventName,
|
||||
actual: cells[7]?.textContent?.trim() || '',
|
||||
@@ -103,6 +127,12 @@ async fn extract_all_data_via_js(client: &fantoccini::Client) -> anyhow::Result<
|
||||
}
|
||||
}
|
||||
|
||||
console.log('Total events extracted:', events.length);
|
||||
if (events.length > 0) {
|
||||
console.log('First event date:', events[0].date);
|
||||
console.log('Last event date:', events[events.length - 1].date);
|
||||
}
|
||||
|
||||
return events;
|
||||
"#;
|
||||
|
||||
@@ -128,69 +158,31 @@ async fn extract_all_data_via_js(client: &fantoccini::Client) -> anyhow::Result<
|
||||
}
|
||||
}
|
||||
println!("Extracted {} events (3 YELLOW stars ONLY) via JavaScript", events.len());
|
||||
|
||||
// Debug: show date range of extracted events
|
||||
if !events.is_empty() {
|
||||
let dates: Vec<&str> = events.iter().map(|e| e.date.as_str()).filter(|d| !d.is_empty()).collect();
|
||||
if !dates.is_empty() {
|
||||
let min_date = dates.iter().min().unwrap_or(&"N/A");
|
||||
let max_date = dates.iter().max().unwrap_or(&"N/A");
|
||||
println!("📅 Extracted date range: {} to {}", min_date, max_date);
|
||||
|
||||
// Show sample of dates for debugging
|
||||
println!("Sample dates:");
|
||||
for (i, date) in dates.iter().take(5).enumerate() {
|
||||
println!(" {}. {}", i + 1, date);
|
||||
}
|
||||
} else {
|
||||
println!("❌ No valid dates found in extracted events");
|
||||
}
|
||||
}
|
||||
|
||||
return Ok(events);
|
||||
}
|
||||
|
||||
Ok(vec![])
|
||||
}
|
||||
|
||||
/// Collects the description text of every 3-yellow-star event on the current page.
///
/// Runs a script in the browser that walks all description cells (class
/// containing `teletrader`), looks at the table row directly above each one,
/// and — when that row carries exactly three yellow stars — records the
/// description under the event name found in the row's fifth column.
///
/// Returns a map from event name to description text. Entries whose value is
/// not a JSON string are ignored.
async fn extract_event_descriptions_via_js(client: &fantoccini::Client) -> anyhow::Result<HashMap<String, String>> {
    println!("Extracting event descriptions via JavaScript (3 YELLOW stars only)...");

    let description_script = r#"
        const descriptions = {};

        // Find all description rows (they have class starting with 'teletrader')
        const descRows = document.querySelectorAll('tr td[class*="teletrader"]');

        for (const descRow of descRows) {
            // Get the description text from the <p> tag
            const descPara = descRow.querySelector('p');
            if (descPara) {
                const description = descPara.textContent?.trim() || '';

                // Find the corresponding event name by looking for the row above
                let eventRow = descRow.parentElement.previousElementSibling;
                if (eventRow) {
                    // Check if this is a 3 YELLOW star event
                    const importanceCell = eventRow.querySelector('td:nth-child(4)');
                    if (importanceCell) {
                        // Count ONLY YELLOW stars
                        const yellowStarCount = importanceCell.querySelectorAll('.icon--star.font-color-yellow').length;

                        // Only process events with 3 YELLOW stars
                        if (yellowStarCount === 3) {
                            const eventCell = eventRow.querySelector('td:nth-child(5)');
                            if (eventCell) {
                                const eventName = eventCell.textContent?.trim() || '';
                                if (eventName) {
                                    descriptions[eventName] = description;
                                }
                            }
                        }
                    }
                }
            }
        }

        return descriptions;
    "#;

    let script_result = client.execute(description_script, vec![]).await?;

    // A non-object result simply yields an empty map; non-string values are skipped.
    let event_type_map: HashMap<String, String> = script_result
        .as_object()
        .into_iter()
        .flatten()
        .filter_map(|(event_name, raw_text)| {
            raw_text
                .as_str()
                .map(|text| (event_name.clone(), text.to_string()))
        })
        .collect();

    println!("Extracted {} event descriptions (3 YELLOW stars only)", event_type_map.len());
    Ok(event_type_map)
}
|
||||
|
||||
async fn check_data_consistency(events: &[EconomicEvent]) {
|
||||
println!("\n=== DATA CONSISTENCY CHECKS ===");
|
||||
|
||||
@@ -328,17 +320,269 @@ fn extract_month(date_str: &str) -> Option<String> {
|
||||
None
|
||||
}
|
||||
|
||||
fn calculate_actual_date_range(events: &[EconomicEvent]) -> (String, String) {
|
||||
if events.is_empty() {
|
||||
return ("No data".to_string(), "No data".to_string());
|
||||
}
|
||||
|
||||
let mut dates: Vec<NaiveDate> = events
|
||||
.iter()
|
||||
.filter_map(|e| {
|
||||
// Parse German date format "Dienstag, 2. Januar 2024"
|
||||
extract_date_from_german_format(&e.date)
|
||||
})
|
||||
.collect();
|
||||
|
||||
dates.sort();
|
||||
|
||||
let earliest = dates.first().map(|d| d.format("%Y-%m-%d").to_string())
|
||||
.unwrap_or_else(|| "Unknown".to_string());
|
||||
let latest = dates.last().map(|d| d.format("%Y-%m-%d").to_string())
|
||||
.unwrap_or_else(|| "Unknown".to_string());
|
||||
|
||||
(earliest, latest)
|
||||
}
|
||||
|
||||
fn extract_date_from_german_format(german_date: &str) -> Option<NaiveDate> {
|
||||
// Map German month names to English
|
||||
let month_map = [
|
||||
("Januar", "January"),
|
||||
("Februar", "February"),
|
||||
("März", "March"),
|
||||
("April", "April"),
|
||||
("Mai", "May"),
|
||||
("Juni", "June"),
|
||||
("Juli", "July"),
|
||||
("August", "August"),
|
||||
("September", "September"),
|
||||
("Oktober", "October"),
|
||||
("November", "November"),
|
||||
("Dezember", "December"),
|
||||
];
|
||||
|
||||
let mut english_date = german_date.to_string();
|
||||
for (de, en) in &month_map {
|
||||
english_date = english_date.replace(de, en);
|
||||
}
|
||||
|
||||
// Parse "Tuesday, 2. January 2024" format
|
||||
NaiveDate::parse_from_str(&english_date, "%A, %d. %B %Y").ok()
|
||||
}
|
||||
|
||||
fn parse_german_date(german_date: &str) -> Option<NaiveDate> {
|
||||
if german_date.trim().is_empty() {
|
||||
return None;
|
||||
}
|
||||
|
||||
// Map German month names to numbers
|
||||
let month_map = [
|
||||
("Januar", 1), ("Februar", 2), ("März", 3), ("April", 4),
|
||||
("Mai", 5), ("Juni", 6), ("Juli", 7), ("August", 8),
|
||||
("September", 9), ("Oktober", 10), ("November", 11), ("Dezember", 12)
|
||||
];
|
||||
|
||||
// Parse German format: "Montag, 30. April 2007"
|
||||
let pattern = r"(\d{1,2})\.\s+([a-zA-Zäöüß]+)\s+(\d{4})";
|
||||
let re = regex::Regex::new(pattern).unwrap();
|
||||
|
||||
if let Some(caps) = re.captures(german_date) {
|
||||
let day = caps.get(1).unwrap().as_str().parse::<u32>().ok()?;
|
||||
let german_month = caps.get(2).unwrap().as_str();
|
||||
let year = caps.get(3).unwrap().as_str().parse::<i32>().ok()?;
|
||||
|
||||
// Find the month number
|
||||
let month = month_map.iter()
|
||||
.find(|(name, _)| *name == german_month)
|
||||
.map(|(_, num)| *num)?;
|
||||
|
||||
NaiveDate::from_ymd_opt(year, month, day)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
fn calculate_next_start_date(events: &[EconomicEvent]) -> Result<String, anyhow::Error> {
|
||||
// Try to find dates in ISO format first
|
||||
let iso_dates: Vec<NaiveDate> = events
|
||||
.iter()
|
||||
.filter_map(|e| NaiveDate::parse_from_str(&e.date, "%Y-%m-%d").ok())
|
||||
.collect();
|
||||
|
||||
if !iso_dates.is_empty() {
|
||||
if let Some(latest) = iso_dates.iter().max() {
|
||||
let next_date = (*latest + ChronoDuration::days(1)).format("%Y-%m-%d").to_string();
|
||||
println!("📅 Calculated next start date from ISO: {} (from latest: {})", next_date, latest);
|
||||
return Ok(next_date);
|
||||
}
|
||||
}
|
||||
|
||||
// Fallback: parse German dates
|
||||
println!("⚠️ No ISO dates found, trying to parse German dates...");
|
||||
let german_dates: Vec<NaiveDate> = events
|
||||
.iter()
|
||||
.filter_map(|e| parse_german_date(&e.date))
|
||||
.collect();
|
||||
|
||||
if let Some(latest) = german_dates.iter().max() {
|
||||
let next_date = (*latest + ChronoDuration::days(1)).format("%Y-%m-%d").to_string();
|
||||
println!("📅 Calculated next start date from German: {} (from latest: {})", next_date, latest);
|
||||
Ok(next_date)
|
||||
} else {
|
||||
// Final fallback: use manual date increment
|
||||
println!("❌ No parseable dates found, using manual increment");
|
||||
Err(anyhow::anyhow!("No parseable dates found"))
|
||||
}
|
||||
}
|
||||
|
||||
async fn scrape_all_events_with_chunking(
|
||||
client: &fantoccini::Client,
|
||||
start_date: &str,
|
||||
end_date: &str
|
||||
) -> anyhow::Result<Vec<EconomicEvent>> {
|
||||
let json_export_now = chrono::Local::now().format("%Y%m%d_%H%M%S");
|
||||
|
||||
let mut all_events: Vec<EconomicEvent> = Vec::new();
|
||||
let mut current_start = start_date.to_string();
|
||||
let mut attempts = 0;
|
||||
let max_attempts = 300;
|
||||
|
||||
loop {
|
||||
attempts += 1;
|
||||
if attempts > max_attempts {
|
||||
println!("⚠️ Reached maximum attempts ({})", max_attempts);
|
||||
break;
|
||||
}
|
||||
|
||||
println!("🚀 Chunk {}: {} to {}", attempts, current_start, end_date);
|
||||
|
||||
// Set dates for current chunk
|
||||
set_date_range(client, ¤t_start, end_date).await?;
|
||||
|
||||
// Wait a bit longer for table to load
|
||||
sleep(Duration::from_secs(3)).await;
|
||||
|
||||
// Extract events
|
||||
let chunk_events = extract_all_data_via_js(client).await?;
|
||||
|
||||
if chunk_events.is_empty() {
|
||||
println!("✅ No more events found. Completed!");
|
||||
break;
|
||||
}
|
||||
|
||||
// Add to total
|
||||
let chunk_count = chunk_events.len();
|
||||
all_events.extend(chunk_events.clone());
|
||||
|
||||
println!("📊 Chunk {}: {} events (Total: {})",
|
||||
attempts, chunk_count, all_events.len());
|
||||
|
||||
// Debug: check what dates we got
|
||||
let sample_dates: Vec<&str> = chunk_events.iter()
|
||||
.map(|e| e.date.as_str())
|
||||
.filter(|d| !d.is_empty())
|
||||
.take(3)
|
||||
.collect();
|
||||
println!(" Sample dates in chunk: {:?}", sample_dates);
|
||||
|
||||
// Calculate next start date
|
||||
match calculate_next_start_date(&chunk_events) {
|
||||
Ok(next_start) => {
|
||||
if next_start > end_date.to_string() {
|
||||
println!("✅ Reached end date. Completed!");
|
||||
break;
|
||||
}
|
||||
current_start = next_start;
|
||||
}
|
||||
Err(_) => {
|
||||
println!("❌ Could not calculate next start date. Stopping.");
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Small delay between requests
|
||||
sleep(Duration::from_secs(2)).await;
|
||||
|
||||
// Export chunk
|
||||
if let Ok(json) = serde_json::to_string_pretty(&chunk_events) {
|
||||
let filename = format!("economic_events_{}_chunk_{}.json", json_export_now, attempts);
|
||||
tokio::fs::write(&filename, json).await?;
|
||||
println!(" Chunk data exported to: {}", filename);
|
||||
}
|
||||
}
|
||||
|
||||
// Remove duplicates
|
||||
let initial_count = all_events.len();
|
||||
all_events.sort_by(|a, b| {
|
||||
a.date.cmp(&b.date)
|
||||
.then(a.time.cmp(&b.time))
|
||||
.then(a.event.cmp(&b.event))
|
||||
});
|
||||
all_events.dedup_by(|a, b| {
|
||||
a.date == b.date && a.time == b.time && a.event == b.event
|
||||
});
|
||||
|
||||
println!("🎯 FINAL: Collected {} unique events (removed {} duplicates)",
|
||||
all_events.len(), initial_count - all_events.len());
|
||||
|
||||
Ok(all_events)
|
||||
}
|
||||
|
||||
async fn set_date_range(client: &fantoccini::Client, start: &str, end: &str) -> anyhow::Result<()> {
|
||||
let set_dates_script = format!(r#"
|
||||
(() => {{
|
||||
const fromInput = document.querySelector('#dtTeletraderFromDate');
|
||||
const toInput = document.querySelector('#dtTeletraderEndDate');
|
||||
|
||||
if (fromInput) {{
|
||||
fromInput.value = '{}';
|
||||
fromInput.dispatchEvent(new Event('input', {{ bubbles: true }}));
|
||||
fromInput.dispatchEvent(new Event('change', {{ bubbles: true }}));
|
||||
}}
|
||||
|
||||
if (toInput) {{
|
||||
toInput.value = '{}';
|
||||
toInput.dispatchEvent(new Event('input', {{ bubbles: true }}));
|
||||
toInput.dispatchEvent(new Event('change', {{ bubbles: true }}));
|
||||
}}
|
||||
|
||||
return !!fromInput && !!toInput;
|
||||
}})()
|
||||
"#, start, end);
|
||||
|
||||
client.execute(&set_dates_script, vec![]).await?;
|
||||
sleep(Duration::from_millis(1000)).await; // Wait for table to update
|
||||
|
||||
// Now read the values
|
||||
let from_date_value: String = client.execute(
|
||||
r#"return document.querySelector('#dtTeletraderFromDate')?.value;"#,
|
||||
vec![],
|
||||
).await?.as_str().unwrap_or_default().to_string();
|
||||
|
||||
let to_date_value: String = client.execute(
|
||||
r#"return document.querySelector('#dtTeletraderEndDate')?.value;"#,
|
||||
vec![],
|
||||
).await?.as_str().unwrap_or_default().to_string();
|
||||
|
||||
if from_date_value == start && to_date_value == end {
|
||||
println!(" Dates set correctly");
|
||||
} else {
|
||||
println!(" ❌ Date not set correctly. Expected: {}-{}, Got: {}-{}",
|
||||
start, end, from_date_value, to_date_value);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> anyhow::Result<()> {
|
||||
let port = 9515; // pick a port you like
|
||||
let port = 9515;
|
||||
let mut chromedriver = start_chromedriver(port);
|
||||
sleep(Duration::from_secs(1)).await; // wait for ChromeDriver to start
|
||||
sleep(Duration::from_secs(1)).await;
|
||||
|
||||
// Chrome options (non-headless so it opens)
|
||||
// Chrome options
|
||||
let caps_value = serde_json::json!({
|
||||
"goog:chromeOptions": {
|
||||
"args": [
|
||||
//"--headless",
|
||||
"--disable-gpu",
|
||||
"--disable-notifications",
|
||||
"--disable-popup-blocking",
|
||||
@@ -371,130 +615,49 @@ async fn main() -> anyhow::Result<()> {
|
||||
let url = "https://www.finanzen.net/termine/wirtschaftsdaten/";
|
||||
client.goto(url).await?;
|
||||
|
||||
// Set start and end dates
|
||||
let start_date = "2024-01-01";
|
||||
let end_date = "2025-01-01";
|
||||
// Dismiss overlays
|
||||
dismiss_overlays(&client).await?;
|
||||
|
||||
let set_dates_script = format!(r#"
|
||||
(() => {{
|
||||
const fromInput = document.querySelector('#dtTeletraderFromDate');
|
||||
const toInput = document.querySelector('#dtTeletraderEndDate');
|
||||
|
||||
if (fromInput) {{
|
||||
fromInput.value = '{}';
|
||||
fromInput.dispatchEvent(new Event('input', {{ bubbles: true }}));
|
||||
fromInput.dispatchEvent(new Event('change', {{ bubbles: true }}));
|
||||
}}
|
||||
|
||||
if (toInput) {{
|
||||
toInput.value = '{}';
|
||||
toInput.dispatchEvent(new Event('input', {{ bubbles: true }}));
|
||||
toInput.dispatchEvent(new Event('change', {{ bubbles: true }}));
|
||||
}}
|
||||
|
||||
return !!fromInput && !!toInput;
|
||||
}})()
|
||||
"#, start_date, end_date);
|
||||
|
||||
// Execute JS to set dates and get the raw response
|
||||
let _ = client.execute(&set_dates_script, vec![]).await;
|
||||
|
||||
// Give React time to process
|
||||
sleep(Duration::from_millis(500)).await;
|
||||
|
||||
// Now read the values
|
||||
let from_date_value: String = client.execute(
|
||||
r#"return document.querySelector('#dtTeletraderFromDate')?.value;"#,
|
||||
vec![],
|
||||
).await?.as_str().unwrap_or_default().to_string();
|
||||
|
||||
let to_date_value: String = client.execute(
|
||||
r#"return document.querySelector('#dtTeletraderEndDate')?.value;"#,
|
||||
vec![],
|
||||
).await?.as_str().unwrap_or_default().to_string();
|
||||
|
||||
println!("From Date: {}", from_date_value);
|
||||
println!("To Date: {}", to_date_value);
|
||||
|
||||
if from_date_value == start_date && to_date_value == end_date {
|
||||
println!("Dates set correctly");
|
||||
// Click the high importance tab
|
||||
if let Ok(tab) = client.find(Locator::Css(r#"div[data-sg-tab-item="teletrader-dates-three-stars"]"#)).await {
|
||||
tab.click().await?;
|
||||
println!("High importance tab clicked");
|
||||
sleep(Duration::from_secs(2)).await;
|
||||
} else {
|
||||
println!("Date not set correctly");
|
||||
println!("High importance tab not found");
|
||||
}
|
||||
|
||||
// Find all table rows
|
||||
let rows = client.find_all(Locator::Css(
|
||||
"#TeletraderForm table.table tbody tr"
|
||||
)).await?;
|
||||
// Use chunking to extract all events across the entire date range
|
||||
let events = scrape_all_events_with_chunking(&client, "2007-02-13", "2025-12-01").await?;
|
||||
|
||||
println!("Found {} table rows", rows.len());
|
||||
|
||||
// HashMap to store "Termin" -> description
|
||||
let mut event_type_map: HashMap<String, String> = HashMap::new();
|
||||
|
||||
let mut i = 0;
|
||||
while i < rows.len() {
|
||||
let row = &rows[i];
|
||||
|
||||
// Extract all cells
|
||||
let cells = row.find_all(Locator::Css("td")).await?;
|
||||
|
||||
if cells.len() >= 5 {
|
||||
// Get Termin column text
|
||||
let termin_text = cells[4].text().await.unwrap_or_default();
|
||||
|
||||
// Check if next row is a hidden description row
|
||||
if i + 1 < rows.len() {
|
||||
let next_row = &rows[i + 1];
|
||||
let class = next_row.attr("class").await.unwrap_or(None).unwrap_or_default();
|
||||
if class.starts_with("table__td teletrader") {
|
||||
// Get the hidden description
|
||||
let desc_cell = next_row.find(Locator::Css("td")).await?;
|
||||
let desc_text = desc_cell.text().await.unwrap_or_default();
|
||||
event_type_map.insert(termin_text.clone(), desc_text);
|
||||
i += 1; // skip next row since it's the hidden description
|
||||
} else {
|
||||
event_type_map.insert(termin_text.clone(), "".to_string());
|
||||
}
|
||||
} else {
|
||||
event_type_map.insert(termin_text.clone(), "".to_string());
|
||||
}
|
||||
}
|
||||
|
||||
i += 1;
|
||||
}
|
||||
|
||||
// Extract using JavaScript
|
||||
let events = extract_all_data_via_js(&client).await?;
|
||||
|
||||
// Extract descriptions using JavaScript
|
||||
let event_type_map = extract_event_descriptions_via_js(&client).await?;
|
||||
|
||||
// Merge descriptions with events
|
||||
let events_with_descriptions: Vec<EconomicEvent> = events.clone().into_iter()
|
||||
.map(|mut event| {
|
||||
if let Some(description) = event_type_map.get(&event.event) {
|
||||
event.description = description.clone();
|
||||
}
|
||||
event
|
||||
})
|
||||
.collect();
|
||||
|
||||
// Run validation suite
|
||||
validate_events(&events).await?;
|
||||
check_data_consistency(&events).await;
|
||||
|
||||
|
||||
// Calculate actual date range from extracted data
|
||||
let actual_date_range = calculate_actual_date_range(&events);
|
||||
let current_date = chrono::Local::now().format("%Y-%m-%d").to_string();
|
||||
|
||||
// Final summary
|
||||
println!("\n🎯 EXTRACTION SUMMARY:");
|
||||
println!(" • Total high-importance events: {}", events.len());
|
||||
println!(" • Date range: 2024-01-01 to 2025-01-01");
|
||||
println!(" • Requested range: 2007-02-13 to 2025-12-01");
|
||||
println!(" • Actual extracted range: {} to {}", actual_date_range.0, actual_date_range.1);
|
||||
println!(" • Data extracted until: {}", current_date);
|
||||
println!(" • Data quality: {}% complete",
|
||||
(events.iter().filter(|e| !e.event.trim().is_empty()).count() * 100) / events.len().max(1));
|
||||
|
||||
(events.iter().filter(|e| !e.event.trim().is_empty()).count() * 100) / events.len().max(1));
|
||||
|
||||
// Check coverage
|
||||
if actual_date_range.1 < "2025-12-01".to_string() {
|
||||
println!("⚠️ WARNING: Did not reach end date. Last extracted date: {}", actual_date_range.1);
|
||||
println!(" • Next run should start from: {}", calculate_next_start_date(&events).unwrap_or_else(|_| actual_date_range.1));
|
||||
}
|
||||
|
||||
// Export for further analysis
|
||||
if let Ok(json) = serde_json::to_string_pretty(&events) {
|
||||
tokio::fs::write("economic_events.json", json).await?;
|
||||
println!(" • Data exported to: economic_events.json");
|
||||
let filename = format!("economic_events_{}_combined.json", chrono::Local::now().format("%Y%m%d_%H%M%S"));
|
||||
tokio::fs::write(&filename, json).await?;
|
||||
println!(" • Combined data exported to: {}", filename);
|
||||
}
|
||||
|
||||
// Wait for Ctrl+C
|
||||
|
||||
Reference in New Issue
Block a user