Compare commits
6 Commits
645f7a546b
...
a91447cace
| Author | SHA1 | Date | |
|---|---|---|---|
| a91447cace | |||
| d6e244c8d8 | |||
| 0853124918 | |||
| 59aad09f71 | |||
| 2604caab0e | |||
| e6729b06b8 |
7
.gitignore
vendored
7
.gitignore
vendored
@@ -16,3 +16,10 @@ target/
|
|||||||
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
||||||
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
||||||
#.idea/
|
#.idea/
|
||||||
|
|
||||||
|
|
||||||
|
# Added by cargo
|
||||||
|
|
||||||
|
/target
|
||||||
|
|
||||||
|
/chromedriver_win32/*
|
||||||
1496
Cargo.lock
generated
Normal file
1496
Cargo.lock
generated
Normal file
File diff suppressed because it is too large
Load Diff
13
Cargo.toml
Normal file
13
Cargo.toml
Normal file
@@ -0,0 +1,13 @@
|
|||||||
|
[package]
|
||||||
|
name = "WebScraper"
|
||||||
|
version = "0.1.0"
|
||||||
|
edition = "2024"
|
||||||
|
|
||||||
|
[dependencies]
|
||||||
|
fantoccini = { version = "0.21.5", default-features = false, features = ["native-tls"] }
|
||||||
|
tokio = { version = "1", features = ["full"] }
|
||||||
|
serde = { version = "1", features = ["derive"] }
|
||||||
|
anyhow = "1.0"
|
||||||
|
futures = "0.3"
|
||||||
|
serde_json = "1.0"
|
||||||
|
chrono = "0.4.42"
|
||||||
@@ -1,2 +1,4 @@
|
|||||||
# WebScraper
|
# WebScraper
|
||||||
|
|
||||||
|
https://chromedriver.storage.googleapis.com/index.html
|
||||||
|
https://googlechromelabs.github.io/chrome-for-testing/
|
||||||
27
chromedriver-win64/LICENSE.chromedriver
Normal file
27
chromedriver-win64/LICENSE.chromedriver
Normal file
@@ -0,0 +1,27 @@
|
|||||||
|
// Copyright 2015 The Chromium Authors
|
||||||
|
//
|
||||||
|
// Redistribution and use in source and binary forms, with or without
|
||||||
|
// modification, are permitted provided that the following conditions are
|
||||||
|
// met:
|
||||||
|
//
|
||||||
|
// * Redistributions of source code must retain the above copyright
|
||||||
|
// notice, this list of conditions and the following disclaimer.
|
||||||
|
// * Redistributions in binary form must reproduce the above
|
||||||
|
// copyright notice, this list of conditions and the following disclaimer
|
||||||
|
// in the documentation and/or other materials provided with the
|
||||||
|
// distribution.
|
||||||
|
// * Neither the name of Google LLC nor the names of its
|
||||||
|
// contributors may be used to endorse or promote products derived from
|
||||||
|
// this software without specific prior written permission.
|
||||||
|
//
|
||||||
|
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||||
|
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
|
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||||
|
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||||
|
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||||
|
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||||
|
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
14682
chromedriver-win64/THIRD_PARTY_NOTICES.chromedriver
Normal file
14682
chromedriver-win64/THIRD_PARTY_NOTICES.chromedriver
Normal file
File diff suppressed because it is too large
Load Diff
BIN
chromedriver-win64/chromedriver.exe
Normal file
BIN
chromedriver-win64/chromedriver.exe
Normal file
Binary file not shown.
2642
economic_events.json
Normal file
2642
economic_events.json
Normal file
File diff suppressed because it is too large
Load Diff
9
src/continents.json
Normal file
9
src/continents.json
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
[
|
||||||
|
"afrika",
|
||||||
|
"asien",
|
||||||
|
"europa",
|
||||||
|
"nordamerika",
|
||||||
|
"suedamerika",
|
||||||
|
"antarktis",
|
||||||
|
"ozeanien"
|
||||||
|
]
|
||||||
54
src/countries.json
Normal file
54
src/countries.json
Normal file
@@ -0,0 +1,54 @@
|
|||||||
|
[
|
||||||
|
"aegypten",
|
||||||
|
"frankreich",
|
||||||
|
"litauen",
|
||||||
|
"schweiz",
|
||||||
|
"argentinien",
|
||||||
|
"griechenland",
|
||||||
|
"mexiko",
|
||||||
|
"singapur",
|
||||||
|
"australien",
|
||||||
|
"hongkong",
|
||||||
|
"neuseeland",
|
||||||
|
"slowakei",
|
||||||
|
"bahrain",
|
||||||
|
"indien",
|
||||||
|
"niederlande",
|
||||||
|
"spanien",
|
||||||
|
"belgien",
|
||||||
|
"indonesien",
|
||||||
|
"norwegen",
|
||||||
|
"suedafrika",
|
||||||
|
"brasilien",
|
||||||
|
"irland",
|
||||||
|
"oesterreich",
|
||||||
|
"suedkorea",
|
||||||
|
"chile",
|
||||||
|
"island",
|
||||||
|
"peru",
|
||||||
|
"taiwan",
|
||||||
|
"china",
|
||||||
|
"italien",
|
||||||
|
"philippinen",
|
||||||
|
"tschechien",
|
||||||
|
"daenemark",
|
||||||
|
"japan",
|
||||||
|
"polen",
|
||||||
|
"tuerkei",
|
||||||
|
"deutschland",
|
||||||
|
"kanada",
|
||||||
|
"portugal",
|
||||||
|
"ungarn",
|
||||||
|
"estland",
|
||||||
|
"katar",
|
||||||
|
"rumaenien",
|
||||||
|
"usa",
|
||||||
|
"eurozone",
|
||||||
|
"kolumbien",
|
||||||
|
"russland",
|
||||||
|
"vereinigte-arabische-emirate",
|
||||||
|
"finnland",
|
||||||
|
"lettland",
|
||||||
|
"schweden",
|
||||||
|
"vereinigtes-koenigreich"
|
||||||
|
]
|
||||||
569
src/main.rs
Normal file
569
src/main.rs
Normal file
@@ -0,0 +1,569 @@
|
|||||||
|
use fantoccini::{ClientBuilder, Locator};
|
||||||
|
use serde::Serialize;
|
||||||
|
use serde_json::{Map, Value};
|
||||||
|
use std::{collections::HashMap, process::Command};
|
||||||
|
use tokio::{time::{Duration, sleep}, signal};
|
||||||
|
|
||||||
|
#[derive(Debug, Serialize, Clone)]
|
||||||
|
struct EconomicEvent {
|
||||||
|
country: String,
|
||||||
|
date: String,
|
||||||
|
time: String,
|
||||||
|
event: String,
|
||||||
|
actual: String,
|
||||||
|
forecast: String,
|
||||||
|
previous: String,
|
||||||
|
importance: String,
|
||||||
|
description: String,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Spawns the bundled ChromeDriver executable, telling it to listen on `port`.
///
/// Returns the handle of the child process.
///
/// # Panics
///
/// Panics with "Failed to start ChromeDriver" when the process cannot be spawned.
fn start_chromedriver(port: u16) -> std::process::Child {
    let port_flag = format!("--port={}", port);

    let mut launcher = Command::new("chromedriver-win64/chromedriver.exe");
    launcher.arg(port_flag);

    launcher.spawn().expect("Failed to start ChromeDriver")
}
|
||||||
|
|
||||||
|
async fn dismiss_overlays(client: &fantoccini::Client) -> anyhow::Result<()> {
|
||||||
|
// Single strategy: wait for and remove iframe
|
||||||
|
for _ in 0..10 {
|
||||||
|
let removed: bool = client.execute(
|
||||||
|
r#"(() => {
|
||||||
|
const iframe = document.querySelector('iframe[title="Contentpass First Layer"]');
|
||||||
|
if (iframe && iframe.parentNode) {
|
||||||
|
iframe.parentNode.removeChild(iframe);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
})()"#,
|
||||||
|
vec![]
|
||||||
|
).await?.as_bool().unwrap_or(false);
|
||||||
|
|
||||||
|
if removed { break; }
|
||||||
|
sleep(Duration::from_millis(500)).await;
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn extract_all_data_via_js(client: &fantoccini::Client) -> anyhow::Result<Vec<EconomicEvent>> {
|
||||||
|
println!("Extracting ONLY 3-star events via JavaScript...");
|
||||||
|
|
||||||
|
let extraction_script = r#"
|
||||||
|
const events = [];
|
||||||
|
let currentDate = '';
|
||||||
|
|
||||||
|
const rows = document.querySelectorAll('#TeletraderForm table tbody tr');
|
||||||
|
|
||||||
|
for (let i = 0; i < rows.length; i++) {
|
||||||
|
const row = rows[i];
|
||||||
|
const cells = row.querySelectorAll('td');
|
||||||
|
|
||||||
|
if (cells.length === 1 && cells[0].colSpan === 9) {
|
||||||
|
currentDate = cells[0].textContent.trim();
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (cells.length >= 8) {
|
||||||
|
const time = cells[0]?.textContent?.trim() || '';
|
||||||
|
const country = cells[2]?.textContent?.trim() || '';
|
||||||
|
const eventName = cells[4]?.textContent?.trim() || '';
|
||||||
|
|
||||||
|
if (!time || !country || !eventName) continue;
|
||||||
|
|
||||||
|
// Count ONLY YELLOW stars (high importance)
|
||||||
|
const importanceCell = cells[3];
|
||||||
|
const yellowStarCount = importanceCell?.querySelectorAll('.icon--star.font-color-yellow').length || 0;
|
||||||
|
|
||||||
|
// STRICT FILTER: Only include events with EXACTLY 3 YELLOW stars
|
||||||
|
if (yellowStarCount === 3) {
|
||||||
|
let description = '';
|
||||||
|
if (i + 1 < rows.length) {
|
||||||
|
const nextRow = rows[i + 1];
|
||||||
|
const nextCells = nextRow.querySelectorAll('td');
|
||||||
|
if (nextCells.length === 1 || nextCells[0].colSpan === 8) {
|
||||||
|
const descPara = nextRow.querySelector('p');
|
||||||
|
if (descPara) {
|
||||||
|
description = descPara.textContent?.trim() || '';
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
events.push({
|
||||||
|
country: country,
|
||||||
|
date: currentDate,
|
||||||
|
time: time,
|
||||||
|
event: eventName,
|
||||||
|
actual: cells[7]?.textContent?.trim() || '',
|
||||||
|
forecast: cells[6]?.textContent?.trim() || '',
|
||||||
|
previous: cells[5]?.textContent?.trim() || '',
|
||||||
|
importance: 'High',
|
||||||
|
description: description
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return events;
|
||||||
|
"#;
|
||||||
|
|
||||||
|
let result = client.execute(extraction_script, vec![]).await?;
|
||||||
|
|
||||||
|
// Parse the JSON result into EconomicEvent structs
|
||||||
|
if let Some(events_array) = result.as_array() {
|
||||||
|
let mut events = Vec::new();
|
||||||
|
for event_value in events_array {
|
||||||
|
if let Some(event_obj) = event_value.as_object() {
|
||||||
|
let event = EconomicEvent {
|
||||||
|
country: event_obj.get("country").and_then(|v| v.as_str()).unwrap_or("").to_string(),
|
||||||
|
date: event_obj.get("date").and_then(|v| v.as_str()).unwrap_or("").to_string(),
|
||||||
|
time: event_obj.get("time").and_then(|v| v.as_str()).unwrap_or("").to_string(),
|
||||||
|
event: event_obj.get("event").and_then(|v| v.as_str()).unwrap_or("").to_string(),
|
||||||
|
actual: event_obj.get("actual").and_then(|v| v.as_str()).unwrap_or("").to_string(),
|
||||||
|
forecast: event_obj.get("forecast").and_then(|v| v.as_str()).unwrap_or("").to_string(),
|
||||||
|
previous: event_obj.get("previous").and_then(|v| v.as_str()).unwrap_or("").to_string(),
|
||||||
|
importance: event_obj.get("importance").and_then(|v| v.as_str()).unwrap_or("").to_string(),
|
||||||
|
description: event_obj.get("description").and_then(|v| v.as_str()).unwrap_or("").to_string(),
|
||||||
|
};
|
||||||
|
events.push(event);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
println!("Extracted {} events (3 YELLOW stars ONLY) via JavaScript", events.len());
|
||||||
|
return Ok(events);
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(vec![])
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Builds a map from event name to description text for all three-yellow-star
/// events by running one JavaScript snippet in the browser.
///
/// The script looks at every `<td>` whose class contains "teletrader", reads
/// the text of its `<p>`, and assigns it to the event named in the fifth cell
/// of the preceding table row, provided that row shows three yellow stars.
/// Values that are not strings are ignored. A result that is not an object
/// gives an empty map.
///
/// # Errors
///
/// Propagates any error returned by the WebDriver script execution.
async fn extract_event_descriptions_via_js(client: &fantoccini::Client) -> anyhow::Result<HashMap<String, String>> {
    println!("Extracting event descriptions via JavaScript (3 YELLOW stars only)...");

    let description_script = r#"
        const descriptions = {};

        // Find all description rows (they have class starting with 'teletrader')
        const descRows = document.querySelectorAll('tr td[class*="teletrader"]');

        for (const descRow of descRows) {
            // Get the description text from the <p> tag
            const descPara = descRow.querySelector('p');
            if (descPara) {
                const description = descPara.textContent?.trim() || '';

                // Find the corresponding event name by looking for the row above
                let eventRow = descRow.parentElement.previousElementSibling;
                if (eventRow) {
                    // Check if this is a 3 YELLOW star event
                    const importanceCell = eventRow.querySelector('td:nth-child(4)');
                    if (importanceCell) {
                        // Count ONLY YELLOW stars
                        const yellowStarCount = importanceCell.querySelectorAll('.icon--star.font-color-yellow').length;

                        // Only process events with 3 YELLOW stars
                        if (yellowStarCount === 3) {
                            const eventCell = eventRow.querySelector('td:nth-child(5)');
                            if (eventCell) {
                                const eventName = eventCell.textContent?.trim() || '';
                                if (eventName) {
                                    descriptions[eventName] = description;
                                }
                            }
                        }
                    }
                }
            }
        }

        return descriptions;
    "#;

    let script_output = client.execute(description_script, vec![]).await?;

    // Keys of a JSON object are unique, so collecting the pairs gives the same
    // map as inserting them one at a time.
    let descriptions_by_event: HashMap<String, String> = match script_output.as_object() {
        Some(entries) => entries
            .iter()
            .filter_map(|(event_name, raw)| {
                raw.as_str().map(|text| (event_name.clone(), text.to_string()))
            })
            .collect(),
        None => HashMap::new(),
    };

    println!("Extracted {} event descriptions (3 YELLOW stars only)", descriptions_by_event.len());
    Ok(descriptions_by_event)
}
|
||||||
|
|
||||||
|
async fn check_data_consistency(events: &[EconomicEvent]) {
|
||||||
|
println!("\n=== DATA CONSISTENCY CHECKS ===");
|
||||||
|
|
||||||
|
// Count event name occurrences
|
||||||
|
let mut event_names: HashMap<String, usize> = HashMap::new();
|
||||||
|
for event in events {
|
||||||
|
*event_names.entry(event.event.clone()).or_insert(0) += 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Detect duplicates
|
||||||
|
let duplicates: Vec<_> = event_names
|
||||||
|
.iter()
|
||||||
|
.filter(|(_, count)| **count > 1)
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
if !duplicates.is_empty() {
|
||||||
|
println!("⚠️ Found {} duplicate event names:", duplicates.len());
|
||||||
|
for (name, count) in duplicates.iter().take(5) {
|
||||||
|
println!(" - '{}' appears {} times", name, count);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
println!("✅ No duplicate event names found");
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check time format consistency
|
||||||
|
let valid_time_format = events.iter()
|
||||||
|
.filter(|e| {
|
||||||
|
// Time should be in format "HH:MM"
|
||||||
|
e.time.len() == 5 &&
|
||||||
|
e.time.chars().nth(2) == Some(':') &&
|
||||||
|
e.time[0..2].chars().all(|c| c.is_ascii_digit()) &&
|
||||||
|
e.time[3..5].chars().all(|c| c.is_ascii_digit())
|
||||||
|
})
|
||||||
|
.count();
|
||||||
|
|
||||||
|
println!("⏰ Valid time formats: {}/{}", valid_time_format, events.len());
|
||||||
|
|
||||||
|
// Check for missing critical data
|
||||||
|
let critical_fields_missing: Vec<_> = events.iter()
|
||||||
|
.enumerate()
|
||||||
|
.filter(|(_, e)| e.event.trim().is_empty() || e.time.trim().is_empty())
|
||||||
|
.map(|(i, e)| (i, e))
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
if !critical_fields_missing.is_empty() {
|
||||||
|
println!("❌ {} events missing critical fields", critical_fields_missing.len());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn validate_events(events: &[EconomicEvent]) -> anyhow::Result<()> {
|
||||||
|
println!("\n=== EVENT VALIDATION ===");
|
||||||
|
|
||||||
|
// Check if we have any events at all
|
||||||
|
if events.is_empty() {
|
||||||
|
println!("❌ ERROR: No events extracted!");
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
|
||||||
|
println!("📊 Total events: {}", events.len());
|
||||||
|
|
||||||
|
// 1. Check date range compliance
|
||||||
|
let date_range_events: Vec<_> = events.iter()
|
||||||
|
.filter(|e| {
|
||||||
|
// Extract year from German date format "Dienstag, 2. Januar 2024"
|
||||||
|
e.date.contains("2024") || e.date.contains("2025")
|
||||||
|
})
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
println!("📅 Events in 2024-2025 range: {}/{}",
|
||||||
|
date_range_events.len(), events.len());
|
||||||
|
|
||||||
|
// 2. Check importance filtering
|
||||||
|
let high_importance_count = events.iter()
|
||||||
|
.filter(|e| e.importance == "High")
|
||||||
|
.count();
|
||||||
|
println!("⭐ High importance events: {}/{}", high_importance_count, events.len());
|
||||||
|
|
||||||
|
// 3. Check data completeness
|
||||||
|
let complete_events = events.iter()
|
||||||
|
.filter(|e| {
|
||||||
|
!e.event.trim().is_empty() &&
|
||||||
|
!e.time.trim().is_empty() &&
|
||||||
|
!e.country.trim().is_empty() &&
|
||||||
|
(!e.actual.trim().is_empty() || !e.forecast.trim().is_empty() || !e.previous.trim().is_empty())
|
||||||
|
})
|
||||||
|
.count();
|
||||||
|
|
||||||
|
println!("✅ Complete events: {}/{}", complete_events, events.len());
|
||||||
|
|
||||||
|
// 4. Check description coverage
|
||||||
|
let events_with_descriptions = events.iter()
|
||||||
|
.filter(|e| !e.description.trim().is_empty())
|
||||||
|
.count();
|
||||||
|
println!("📝 Events with descriptions: {}/{}", events_with_descriptions, events.len());
|
||||||
|
|
||||||
|
// 5. Distribution analysis
|
||||||
|
use std::collections::HashMap;
|
||||||
|
let mut country_distribution: HashMap<String, usize> = HashMap::new();
|
||||||
|
let mut month_distribution: HashMap<String, usize> = HashMap::new();
|
||||||
|
|
||||||
|
for event in events {
|
||||||
|
*country_distribution.entry(event.country.clone()).or_insert(0) += 1;
|
||||||
|
|
||||||
|
// Extract month from German date
|
||||||
|
if let Some(month) = extract_month(&event.date) {
|
||||||
|
*month_distribution.entry(month).or_insert(0) += 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
println!("🌍 Country distribution: {:?}", country_distribution);
|
||||||
|
println!("📈 Month distribution: {:?}", month_distribution);
|
||||||
|
|
||||||
|
// 6. Sample output for manual inspection
|
||||||
|
println!("\n🔍 Sample events (first 5):");
|
||||||
|
for event in events.iter().take(5) {
|
||||||
|
println!(" • {} {}: {} - {} (Importance: {})",
|
||||||
|
event.date, event.time, event.country, event.event, event.importance);
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Finds the German month name contained in `date_str`.
///
/// The months are tried in calendar order and the first one that occurs
/// anywhere in the text is returned. Returns `None` when no month name occurs.
fn extract_month(date_str: &str) -> Option<String> {
    const GERMAN_MONTHS: [&str; 12] = [
        "Januar", "Februar", "März", "April", "Mai", "Juni",
        "Juli", "August", "September", "Oktober", "November", "Dezember",
    ];

    GERMAN_MONTHS
        .iter()
        .copied()
        .find(|name| date_str.contains(*name))
        .map(String::from)
}
|
||||||
|
|
||||||
|
#[tokio::main]
|
||||||
|
async fn main() -> anyhow::Result<()> {
|
||||||
|
let port = 9515; // pick a port you like
|
||||||
|
let mut chromedriver = start_chromedriver(port);
|
||||||
|
sleep(Duration::from_secs(1)).await; // wait for ChromeDriver to start
|
||||||
|
|
||||||
|
// Chrome options (non-headless so it opens)
|
||||||
|
let caps_value = serde_json::json!({
|
||||||
|
"goog:chromeOptions": {
|
||||||
|
"args": [
|
||||||
|
//"--headless",
|
||||||
|
"--disable-gpu",
|
||||||
|
"--disable-notifications",
|
||||||
|
"--disable-popup-blocking",
|
||||||
|
"--disable-blink-features=AutomationControlled"
|
||||||
|
],
|
||||||
|
"excludeSwitches": ["enable-automation"]
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
let caps_map: Map<String, Value> = caps_value.as_object()
|
||||||
|
.expect("Capabilities should be a JSON object")
|
||||||
|
.clone();
|
||||||
|
|
||||||
|
let mut client = ClientBuilder::native()
|
||||||
|
.capabilities(caps_map)
|
||||||
|
.connect(&format!("http://localhost:{}", port))
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
// Setup graceful shutdown on Ctrl+C
|
||||||
|
let shutdown_client = client.clone();
|
||||||
|
let shutdown_handle = tokio::spawn(async move {
|
||||||
|
signal::ctrl_c().await.expect("Failed to listen for ctrl+c");
|
||||||
|
println!("\nCtrl+C received, shutting down...");
|
||||||
|
shutdown_client.close().await.ok();
|
||||||
|
chromedriver.kill().ok();
|
||||||
|
std::process::exit(0);
|
||||||
|
});
|
||||||
|
|
||||||
|
// Go to page
|
||||||
|
let url = "https://www.finanzen.net/termine/wirtschaftsdaten/";
|
||||||
|
client.goto(url).await?;
|
||||||
|
|
||||||
|
/* let _ = client.execute(
|
||||||
|
r#"(() => {
|
||||||
|
const overlay = document.querySelector('iframe[title="Contentpass First Layer"]');
|
||||||
|
if (overlay) {
|
||||||
|
overlay.style.display = "none";
|
||||||
|
overlay.style.visibility = "hidden";
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
})()"#,
|
||||||
|
vec![]
|
||||||
|
).await;
|
||||||
|
|
||||||
|
for _ in 0..5 {
|
||||||
|
// Check if the iframe exists
|
||||||
|
let overlay_hidden: bool = client.execute(
|
||||||
|
r#"(() => {
|
||||||
|
const iframe = document.querySelector('iframe[title="Contentpass First Layer"]');
|
||||||
|
if (!iframe) return true; // already gone
|
||||||
|
|
||||||
|
// Try clicking button inside iframe via contentWindow
|
||||||
|
try {
|
||||||
|
const btn = iframe.contentWindow.document.querySelector('button');
|
||||||
|
if(btn) btn.click();
|
||||||
|
} catch(e) {}
|
||||||
|
|
||||||
|
// Hide the iframe itself
|
||||||
|
iframe.style.display = 'none';
|
||||||
|
iframe.style.visibility = 'hidden';
|
||||||
|
return false; // still hidden
|
||||||
|
})()"#,
|
||||||
|
vec![]
|
||||||
|
).await.ok()
|
||||||
|
.and_then(|v| v.as_bool())
|
||||||
|
.unwrap_or(false);
|
||||||
|
|
||||||
|
if overlay_hidden {
|
||||||
|
println!("Overlay hidden");
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
sleep(Duration::from_millis(500)).await;
|
||||||
|
}
|
||||||
|
|
||||||
|
client.enter_parent_frame().await.ok();*/
|
||||||
|
|
||||||
|
// Set start and end dates
|
||||||
|
let start_date = "2024-01-01";
|
||||||
|
let end_date = "2025-01-01";
|
||||||
|
|
||||||
|
let set_dates_script = format!(r#"
|
||||||
|
(() => {{
|
||||||
|
const fromInput = document.querySelector('#dtTeletraderFromDate');
|
||||||
|
const toInput = document.querySelector('#dtTeletraderEndDate');
|
||||||
|
|
||||||
|
if (fromInput) {{
|
||||||
|
fromInput.value = '{}';
|
||||||
|
fromInput.dispatchEvent(new Event('input', {{ bubbles: true }}));
|
||||||
|
fromInput.dispatchEvent(new Event('change', {{ bubbles: true }}));
|
||||||
|
}}
|
||||||
|
|
||||||
|
if (toInput) {{
|
||||||
|
toInput.value = '{}';
|
||||||
|
toInput.dispatchEvent(new Event('input', {{ bubbles: true }}));
|
||||||
|
toInput.dispatchEvent(new Event('change', {{ bubbles: true }}));
|
||||||
|
}}
|
||||||
|
|
||||||
|
return !!fromInput && !!toInput;
|
||||||
|
}})()
|
||||||
|
"#, start_date, end_date);
|
||||||
|
|
||||||
|
// Execute JS to set dates and get the raw response
|
||||||
|
let _ = client.execute(&set_dates_script, vec![]).await;
|
||||||
|
|
||||||
|
// Give React time to process
|
||||||
|
sleep(Duration::from_millis(500)).await;
|
||||||
|
|
||||||
|
// Now read the values
|
||||||
|
let from_date_value: String = client.execute(
|
||||||
|
r#"return document.querySelector('#dtTeletraderFromDate')?.value;"#,
|
||||||
|
vec![],
|
||||||
|
).await?.as_str().unwrap_or_default().to_string();
|
||||||
|
|
||||||
|
let to_date_value: String = client.execute(
|
||||||
|
r#"return document.querySelector('#dtTeletraderEndDate')?.value;"#,
|
||||||
|
vec![],
|
||||||
|
).await?.as_str().unwrap_or_default().to_string();
|
||||||
|
|
||||||
|
println!("From Date: {}", from_date_value);
|
||||||
|
println!("To Date: {}", to_date_value);
|
||||||
|
|
||||||
|
if from_date_value == start_date && to_date_value == end_date {
|
||||||
|
println!("Dates set correctly");
|
||||||
|
} else {
|
||||||
|
println!("Date not set correctly");
|
||||||
|
}
|
||||||
|
|
||||||
|
// Hide any reappearing overlay
|
||||||
|
/*dismiss_overlays(&client).await?;
|
||||||
|
|
||||||
|
// Wait for the tab to appear and click it
|
||||||
|
if let Ok(_) = client.find(Locator::Css(
|
||||||
|
"#TeletraderForm > article.page-content__item.page-content__item--space.margin-bottom-1\\.00.margin-top-1\\.00-md > div.tab-region > nav > div > div > div.tab__item.tab__item--active"
|
||||||
|
)).await {
|
||||||
|
// Example: click "Hohe Relevanz" tab
|
||||||
|
if let Ok(tab) = client.find(Locator::Css(r#"div[data-sg-tab-item="teletrader-dates-three-stars"]"#)).await {
|
||||||
|
tab.click().await?;
|
||||||
|
}
|
||||||
|
println!("Importance tab clicked");
|
||||||
|
} else {
|
||||||
|
println!("Importance tab not found");
|
||||||
|
}
|
||||||
|
|
||||||
|
// Wait a bit for the table to load
|
||||||
|
sleep(Duration::from_secs(1)).await;*/
|
||||||
|
|
||||||
|
// Find all table rows
|
||||||
|
let rows = client.find_all(Locator::Css(
|
||||||
|
"#TeletraderForm table.table tbody tr"
|
||||||
|
)).await?;
|
||||||
|
|
||||||
|
println!("Found {} table rows", rows.len());
|
||||||
|
|
||||||
|
// HashMap to store "Termin" -> description
|
||||||
|
let mut event_type_map: HashMap<String, String> = HashMap::new();
|
||||||
|
|
||||||
|
let mut i = 0;
|
||||||
|
while i < rows.len() {
|
||||||
|
let row = &rows[i];
|
||||||
|
|
||||||
|
// Extract all cells
|
||||||
|
let cells = row.find_all(Locator::Css("td")).await?;
|
||||||
|
|
||||||
|
if cells.len() >= 5 {
|
||||||
|
// Get Termin column text
|
||||||
|
let termin_text = cells[4].text().await.unwrap_or_default();
|
||||||
|
|
||||||
|
// Check if next row is a hidden description row
|
||||||
|
if i + 1 < rows.len() {
|
||||||
|
let next_row = &rows[i + 1];
|
||||||
|
let class = next_row.attr("class").await.unwrap_or(None).unwrap_or_default();
|
||||||
|
if class.starts_with("table__td teletrader") {
|
||||||
|
// Get the hidden description
|
||||||
|
let desc_cell = next_row.find(Locator::Css("td")).await?;
|
||||||
|
let desc_text = desc_cell.text().await.unwrap_or_default();
|
||||||
|
event_type_map.insert(termin_text.clone(), desc_text);
|
||||||
|
i += 1; // skip next row since it's the hidden description
|
||||||
|
} else {
|
||||||
|
event_type_map.insert(termin_text.clone(), "".to_string());
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
event_type_map.insert(termin_text.clone(), "".to_string());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
i += 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Extract using JavaScript
|
||||||
|
let events = extract_all_data_via_js(&client).await?;
|
||||||
|
|
||||||
|
// Extract descriptions using JavaScript
|
||||||
|
let event_type_map = extract_event_descriptions_via_js(&client).await?;
|
||||||
|
|
||||||
|
// Merge descriptions with events
|
||||||
|
let events_with_descriptions: Vec<EconomicEvent> = events.clone().into_iter()
|
||||||
|
.map(|mut event| {
|
||||||
|
if let Some(description) = event_type_map.get(&event.event) {
|
||||||
|
event.description = description.clone();
|
||||||
|
}
|
||||||
|
event
|
||||||
|
})
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
// Run validation suite
|
||||||
|
validate_events(&events).await?;
|
||||||
|
check_data_consistency(&events).await;
|
||||||
|
|
||||||
|
// Final summary
|
||||||
|
println!("\n🎯 EXTRACTION SUMMARY:");
|
||||||
|
println!(" • Total high-importance events: {}", events.len());
|
||||||
|
println!(" • Date range: 2024-01-01 to 2025-01-01");
|
||||||
|
println!(" • Data quality: {}% complete",
|
||||||
|
(events.iter().filter(|e| !e.event.trim().is_empty()).count() * 100) / events.len().max(1));
|
||||||
|
|
||||||
|
// Export for further analysis
|
||||||
|
if let Ok(json) = serde_json::to_string_pretty(&events) {
|
||||||
|
tokio::fs::write("economic_events.json", json).await?;
|
||||||
|
println!(" • Data exported to: economic_events.json");
|
||||||
|
}
|
||||||
|
|
||||||
|
// Wait for Ctrl+C
|
||||||
|
shutdown_handle.await.ok();
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user