getting on website; edit date and importance

This commit is contained in:
2025-11-16 02:06:25 +01:00
parent 645f7a546b
commit e6729b06b8
10 changed files with 16412 additions and 0 deletions

7
.gitignore vendored
View File

@@ -16,3 +16,10 @@ target/
# and can be added to the global gitignore or merged into this file. For a more nuclear # and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder. # option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/ #.idea/
# Added by cargo
/target
/chromedriver_win32/*
/chromedriver-win64/*

1314
Cargo.lock generated Normal file

File diff suppressed because it is too large Load Diff

12
Cargo.toml Normal file
View File

@@ -0,0 +1,12 @@
[package]
name = "WebScraper"
version = "0.1.0"
edition = "2024"
[dependencies]
fantoccini = { version = "0.21.5", default-features = false, features = ["native-tls"] }
tokio = { version = "1", features = ["full"] }
serde = { version = "1", features = ["derive"] }
anyhow = "1.0"
futures = "0.3"
serde_json = "1.0"

View File

@@ -1,2 +1,4 @@
# WebScraper # WebScraper
https://chromedriver.storage.googleapis.com/index.html
https://googlechromelabs.github.io/chrome-for-testing/

View File

@@ -0,0 +1,27 @@
// Copyright 2015 The Chromium Authors
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
// * Neither the name of Google LLC nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

File diff suppressed because it is too large Load Diff

Binary file not shown.

9
src/continents.json Normal file
View File

@@ -0,0 +1,9 @@
[
"afrika",
"asien",
"europa",
"nordamerika",
"suedamerika",
"antarktis",
"ozeanien"
]

54
src/countries.json Normal file
View File

@@ -0,0 +1,54 @@
[
"aegypten",
"frankreich",
"litauen",
"schweiz",
"argentinien",
"griechenland",
"mexiko",
"singapur",
"australien",
"hongkong",
"neuseeland",
"slowakei",
"bahrain",
"indien",
"niederlande",
"spanien",
"belgien",
"indonesien",
"norwegen",
"suedafrika",
"brasilien",
"irland",
"oesterreich",
"suedkorea",
"chile",
"island",
"peru",
"taiwan",
"china",
"italien",
"philippinen",
"tschechien",
"daenemark",
"japan",
"polen",
"tuerkei",
"deutschland",
"kanada",
"portugal",
"ungarn",
"estland",
"katar",
"rumaenien",
"usa",
"eurozone",
"kolumbien",
"russland",
"vereinigte-arabische-emirate",
"finnland",
"lettland",
"schweden",
"vereinigtes-koenigreich"
]

305
src/main.rs Normal file
View File

@@ -0,0 +1,305 @@
use fantoccini::{ClientBuilder, Locator};
use serde_json::{Map, Value};
use std::{collections::HashMap, process::Command};
use tokio::{time::{Duration, sleep}, signal};
use futures::future::join_all;
/// One row of the scraped economic calendar.
///
/// Every field holds the raw cell text exactly as it was read from the page;
/// nothing is parsed or normalised. A field that could not be read is an empty
/// string, which is also what [`Default`] produces for all fields.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
struct EconomicEvent {
    /// Country the event belongs to.
    country: String,
    /// Calendar date of the event. NOTE(review): the scraper currently always
    /// leaves this empty.
    date: String,
    /// Time of day of the event.
    time: String,
    /// Name of the event.
    event: String,
    /// Published value.
    actual: String,
    /// Forecast value.
    forecast: String,
    /// Value of the previous period.
    previous: String,
    /// Importance rating as shown on the page.
    importance: String,
    /// Free-text description taken from the row following the event row.
    description: String,
}
/// Spawns the ChromeDriver executable and tells it to listen on `port`.
///
/// The executable is addressed by the relative path
/// `chromedriver-win64/chromedriver.exe`, so presumably the program has to be
/// started from the directory that contains that folder.
///
/// # Panics
///
/// Panics with the message "Failed to start ChromeDriver" when the process
/// cannot be spawned.
fn start_chromedriver(port: u16) -> std::process::Child {
    let port_flag = format!("--port={port}");

    let mut launcher = Command::new("chromedriver-win64/chromedriver.exe");
    launcher.arg(port_flag);

    launcher.spawn().expect("Failed to start ChromeDriver")
}
/// Hides the "Contentpass First Layer" consent iframe and waits for it to leave the DOM.
///
/// The page is polled up to 20 times, 500 ms apart. Each poll hides the iframe
/// (if present) via inline styles; polling stops early as soon as no such
/// iframe exists any more. If the iframe is still present after the last poll
/// the function returns `Ok(())` anyway — hiding is best effort.
///
/// # Errors
///
/// Returns an error when executing the script in the browser fails.
async fn hide_contentpass_overlay(client: &fantoccini::Client) -> anyhow::Result<()> {
    // WebDriver runs the script as the *body* of a function, so the value has to be
    // handed back with an explicit top-level `return`; without it `execute` always
    // yields null and the early exit below can never trigger.
    const HIDE_OVERLAY_JS: &str = r#"return (() => {
        const iframe = document.querySelector('iframe[title="Contentpass First Layer"]');
        if (!iframe) return true;
        iframe.style.display = 'none';
        iframe.style.visibility = 'hidden';
        return false;
    })();"#;
    const MAX_ATTEMPTS: u32 = 20;
    const RETRY_DELAY: Duration = Duration::from_millis(500);

    for _ in 0..MAX_ATTEMPTS {
        let overlay_gone = client
            .execute(HIDE_OVERLAY_JS, Vec::new())
            .await?
            .as_bool()
            .unwrap_or(false);
        if overlay_gone {
            break;
        }
        tokio::time::sleep(RETRY_DELAY).await;
    }
    Ok(())
}
/// Returns `true` when a scraped row carries neither an event name nor a country.
fn is_blank_event(event: &EconomicEvent) -> bool {
    event.event.trim().is_empty() && event.country.trim().is_empty()
}

/// Reads the calendar table inside `#TeletraderForm` and converts it into events.
///
/// Rows are consumed in pairs: the first row of each pair supplies the cell
/// texts, the second one (when it exists and contains a `td p` element)
/// supplies the description.
/// NOTE(review): this assumes every event row is followed by exactly one
/// description row — confirm against the page markup.
///
/// Cell texts are mapped by column index: 0 = time, 2 = country,
/// 3 = importance, 4 = event, 5 = previous, 6 = forecast, 7 = actual.
/// Missing or unreadable cells become empty strings; `date` is always left
/// empty.
///
/// A summary (total / useful / empty) and every non-blank event are printed to
/// stdout. The returned vector contains *all* rows, including blank ones.
///
/// # Errors
///
/// Returns an error when the rows, or the cells of an event row, cannot be
/// looked up.
async fn scrape_events(client: &fantoccini::Client) -> anyhow::Result<Vec<EconomicEvent>> {
    let rows = client
        .find_all(Locator::Css("#TeletraderForm table tbody tr"))
        .await?;

    let mut events = Vec::with_capacity(rows.len().div_ceil(2));
    for pair in rows.chunks(2) {
        // `chunks` never yields an empty slice, so the first element always exists.
        let (event_row, detail_row) = (&pair[0], pair.get(1));

        let cells = event_row.find_all(Locator::Css("td")).await?;
        let texts: Vec<String> = join_all(
            cells
                .iter()
                .map(|cell| async move { cell.text().await.unwrap_or_default() }),
        )
        .await;

        // The description is optional: a missing row or a row without `td p`
        // simply yields an empty description.
        let description = match detail_row {
            Some(row) => match row.find(Locator::Css("td p")).await {
                Ok(paragraph) => paragraph.text().await.unwrap_or_default(),
                Err(_) => String::new(),
            },
            None => String::new(),
        };

        let column = |index: usize| texts.get(index).cloned().unwrap_or_default();
        events.push(EconomicEvent {
            country: column(2),
            date: String::new(),
            time: column(0),
            event: column(4),
            actual: column(7),
            forecast: column(6),
            previous: column(5),
            importance: column(3),
            description,
        });
    }

    // One predicate drives both the counters and the listing so they cannot disagree.
    let empty_count = events.iter().filter(|event| is_blank_event(event)).count();
    let useful_count = events.len() - empty_count;

    println!("Total events found: {}", events.len());
    println!("Useful events: {}", useful_count);
    println!("Empty events: {}", empty_count);
    for event in events.iter().filter(|event| !is_blank_event(event)) {
        println!("{:?}", event);
    }

    Ok(events)
}
#[tokio::main]
async fn main() -> anyhow::Result<()> {
let port = 9515; // pick a port you like
let mut chromedriver = start_chromedriver(port);
sleep(Duration::from_secs(2)).await; // wait for ChromeDriver to start
// Chrome options (non-headless so it opens)
let caps_value = serde_json::json!({
"goog:chromeOptions": {
"args": [
//"--headless",
"--disable-gpu",
"--disable-notifications",
"--disable-popup-blocking",
"--disable-blink-features=AutomationControlled"
],
"excludeSwitches": ["enable-automation"]
}
});
let caps_map: Map<String, Value> = caps_value.as_object()
.expect("Capabilities should be a JSON object")
.clone();
let mut client = ClientBuilder::native()
.capabilities(caps_map)
.connect(&format!("http://localhost:{}", port))
.await?;
// Setup graceful shutdown on Ctrl+C
let shutdown_client = client.clone();
let shutdown_handle = tokio::spawn(async move {
signal::ctrl_c().await.expect("Failed to listen for ctrl+c");
println!("\nCtrl+C received, shutting down...");
shutdown_client.close().await.ok();
chromedriver.kill().ok();
std::process::exit(0);
});
// Go to page
let url = "https://www.finanzen.net/termine/wirtschaftsdaten/";
client.goto(url).await?;
let _ = client.execute(
r#"(() => {
const overlay = document.querySelector('iframe[title="Contentpass First Layer"]');
if (overlay) {
overlay.style.display = "none";
overlay.style.visibility = "hidden";
return true;
}
return false;
})()"#,
vec![]
).await;
for _ in 0..30 {
// Check if the iframe exists
let overlay_hidden: bool = client.execute(
r#"(() => {
const iframe = document.querySelector('iframe[title="Contentpass First Layer"]');
if (!iframe) return true; // already gone
// Try clicking button inside iframe via contentWindow
try {
const btn = iframe.contentWindow.document.querySelector('button');
if(btn) btn.click();
} catch(e) {}
// Hide the iframe itself
iframe.style.display = 'none';
iframe.style.visibility = 'hidden';
return false; // still hidden
})()"#,
vec![]
).await.ok()
.and_then(|v| v.as_bool())
.unwrap_or(false);
if overlay_hidden {
break;
}
sleep(Duration::from_millis(500)).await;
}
client.enter_parent_frame().await.ok();
// Set start and end dates
let start_date = "2024-01-01";
let end_date = "2025-01-01";
let set_dates_script = format!(r#"
(() => {{
const fromInput = document.querySelector('#dtTeletraderFromDate');
const toInput = document.querySelector('#dtTeletraderEndDate');
if (fromInput) {{
fromInput.value = '{}';
fromInput.dispatchEvent(new Event('input', {{ bubbles: true }}));
fromInput.dispatchEvent(new Event('change', {{ bubbles: true }}));
}}
if (toInput) {{
toInput.value = '{}';
toInput.dispatchEvent(new Event('input', {{ bubbles: true }}));
toInput.dispatchEvent(new Event('change', {{ bubbles: true }}));
}}
return !!fromInput && !!toInput;
}})()
"#, start_date, end_date);
let dates_set = client.execute(&set_dates_script, vec![])
.await
.ok()
.and_then(|v| v.as_bool())
.unwrap_or(false);
if dates_set {
println!("Dates set successfully from {} to {}", start_date, end_date);
} else {
println!("Failed to set dates");
}
// Hide any reappearing overlay
hide_contentpass_overlay(&client).await?;
// Wait for the tab to appear and click it
if let Ok(_) = client.find(Locator::Css(
"#TeletraderForm > article.page-content__item.page-content__item--space.margin-bottom-1\\.00.margin-top-1\\.00-md > div.tab-region > nav > div > div > div.tab__item.tab__item--active"
)).await {
// Example: click "Hohe Relevanz" tab
if let Ok(tab) = client.find(Locator::Css(r#"div[data-sg-tab-item="teletrader-dates-three-stars"]"#)).await {
tab.click().await?;
}
println!("Importance tab clicked");
} else {
println!("Importance tab not found");
}
// Wait a bit for the table to load
sleep(Duration::from_secs(2)).await;
// Find all table rows
let rows = client.find_all(Locator::Css(
"#TeletraderForm table.table tbody tr"
)).await?;
println!("Found {} table rows", rows.len());
// HashMap to store "Termin" -> description
let mut termin_map: HashMap<String, String> = HashMap::new();
let mut i = 0;
while i < rows.len() {
let row = &rows[i];
// Extract all cells
let cells = row.find_all(Locator::Css("td")).await?;
if cells.len() >= 5 {
// Get Termin column text
let termin_text = cells[4].text().await.unwrap_or_default();
// Check if next row is a hidden description row
if i + 1 < rows.len() {
let next_row = &rows[i + 1];
let class = next_row.attr("class").await.unwrap_or(None).unwrap_or_default();
if class.starts_with("table__td teletrader") {
// Get the hidden description
let desc_cell = next_row.find(Locator::Css("td")).await?;
let desc_text = desc_cell.text().await.unwrap_or_default();
termin_map.insert(termin_text.clone(), desc_text);
i += 1; // skip next row since it's the hidden description
} else {
termin_map.insert(termin_text.clone(), "".to_string());
}
} else {
termin_map.insert(termin_text.clone(), "".to_string());
}
}
i += 1;
}
let events = scrape_events(&client).await?;
println!("Collected {} Termin entries", termin_map.len());
for (k, v) in &termin_map {
println!("{:?} => {:?}", k, v);
}
// Wait for Ctrl+C
shutdown_handle.await.ok();
Ok(())
}