getting on website; edit date and importance
This commit is contained in:
7
.gitignore
vendored
7
.gitignore
vendored
@@ -16,3 +16,10 @@ target/
|
|||||||
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
||||||
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
||||||
#.idea/
|
#.idea/
|
||||||
|
|
||||||
|
|
||||||
|
# Added by cargo
|
||||||
|
|
||||||
|
/target
|
||||||
|
|
||||||
|
/chromedriver_win32/*
|
||||||
1314
Cargo.lock
generated
Normal file
1314
Cargo.lock
generated
Normal file
File diff suppressed because it is too large
Load Diff
12
Cargo.toml
Normal file
12
Cargo.toml
Normal file
@@ -0,0 +1,12 @@
|
|||||||
|
[package]
|
||||||
|
name = "WebScraper"
|
||||||
|
version = "0.1.0"
|
||||||
|
edition = "2024"
|
||||||
|
|
||||||
|
[dependencies]
|
||||||
|
fantoccini = { version = "0.21.5", default-features = false, features = ["native-tls"] }
|
||||||
|
tokio = { version = "1", features = ["full"] }
|
||||||
|
serde = { version = "1", features = ["derive"] }
|
||||||
|
anyhow = "1.0"
|
||||||
|
futures = "0.3"
|
||||||
|
serde_json = "1.0"
|
||||||
@@ -1,2 +1,4 @@
|
|||||||
# WebScraper
|
# WebScraper
|
||||||
|
|
||||||
|
https://chromedriver.storage.googleapis.com/index.html
|
||||||
|
https://googlechromelabs.github.io/chrome-for-testing/
|
||||||
27
chromedriver-win64/LICENSE.chromedriver
Normal file
27
chromedriver-win64/LICENSE.chromedriver
Normal file
@@ -0,0 +1,27 @@
|
|||||||
|
// Copyright 2015 The Chromium Authors
|
||||||
|
//
|
||||||
|
// Redistribution and use in source and binary forms, with or without
|
||||||
|
// modification, are permitted provided that the following conditions are
|
||||||
|
// met:
|
||||||
|
//
|
||||||
|
// * Redistributions of source code must retain the above copyright
|
||||||
|
// notice, this list of conditions and the following disclaimer.
|
||||||
|
// * Redistributions in binary form must reproduce the above
|
||||||
|
// copyright notice, this list of conditions and the following disclaimer
|
||||||
|
// in the documentation and/or other materials provided with the
|
||||||
|
// distribution.
|
||||||
|
// * Neither the name of Google LLC nor the names of its
|
||||||
|
// contributors may be used to endorse or promote products derived from
|
||||||
|
// this software without specific prior written permission.
|
||||||
|
//
|
||||||
|
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||||
|
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
|
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||||
|
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||||
|
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||||
|
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||||
|
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
14682
chromedriver-win64/THIRD_PARTY_NOTICES.chromedriver
Normal file
14682
chromedriver-win64/THIRD_PARTY_NOTICES.chromedriver
Normal file
File diff suppressed because it is too large
Load Diff
BIN
chromedriver-win64/chromedriver.exe
Normal file
BIN
chromedriver-win64/chromedriver.exe
Normal file
Binary file not shown.
9
src/continents.json
Normal file
9
src/continents.json
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
[
|
||||||
|
"afrika",
|
||||||
|
"asien",
|
||||||
|
"europa",
|
||||||
|
"nordamerika",
|
||||||
|
"suedamerika",
|
||||||
|
"antarktis",
|
||||||
|
"ozeanien"
|
||||||
|
]
|
||||||
54
src/countries.json
Normal file
54
src/countries.json
Normal file
@@ -0,0 +1,54 @@
|
|||||||
|
[
|
||||||
|
"aegypten",
|
||||||
|
"frankreich",
|
||||||
|
"litauen",
|
||||||
|
"schweiz",
|
||||||
|
"argentinien",
|
||||||
|
"griechenland",
|
||||||
|
"mexiko",
|
||||||
|
"singapur",
|
||||||
|
"australien",
|
||||||
|
"hongkong",
|
||||||
|
"neuseeland",
|
||||||
|
"slowakei",
|
||||||
|
"bahrain",
|
||||||
|
"indien",
|
||||||
|
"niederlande",
|
||||||
|
"spanien",
|
||||||
|
"belgien",
|
||||||
|
"indonesien",
|
||||||
|
"norwegen",
|
||||||
|
"suedafrika",
|
||||||
|
"brasilien",
|
||||||
|
"irland",
|
||||||
|
"oesterreich",
|
||||||
|
"suedkorea",
|
||||||
|
"chile",
|
||||||
|
"island",
|
||||||
|
"peru",
|
||||||
|
"taiwan",
|
||||||
|
"china",
|
||||||
|
"italien",
|
||||||
|
"philippinen",
|
||||||
|
"tschechien",
|
||||||
|
"daenemark",
|
||||||
|
"japan",
|
||||||
|
"polen",
|
||||||
|
"tuerkei",
|
||||||
|
"deutschland",
|
||||||
|
"kanada",
|
||||||
|
"portugal",
|
||||||
|
"ungarn",
|
||||||
|
"estland",
|
||||||
|
"katar",
|
||||||
|
"rumaenien",
|
||||||
|
"usa",
|
||||||
|
"eurozone",
|
||||||
|
"kolumbien",
|
||||||
|
"russland",
|
||||||
|
"vereinigte-arabische-emirate",
|
||||||
|
"finnland",
|
||||||
|
"lettland",
|
||||||
|
"schweden",
|
||||||
|
"vereinigtes-koenigreich"
|
||||||
|
]
|
||||||
305
src/main.rs
Normal file
305
src/main.rs
Normal file
@@ -0,0 +1,305 @@
|
|||||||
|
use fantoccini::{ClientBuilder, Locator};
|
||||||
|
use serde_json::{Map, Value};
|
||||||
|
use std::{collections::HashMap, process::Command};
|
||||||
|
use tokio::{time::{Duration, sleep}, signal};
|
||||||
|
use futures::future::join_all;
|
||||||
|
|
||||||
|
/// One row of the economic calendar table, stored as the raw text of its cells.
///
/// Nothing is parsed: every field is the string read from the page, and a
/// missing cell is represented by an empty string.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
struct EconomicEvent {
    /// Text of the country column.
    country: String,
    /// Calendar date of the event. `scrape_events` currently always leaves this empty.
    date: String,
    /// Text of the time column.
    time: String,
    /// Name of the event (the "Termin" column).
    event: String,
    /// Text of the actual-value column.
    actual: String,
    /// Text of the forecast column.
    forecast: String,
    /// Text of the previous-value column.
    previous: String,
    /// Text of the importance column.
    importance: String,
    /// Free-text description taken from the row following the event row, if any.
    description: String,
}
|
||||||
|
|
||||||
|
/// Spawns the bundled ChromeDriver executable, telling it to listen on `port`.
///
/// The returned handle owns the child process; the caller is responsible for
/// killing it when done.
///
/// # Panics
///
/// Panics with "Failed to start ChromeDriver" if the process cannot be spawned.
fn start_chromedriver(port: u16) -> std::process::Child {
    let port_flag = format!("--port={}", port);

    let mut driver = Command::new("chromedriver-win64/chromedriver.exe");
    driver.arg(port_flag);

    driver.spawn().expect("Failed to start ChromeDriver")
}
|
||||||
|
|
||||||
|
async fn hide_contentpass_overlay(client: &fantoccini::Client) -> anyhow::Result<()> {
|
||||||
|
for _ in 0..20 {
|
||||||
|
let hidden: bool = client.execute(
|
||||||
|
r#"(() => {
|
||||||
|
const iframe = document.querySelector('iframe[title="Contentpass First Layer"]');
|
||||||
|
if (!iframe) return true;
|
||||||
|
iframe.style.display = 'none';
|
||||||
|
iframe.style.visibility = 'hidden';
|
||||||
|
return false;
|
||||||
|
})()"#,
|
||||||
|
vec![]
|
||||||
|
).await?.as_bool().unwrap_or(false);
|
||||||
|
|
||||||
|
if hidden { break; }
|
||||||
|
tokio::time::sleep(Duration::from_millis(500)).await;
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn scrape_events(client: &fantoccini::Client) -> anyhow::Result<Vec<EconomicEvent>> {
|
||||||
|
// Find all rows
|
||||||
|
let rows = client.find_all(Locator::Css(
|
||||||
|
"#TeletraderForm table tbody tr"
|
||||||
|
)).await?;
|
||||||
|
|
||||||
|
let mut events = vec![];
|
||||||
|
let mut empty_count = 0;
|
||||||
|
let mut useful_count = 0;
|
||||||
|
|
||||||
|
let mut i = 0;
|
||||||
|
while i < rows.len() {
|
||||||
|
let cells = rows[i].find_all(Locator::Css("td")).await?;
|
||||||
|
let texts: Vec<String> = join_all(
|
||||||
|
cells.iter().map(|c| async move { c.text().await.unwrap_or_default() })
|
||||||
|
).await;
|
||||||
|
|
||||||
|
let mut description = String::new();
|
||||||
|
// Try to get the description from the next row if it exists
|
||||||
|
if i + 1 < rows.len() {
|
||||||
|
if let Ok(desc_row) = rows[i + 1].find(Locator::Css("td p")).await {
|
||||||
|
description = desc_row.text().await.unwrap_or_default();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let event = EconomicEvent {
|
||||||
|
country: texts.get(2).cloned().unwrap_or_default(),
|
||||||
|
date: "".to_string(),
|
||||||
|
time: texts.get(0).cloned().unwrap_or_default(),
|
||||||
|
event: texts.get(4).cloned().unwrap_or_default(),
|
||||||
|
actual: texts.get(7).cloned().unwrap_or_default(),
|
||||||
|
forecast: texts.get(6).cloned().unwrap_or_default(),
|
||||||
|
previous: texts.get(5).cloned().unwrap_or_default(),
|
||||||
|
importance: texts.get(3).cloned().unwrap_or_default(),
|
||||||
|
description,
|
||||||
|
};
|
||||||
|
|
||||||
|
if event.event.trim().is_empty() && event.country.trim().is_empty() {
|
||||||
|
empty_count += 1;
|
||||||
|
} else {
|
||||||
|
useful_count += 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
events.push(event);
|
||||||
|
i += 2; // skip the description row
|
||||||
|
}
|
||||||
|
|
||||||
|
println!("Total events found: {}", events.len());
|
||||||
|
println!("Useful events: {}", useful_count);
|
||||||
|
println!("Empty events: {}", empty_count);
|
||||||
|
|
||||||
|
for e in events.iter().filter(|ev| !ev.event.trim().is_empty() && !ev.country.trim().is_empty()) {
|
||||||
|
println!("{:?}", e);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*for e in &events {
|
||||||
|
println!("{:#?}", e);
|
||||||
|
}*/
|
||||||
|
|
||||||
|
Ok(events)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::main]
|
||||||
|
async fn main() -> anyhow::Result<()> {
|
||||||
|
let port = 9515; // pick a port you like
|
||||||
|
let mut chromedriver = start_chromedriver(port);
|
||||||
|
sleep(Duration::from_secs(2)).await; // wait for ChromeDriver to start
|
||||||
|
|
||||||
|
// Chrome options (non-headless so it opens)
|
||||||
|
let caps_value = serde_json::json!({
|
||||||
|
"goog:chromeOptions": {
|
||||||
|
"args": [
|
||||||
|
//"--headless",
|
||||||
|
"--disable-gpu",
|
||||||
|
"--disable-notifications",
|
||||||
|
"--disable-popup-blocking",
|
||||||
|
"--disable-blink-features=AutomationControlled"
|
||||||
|
],
|
||||||
|
"excludeSwitches": ["enable-automation"]
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
let caps_map: Map<String, Value> = caps_value.as_object()
|
||||||
|
.expect("Capabilities should be a JSON object")
|
||||||
|
.clone();
|
||||||
|
|
||||||
|
let mut client = ClientBuilder::native()
|
||||||
|
.capabilities(caps_map)
|
||||||
|
.connect(&format!("http://localhost:{}", port))
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
// Setup graceful shutdown on Ctrl+C
|
||||||
|
let shutdown_client = client.clone();
|
||||||
|
let shutdown_handle = tokio::spawn(async move {
|
||||||
|
signal::ctrl_c().await.expect("Failed to listen for ctrl+c");
|
||||||
|
println!("\nCtrl+C received, shutting down...");
|
||||||
|
shutdown_client.close().await.ok();
|
||||||
|
chromedriver.kill().ok();
|
||||||
|
std::process::exit(0);
|
||||||
|
});
|
||||||
|
|
||||||
|
// Go to page
|
||||||
|
let url = "https://www.finanzen.net/termine/wirtschaftsdaten/";
|
||||||
|
client.goto(url).await?;
|
||||||
|
|
||||||
|
let _ = client.execute(
|
||||||
|
r#"(() => {
|
||||||
|
const overlay = document.querySelector('iframe[title="Contentpass First Layer"]');
|
||||||
|
if (overlay) {
|
||||||
|
overlay.style.display = "none";
|
||||||
|
overlay.style.visibility = "hidden";
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
})()"#,
|
||||||
|
vec![]
|
||||||
|
).await;
|
||||||
|
|
||||||
|
for _ in 0..30 {
|
||||||
|
// Check if the iframe exists
|
||||||
|
let overlay_hidden: bool = client.execute(
|
||||||
|
r#"(() => {
|
||||||
|
const iframe = document.querySelector('iframe[title="Contentpass First Layer"]');
|
||||||
|
if (!iframe) return true; // already gone
|
||||||
|
|
||||||
|
// Try clicking button inside iframe via contentWindow
|
||||||
|
try {
|
||||||
|
const btn = iframe.contentWindow.document.querySelector('button');
|
||||||
|
if(btn) btn.click();
|
||||||
|
} catch(e) {}
|
||||||
|
|
||||||
|
// Hide the iframe itself
|
||||||
|
iframe.style.display = 'none';
|
||||||
|
iframe.style.visibility = 'hidden';
|
||||||
|
return false; // still hidden
|
||||||
|
})()"#,
|
||||||
|
vec![]
|
||||||
|
).await.ok()
|
||||||
|
.and_then(|v| v.as_bool())
|
||||||
|
.unwrap_or(false);
|
||||||
|
|
||||||
|
if overlay_hidden {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
sleep(Duration::from_millis(500)).await;
|
||||||
|
}
|
||||||
|
|
||||||
|
client.enter_parent_frame().await.ok();
|
||||||
|
|
||||||
|
// Set start and end dates
|
||||||
|
let start_date = "2024-01-01";
|
||||||
|
let end_date = "2025-01-01";
|
||||||
|
|
||||||
|
let set_dates_script = format!(r#"
|
||||||
|
(() => {{
|
||||||
|
const fromInput = document.querySelector('#dtTeletraderFromDate');
|
||||||
|
const toInput = document.querySelector('#dtTeletraderEndDate');
|
||||||
|
|
||||||
|
if (fromInput) {{
|
||||||
|
fromInput.value = '{}';
|
||||||
|
fromInput.dispatchEvent(new Event('input', {{ bubbles: true }}));
|
||||||
|
fromInput.dispatchEvent(new Event('change', {{ bubbles: true }}));
|
||||||
|
}}
|
||||||
|
|
||||||
|
if (toInput) {{
|
||||||
|
toInput.value = '{}';
|
||||||
|
toInput.dispatchEvent(new Event('input', {{ bubbles: true }}));
|
||||||
|
toInput.dispatchEvent(new Event('change', {{ bubbles: true }}));
|
||||||
|
}}
|
||||||
|
|
||||||
|
return !!fromInput && !!toInput;
|
||||||
|
}})()
|
||||||
|
"#, start_date, end_date);
|
||||||
|
|
||||||
|
let dates_set = client.execute(&set_dates_script, vec![])
|
||||||
|
.await
|
||||||
|
.ok()
|
||||||
|
.and_then(|v| v.as_bool())
|
||||||
|
.unwrap_or(false);
|
||||||
|
|
||||||
|
if dates_set {
|
||||||
|
println!("Dates set successfully from {} to {}", start_date, end_date);
|
||||||
|
} else {
|
||||||
|
println!("Failed to set dates");
|
||||||
|
}
|
||||||
|
|
||||||
|
// Hide any reappearing overlay
|
||||||
|
hide_contentpass_overlay(&client).await?;
|
||||||
|
|
||||||
|
// Wait for the tab to appear and click it
|
||||||
|
if let Ok(_) = client.find(Locator::Css(
|
||||||
|
"#TeletraderForm > article.page-content__item.page-content__item--space.margin-bottom-1\\.00.margin-top-1\\.00-md > div.tab-region > nav > div > div > div.tab__item.tab__item--active"
|
||||||
|
)).await {
|
||||||
|
// Example: click "Hohe Relevanz" tab
|
||||||
|
if let Ok(tab) = client.find(Locator::Css(r#"div[data-sg-tab-item="teletrader-dates-three-stars"]"#)).await {
|
||||||
|
tab.click().await?;
|
||||||
|
}
|
||||||
|
println!("Importance tab clicked");
|
||||||
|
} else {
|
||||||
|
println!("Importance tab not found");
|
||||||
|
}
|
||||||
|
|
||||||
|
// Wait a bit for the table to load
|
||||||
|
sleep(Duration::from_secs(2)).await;
|
||||||
|
|
||||||
|
// Find all table rows
|
||||||
|
let rows = client.find_all(Locator::Css(
|
||||||
|
"#TeletraderForm table.table tbody tr"
|
||||||
|
)).await?;
|
||||||
|
|
||||||
|
println!("Found {} table rows", rows.len());
|
||||||
|
|
||||||
|
// HashMap to store "Termin" -> description
|
||||||
|
let mut termin_map: HashMap<String, String> = HashMap::new();
|
||||||
|
|
||||||
|
let mut i = 0;
|
||||||
|
while i < rows.len() {
|
||||||
|
let row = &rows[i];
|
||||||
|
|
||||||
|
// Extract all cells
|
||||||
|
let cells = row.find_all(Locator::Css("td")).await?;
|
||||||
|
|
||||||
|
if cells.len() >= 5 {
|
||||||
|
// Get Termin column text
|
||||||
|
let termin_text = cells[4].text().await.unwrap_or_default();
|
||||||
|
|
||||||
|
// Check if next row is a hidden description row
|
||||||
|
if i + 1 < rows.len() {
|
||||||
|
let next_row = &rows[i + 1];
|
||||||
|
let class = next_row.attr("class").await.unwrap_or(None).unwrap_or_default();
|
||||||
|
if class.starts_with("table__td teletrader") {
|
||||||
|
// Get the hidden description
|
||||||
|
let desc_cell = next_row.find(Locator::Css("td")).await?;
|
||||||
|
let desc_text = desc_cell.text().await.unwrap_or_default();
|
||||||
|
termin_map.insert(termin_text.clone(), desc_text);
|
||||||
|
i += 1; // skip next row since it's the hidden description
|
||||||
|
} else {
|
||||||
|
termin_map.insert(termin_text.clone(), "".to_string());
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
termin_map.insert(termin_text.clone(), "".to_string());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
i += 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
let events = scrape_events(&client).await?;
|
||||||
|
|
||||||
|
println!("Collected {} Termin entries", termin_map.len());
|
||||||
|
for (k, v) in &termin_map {
|
||||||
|
println!("{:?} => {:?}", k, v);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Wait for Ctrl+C
|
||||||
|
shutdown_handle.await.ok();
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user