adding corporate data to webscraper

2025-11-21 00:17:59 +01:00
parent 0ea3fcc3b5
commit 9d0d15f3f8
18 changed files with 2128 additions and 970 deletions

src/economic/extraction_script.js Normal file

@@ -0,0 +1,60 @@
// src/economic/extraction_script.js
// Runs in the browser via WebDriver "Execute Script", which wraps this file
// in a function body, so the top-level `return` at the end is valid.
const events = [];
let currentDate = '';

// German month names as rendered by finanzen.net.
const monthMap = {
    'Januar': '01', 'Februar': '02', 'März': '03', 'April': '04',
    'Mai': '05', 'Juni': '06', 'Juli': '07', 'August': '08',
    'September': '09', 'Oktober': '10', 'November': '11', 'Dezember': '12'
};

const rows = document.querySelectorAll('#TeletraderForm table tbody tr');
for (let i = 0; i < rows.length; i++) {
    const row = rows[i];
    const cells = row.querySelectorAll('td');

    // Date header rows are a single td spanning the table (colspan 9), e.g.
    // "21. November 2025"; remember the date for the event rows below it.
    if (cells.length === 1 && cells[0].colSpan === 9) {
        const dateText = cells[0].textContent.trim();
        const match = dateText.match(/(\d{1,2})\.\s+([a-zA-ZäöüßÄÖÜ]+)\s+(\d{4})/);
        if (match) {
            const day = match[1].padStart(2, '0');
            const month = monthMap[match[2]] || '01';
            const year = match[3];
            currentDate = `${year}-${month}-${day}`;
        } else {
            currentDate = '';
        }
        continue;
    }

    // Event rows: cells[0]=time, cells[2]=country, cells[3]=star rating,
    // cells[4]=name, cells[5]=previous, cells[6]=forecast, cells[7]=actual.
    if (cells.length >= 8) {
        const time = cells[0]?.textContent?.trim() || '';
        const country = cells[2]?.textContent?.trim() || '';
        const eventName = cells[4]?.textContent?.trim() || '';
        if (!time || !country || !eventName) continue;

        // Importance is shown as yellow stars; keep only three-star events.
        const yellowStars = cells[3]?.querySelectorAll('.icon--star.font-color-yellow').length || 0;
        if (yellowStars !== 3) continue;

        // The description, when present, sits in a <p> in the following row.
        let description = '';
        if (i + 1 < rows.length) {
            const descP = rows[i + 1].querySelector('p');
            if (descP) description = descP.textContent?.trim() || '';
        }

        events.push({
            country,
            date: currentDate,
            time,
            event: eventName,
            actual: cells[7]?.textContent?.trim() || '',
            forecast: cells[6]?.textContent?.trim() || '',
            previous: cells[5]?.textContent?.trim() || '',
            importance: 'High',
            description
        });
    }
}
return events;

src/economic/helpers.rs Normal file

@@ -0,0 +1,62 @@
// src/economic/helpers.rs
use super::types::*;
use chrono::Local;
use std::collections::HashMap;

/// Primary key of an event: date, time and name together identify one slot.
pub fn event_key(e: &EconomicEvent) -> String {
    format!("{}|{}|{}", e.date, e.time, e.event)
}

/// Month-level identity: country, event name and the "YYYY-MM" of the date.
pub fn identity_key(e: &EconomicEvent) -> String {
    format!(
        "{}|{}|{}",
        e.country,
        e.event,
        e.date.split('-').take(2).collect::<Vec<_>>().join("-")
    )
}

pub fn build_identity_lookup(
    events: &HashMap<String, EconomicEvent>,
) -> HashMap<String, (String, EconomicEvent)> {
    let mut map = HashMap::new();
    for (k, e) in events {
        map.insert(identity_key(e), (k.clone(), e.clone()));
    }
    map
}

/// Groups events by country|name|date so same-day duplicates (e.g. a moved
/// time slot) can be found without scanning the whole map.
pub fn build_date_event_lookup(
    events: &HashMap<String, EconomicEvent>,
) -> HashMap<String, Vec<(String, EconomicEvent)>> {
    let mut map: HashMap<String, Vec<(String, EconomicEvent)>> = HashMap::new();
    for (k, e) in events {
        let key = format!("{}|{}|{}", e.country, e.event, e.date);
        map.entry(key).or_default().push((k.clone(), e.clone()));
    }
    map
}

/// Field-by-field diff of two versions of the same event. Only future events
/// are diffed: once the date has passed, revisions are no longer tracked.
pub fn detect_changes(old: &EconomicEvent, new: &EconomicEvent, today: &str) -> Vec<EventChange> {
    let mut changes = Vec::new();
    if new.date.as_str() <= today {
        return changes;
    }
    let ts = Local::now().format("%Y-%m-%d %H:%M:%S").to_string();
    let fields = [
        ("actual", &old.actual, &new.actual),
        ("forecast", &old.forecast, &new.forecast),
        ("previous", &old.previous, &new.previous),
        ("description", &old.description, &new.description),
    ];
    for (field, old_val, new_val) in fields {
        if old_val != new_val {
            changes.push(EventChange {
                date: new.date.clone(),
                event: new.event.clone(),
                country: new.country.clone(),
                change_type: field.to_string(),
                field_changed: field.to_string(),
                old_value: old_val.clone(),
                new_value: new_val.clone(),
                detected_at: ts.clone(),
            });
        }
    }
    changes
}
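
A quick illustration of the detect_changes contract (the values are hypothetical, not scraped data): revisions to future events are reported field by field, while events dated today or earlier are ignored. Something like this could sit in a #[cfg(test)] module at the bottom of helpers.rs:

#[cfg(test)]
mod tests {
    use super::*;

    // Hypothetical fixture; only the fields under test vary.
    fn sample(date: &str, forecast: &str) -> EconomicEvent {
        EconomicEvent {
            country: "US".into(),
            date: date.into(),
            time: "14:30".into(),
            event: "CPI".into(),
            actual: String::new(),
            forecast: forecast.into(),
            previous: "3.1%".into(),
            importance: "High".into(),
            description: String::new(),
        }
    }

    #[test]
    fn only_future_events_produce_changes() {
        let old = sample("2025-12-01", "3.2%");
        let new = sample("2025-12-01", "3.4%");
        // Event is after "today": the forecast revision is recorded.
        let changes = detect_changes(&old, &new, "2025-11-21");
        assert_eq!(changes.len(), 1);
        assert_eq!(changes[0].field_changed, "forecast");
        // Once "today" has reached the event date, nothing is reported.
        assert!(detect_changes(&old, &new, "2025-12-01").is_empty());
    }
}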

src/economic/mod.rs Normal file

@@ -0,0 +1,11 @@
// src/economic/mod.rs
pub mod types;
pub mod scraper;
pub mod storage;
pub mod update;
pub mod helpers;
pub use types::*;
pub use scraper::*;
pub use update::run_full_update;
pub use helpers::*;

src/economic/scraper.rs Normal file

@@ -0,0 +1,84 @@
// src/economic/scraper.rs
use super::types::EconomicEvent;
use fantoccini::Client;
use tokio::time::{sleep, Duration};

// The DOM-walking logic lives in a separate JS file and is injected verbatim.
const EXTRACTION_JS: &str = include_str!("extraction_script.js");

pub async fn goto_and_prepare(client: &Client) -> anyhow::Result<()> {
    client.goto("https://www.finanzen.net/termine/wirtschaftsdaten/").await?;
    dismiss_overlays(client).await?;
    // Switch to the three-star ("high importance") tab if it is present.
    if let Ok(tab) = client
        .find(fantoccini::Locator::Css(
            r#"div[data-sg-tab-item="teletrader-dates-three-stars"]"#,
        ))
        .await
    {
        tab.click().await?;
        println!("High importance tab selected");
        sleep(Duration::from_secs(2)).await;
    }
    Ok(())
}

/// Removes the consent-manager iframe that blocks all clicks. Retries for up
/// to ~5 s because the overlay is injected asynchronously after page load.
pub async fn dismiss_overlays(client: &Client) -> anyhow::Result<()> {
    for _ in 0..10 {
        let removed: bool = client
            .execute(
                r#"(() => {
                    const iframe = document.querySelector('iframe[title="Contentpass First Layer"]');
                    if (iframe && iframe.parentNode) {
                        iframe.parentNode.removeChild(iframe);
                        return true;
                    }
                    return false;
                })()"#,
                vec![],
            )
            .await?
            .as_bool()
            .unwrap_or(false);
        if removed {
            break;
        }
        sleep(Duration::from_millis(500)).await;
    }
    Ok(())
}

/// Fills the from/to date inputs directly and fires `change` events so the
/// page's own listeners reload the table.
pub async fn set_date_range(client: &Client, start: &str, end: &str) -> anyhow::Result<()> {
    let script = format!(
        r#"(() => {{
            const from = document.querySelector('#dtTeletraderFromDate');
            const to = document.querySelector('#dtTeletraderEndDate');
            if (from) {{ from.value = '{start}'; from.dispatchEvent(new Event('change', {{bubbles: true}})); }}
            if (to) {{ to.value = '{end}'; to.dispatchEvent(new Event('change', {{bubbles: true}})); }}
            return true;
        }})()"#
    );
    client.execute(&script, vec![]).await?;
    sleep(Duration::from_millis(1200)).await;
    Ok(())
}

pub async fn extract_events(client: &Client) -> anyhow::Result<Vec<EconomicEvent>> {
    let result = client.execute(EXTRACTION_JS, vec![]).await?;
    let array = result
        .as_array()
        .ok_or_else(|| anyhow::anyhow!("Expected array"))?;
    let mut events = Vec::with_capacity(array.len());
    // Use .get() rather than indexing: serde_json's Map panics on missing keys.
    let text = |obj: &serde_json::Map<String, serde_json::Value>, key: &str| {
        obj.get(key).and_then(|v| v.as_str()).unwrap_or("").to_string()
    };
    for val in array {
        if let Some(obj) = val.as_object() {
            events.push(EconomicEvent {
                country: text(obj, "country"),
                date: text(obj, "date"),
                time: text(obj, "time"),
                event: text(obj, "event"),
                actual: text(obj, "actual"),
                forecast: text(obj, "forecast"),
                previous: text(obj, "previous"),
                importance: "High".to_string(),
                description: text(obj, "description"),
            });
        }
    }
    println!("Extracted {} high-impact events", events.len());
    Ok(events)
}
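
Since the extraction script emits objects whose keys match EconomicEvent's field names one-to-one and all values are strings, the manual field copying above could collapse into a single serde call. A sketch, not the committed code (extract_events_serde is a hypothetical name):

pub async fn extract_events_serde(client: &Client) -> anyhow::Result<Vec<EconomicEvent>> {
    let result = client.execute(EXTRACTION_JS, vec![]).await?;
    // EconomicEvent derives Deserialize (see types.rs) and the JS object keys
    // match its field names, so the returned array deserializes directly.
    let events: Vec<EconomicEvent> = serde_json::from_value(result)?;
    println!("Extracted {} high-impact events", events.len());
    Ok(events)
}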

src/economic/storage.rs Normal file

@@ -0,0 +1,113 @@
// src/economic/storage.rs
use super::helpers::*;
use super::types::*;
use chrono::{Datelike, Local, NaiveDate};
use std::collections::HashMap;
use tokio::fs;

/// Lists economic_events/chunk_<start>_<end>.json files and counts their events.
pub async fn scan_existing_chunks() -> anyhow::Result<Vec<ChunkInfo>> {
    let dir = std::path::Path::new("economic_events");
    let mut chunks = Vec::new();
    if dir.exists() {
        let mut entries = fs::read_dir(dir).await?;
        while let Some(entry) = entries.next_entry().await? {
            let path = entry.path();
            if path.extension().map(|e| e == "json").unwrap_or(false) {
                if let Some(name) = path.file_name().and_then(|n| n.to_str()) {
                    // Expect chunk_YYYY-MM-DD_YYYY-MM-DD.json; the length check
                    // keeps the slicing below from panicking on odd names.
                    if name.starts_with("chunk_") && name.len() >= 27 {
                        if let Ok(content) = fs::read_to_string(&path).await {
                            if let Ok(events) = serde_json::from_str::<Vec<EconomicEvent>>(&content) {
                                let start = name[6..16].to_string();
                                let end = name[17..27].to_string();
                                chunks.push(ChunkInfo {
                                    start_date: start,
                                    end_date: end,
                                    path,
                                    event_count: events.len(),
                                });
                            }
                        }
                    }
                }
            }
        }
    }
    chunks.sort_by_key(|c| c.start_date.clone());
    Ok(chunks)
}

pub async fn load_existing_events(chunks: &[ChunkInfo]) -> anyhow::Result<HashMap<String, EconomicEvent>> {
    let mut map = HashMap::new();
    for chunk in chunks {
        let content = fs::read_to_string(&chunk.path).await?;
        let events: Vec<EconomicEvent> = serde_json::from_str(&content)?;
        for e in events {
            map.insert(event_key(&e), e);
        }
    }
    Ok(map)
}

/// Rewrites the full event set as date-sorted chunks, starting a new chunk
/// after a gap of more than 100 days or once a chunk holds 500 events.
pub async fn save_optimized_chunks(events: HashMap<String, EconomicEvent>) -> anyhow::Result<()> {
    let dir = std::path::Path::new("economic_events");
    fs::create_dir_all(dir).await?;
    let mut sorted: Vec<_> = events.into_values().collect();
    sorted.sort_by_key(|e| e.date.clone());
    let mut chunk = Vec::new();
    let mut start: Option<NaiveDate> = None;
    for e in sorted {
        let date = NaiveDate::parse_from_str(&e.date, "%Y-%m-%d")?;
        if let Some(s) = start {
            if (date - s).num_days() > 100 || chunk.len() >= 500 {
                save_chunk(&chunk, dir).await?;
                chunk.clear();
                start = Some(date);
            }
        } else {
            start = Some(date);
        }
        chunk.push(e);
    }
    if !chunk.is_empty() {
        save_chunk(&chunk, dir).await?;
    }
    Ok(())
}

async fn save_chunk(events: &[EconomicEvent], dir: &std::path::Path) -> anyhow::Result<()> {
    // Callers only pass non-empty chunks, so min/max cannot fail here.
    let start = events.iter().map(|e| &e.date).min().unwrap().clone();
    let end = events.iter().map(|e| &e.date).max().unwrap().clone();
    let path = dir.join(format!("chunk_{}_{}.json", start, end));
    fs::write(&path, serde_json::to_string_pretty(events)?).await?;
    Ok(())
}

/// Appends detected changes to per-month files (event_changes_MM_YYYY.json).
pub async fn save_changes(changes: &[EventChange]) -> anyhow::Result<()> {
    if changes.is_empty() {
        return Ok(());
    }
    let dir = std::path::Path::new("economic_event_changes");
    fs::create_dir_all(dir).await?;
    let mut by_month: HashMap<String, Vec<EventChange>> = HashMap::new();
    for c in changes {
        if let Ok(d) = NaiveDate::parse_from_str(&c.date, "%Y-%m-%d") {
            let key = format!("{:02}_{}", d.month(), d.year());
            by_month.entry(key).or_default().push(c.clone());
        }
    }
    for (month, list) in by_month {
        let path = dir.join(format!("event_changes_{}.json", month));
        let mut all: Vec<EventChange> = if path.exists() {
            let s = fs::read_to_string(&path).await?;
            serde_json::from_str(&s).unwrap_or_default()
        } else {
            vec![]
        };
        all.extend(list);
        fs::write(&path, serde_json::to_string_pretty(&all)?).await?;
    }
    Ok(())
}

/// Scrape horizon: 90 days past today.
pub fn target_end_date() -> String {
    let now = Local::now().naive_local().date();
    let future = now + chrono::Duration::days(90);
    future.format("%Y-%m-%d").to_string()
}
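
Note that save_optimized_chunks only ever writes files; once chunk boundaries shift, a superseded chunk_*.json can linger and keep being rescanned. A hypothetical maintenance helper (reindex is not part of this commit; it assumes exclusive access to economic_events/) that rebuilds the directory from scratch:

pub async fn reindex() -> anyhow::Result<()> {
    let chunks = scan_existing_chunks().await?;
    let events = load_existing_events(&chunks).await?;
    // Delete the old files first, then rewrite with the current thresholds
    // (100-day gap / 500 events per chunk).
    for chunk in &chunks {
        fs::remove_file(&chunk.path).await?;
    }
    save_optimized_chunks(events).await
}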

src/economic/types.rs Normal file

@@ -0,0 +1,41 @@
// src/economic/types.rs
use serde::{Deserialize, Serialize};

#[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq, Hash)]
pub struct EconomicEvent {
    pub country: String,
    pub date: String, // YYYY-MM-DD
    pub time: String,
    pub event: String,
    pub actual: String,
    pub forecast: String,
    pub previous: String,
    pub importance: String,
    pub description: String,
}

#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct EventChange {
    pub date: String,
    pub event: String,
    pub country: String,
    pub change_type: String, // actual | forecast | previous | description | time | newly_added
    pub field_changed: String,
    pub old_value: String,
    pub new_value: String,
    pub detected_at: String,
}

/// One chunk_<start>_<end>.json file on disk.
#[derive(Debug)]
pub struct ChunkInfo {
    pub start_date: String,
    pub end_date: String,
    pub path: std::path::PathBuf,
    pub event_count: usize,
}

/// Outcome of merging one scraped batch into the stored event set.
#[derive(Debug)]
pub struct ScrapeResult {
    pub changes: Vec<EventChange>,
    pub removed_keys: std::collections::HashSet<String>,
}

src/economic/update.rs Normal file

@@ -0,0 +1,116 @@
// src/economic/update.rs
use super::{helpers::*, scraper::*, storage::*, types::*};
use crate::config::Config;
use chrono::{Local, NaiveDate};

pub async fn run_full_update(client: &fantoccini::Client, config: &Config) -> anyhow::Result<()> {
    let today_str = Local::now().date_naive().format("%Y-%m-%d").to_string();
    let end_date = config.target_end_date();
    let chunks = scan_existing_chunks().await?;
    let mut events = load_existing_events(&chunks).await?;
    println!("Loaded {} events from {} chunks", events.len(), chunks.len());

    // Resume point: full history on the first run, today if future events are
    // already stored, otherwise the day after the newest stored event.
    let start_date = if events.is_empty() {
        config.economic_start_date.clone()
    } else if events.values().any(|e| e.date >= today_str) {
        today_str.clone()
    } else {
        events
            .values()
            .filter_map(|e| NaiveDate::parse_from_str(&e.date, "%Y-%m-%d").ok())
            .max()
            .and_then(|d| d.succ_opt())
            .map(|d| d.format("%Y-%m-%d").to_string())
            .unwrap_or(today_str.clone())
    };
    println!("Scraping economic events: {} to {}", start_date, end_date);

    let mut current = start_date;
    let mut total_changes = 0;
    // YYYY-MM-DD strings compare correctly in lexicographic order.
    while current <= end_date {
        set_date_range(client, &current, &end_date).await?;
        tokio::time::sleep(tokio::time::Duration::from_secs(3)).await;
        let new_events = extract_events(client).await?;
        if new_events.is_empty() {
            break;
        }
        let result = process_batch(&new_events, &mut events, &today_str);
        total_changes += result.changes.len();
        save_changes(&result.changes).await?;
        // Advance past the newest date in this batch; the page only renders a
        // limited number of rows, so the range is walked in slices.
        let next = new_events
            .iter()
            .filter_map(|e| NaiveDate::parse_from_str(&e.date, "%Y-%m-%d").ok())
            .max()
            .and_then(|d| d.succ_opt())
            .map(|d| d.format("%Y-%m-%d").to_string())
            .unwrap_or(end_date.clone());
        if next > end_date {
            break;
        }
        current = next;
    }
    save_optimized_chunks(events).await?;
    println!("Economic update complete — {} changes detected", total_changes);
    Ok(())
}

pub fn process_batch(
    new_events: &[EconomicEvent],
    existing: &mut std::collections::HashMap<String, EconomicEvent>,
    today: &str,
) -> ScrapeResult {
    let mut changes = Vec::new();
    let mut removed = std::collections::HashSet::new();
    // Snapshot of same-day occurrences, taken before the map is mutated below;
    // it stores clones, so stale entries only cause harmless no-op removals.
    let date_map = build_date_event_lookup(existing);
    for new in new_events {
        let key = event_key(new);
        // Exact match on date|time|event: diff the fields and refresh.
        if let Some(old) = existing.get(&key) {
            changes.extend(detect_changes(old, new, today));
            existing.insert(key, new.clone());
            continue;
        }
        // Same event on the same day under a different key: the time moved.
        let date_key = format!("{}|{}|{}", new.country, new.event, new.date);
        if let Some(occurrences) = date_map.get(&date_key) {
            if let Some((old_key, old_event)) = occurrences.iter().find(|(k, _)| *k != key) {
                if new.date.as_str() > today {
                    changes.push(EventChange {
                        date: new.date.clone(),
                        event: new.event.clone(),
                        country: new.country.clone(),
                        change_type: "time".to_string(),
                        field_changed: "time".to_string(),
                        old_value: old_event.time.clone(),
                        new_value: new.time.clone(),
                        detected_at: Local::now().format("%Y-%m-%d %H:%M:%S").to_string(),
                    });
                }
                removed.insert(old_key.clone());
                existing.remove(old_key);
                // A rescheduled event is a time change, not a new one.
                existing.insert(key, new.clone());
                continue;
            }
        }
        // Genuinely unseen future event.
        if new.date.as_str() > today {
            changes.push(EventChange {
                date: new.date.clone(),
                event: new.event.clone(),
                country: new.country.clone(),
                change_type: "newly_added".to_string(),
                field_changed: "new_event".to_string(),
                old_value: "".to_string(),
                new_value: format!("{} @ {}", new.date, new.time),
                detected_at: Local::now().format("%Y-%m-%d %H:%M:%S").to_string(),
            });
        }
        existing.insert(key, new.clone());
    }
    ScrapeResult { changes, removed_keys: removed }
}
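
For orientation, a hedged sketch of how this module might be driven from the binary; the WebDriver address and the update_economic_calendar wrapper are assumptions, while Config and the re-exports come from the files above:

use crate::config::Config;
use crate::economic::{goto_and_prepare, run_full_update};

pub async fn update_economic_calendar(config: &Config) -> anyhow::Result<()> {
    // Assumes a WebDriver server (e.g. geckodriver) listening locally.
    let client = fantoccini::ClientBuilder::native()
        .connect("http://localhost:4444")
        .await?;
    goto_and_prepare(&client).await?;
    run_full_update(&client, config).await?;
    client.close().await?; // end the browser session cleanly
    Ok(())
}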