adding economic calendar data to webscraper
src/economic/extraction_script.js (new file, 60 lines)
@@ -0,0 +1,60 @@
// src/economic/extraction_script.js
// Runs in the browser via WebDriver `execute`; returns an array of event objects.
const events = [];
let currentDate = '';

const rows = document.querySelectorAll('#TeletraderForm table tbody tr');

for (let i = 0; i < rows.length; i++) {
  const row = rows[i];
  const cells = row.querySelectorAll('td');

  // Date header rows span the full table width, e.g. "12. März 2025".
  if (cells.length === 1 && cells[0].colSpan === 9) {
    const dateText = cells[0].textContent.trim();
    const monthMap = {
      'Januar': '01', 'Februar': '02', 'März': '03', 'April': '04',
      'Mai': '05', 'Juni': '06', 'Juli': '07', 'August': '08',
      'September': '09', 'Oktober': '10', 'November': '11', 'Dezember': '12'
    };
    const match = dateText.match(/(\d{1,2})\.\s+([a-zA-ZäöüßÄÖÜ]+)\s+(\d{4})/);
    if (match) {
      const day = match[1].padStart(2, '0');
      const month = monthMap[match[2]] || '01';
      const year = match[3];
      currentDate = `${year}-${month}-${day}`; // ISO form, e.g. 2025-03-12
    } else {
      currentDate = '';
    }
    continue;
  }

  // Regular event rows carry at least eight cells.
  if (cells.length >= 8) {
    const time = cells[0]?.textContent?.trim() || '';
    const country = cells[2]?.textContent?.trim() || '';
    const eventName = cells[4]?.textContent?.trim() || '';
    if (!time || !country || !eventName) continue;

    // Importance is rendered as yellow stars; keep only three-star (high-impact) events.
    const yellowStars = cells[3]?.querySelectorAll('.icon--star.font-color-yellow').length || 0;
    if (yellowStars !== 3) continue;

    // The following row may hold a free-text description paragraph.
    let description = '';
    if (i + 1 < rows.length) {
      const next = rows[i + 1];
      const descP = next.querySelector('p');
      if (descP) description = descP.textContent?.trim() || '';
    }

    events.push({
      country,
      date: currentDate,
      time,
      event: eventName,
      actual: cells[7]?.textContent?.trim() || '',
      forecast: cells[6]?.textContent?.trim() || '',
      previous: cells[5]?.textContent?.trim() || '',
      importance: 'High',
      description
    });
  }
}

return events;
src/economic/helpers.rs (new file, 62 lines)
@@ -0,0 +1,62 @@
// src/economic/helpers.rs
use super::types::*;
use chrono::Local;
use std::collections::HashMap;

/// Primary key: an event is identified by its date, time, and name.
pub fn event_key(e: &EconomicEvent) -> String {
    format!("{}|{}|{}", e.date, e.time, e.event)
}

/// Coarser key — country, event name, and year-month — for re-matching
/// events whose exact day or time shifted.
pub fn identity_key(e: &EconomicEvent) -> String {
    format!("{}|{}|{}", e.country, e.event, e.date.split('-').take(2).collect::<Vec<_>>().join("-"))
}

pub fn build_identity_lookup(events: &HashMap<String, EconomicEvent>) -> HashMap<String, (String, EconomicEvent)> {
    let mut map = HashMap::new();
    for (k, e) in events {
        map.insert(identity_key(e), (k.clone(), e.clone()));
    }
    map
}

pub fn build_date_event_lookup(
    events: &HashMap<String, EconomicEvent>,
) -> HashMap<String, Vec<(String, EconomicEvent)>> {
    let mut map: HashMap<String, Vec<(String, EconomicEvent)>> = HashMap::new();

    for (k, e) in events {
        let key = format!("{}|{}|{}", e.country, e.event, e.date);
        map.entry(key).or_default().push((k.clone(), e.clone()));
    }
    map
}

/// Compare two versions of the same event field by field; events that are
/// not in the future (date <= today) are left alone.
pub fn detect_changes(old: &EconomicEvent, new: &EconomicEvent, today: &str) -> Vec<EventChange> {
    let mut changes = Vec::new();
    let ts = Local::now().format("%Y-%m-%d %H:%M:%S").to_string();

    if new.date.as_str() <= today { return changes; }

    let fields = [
        ("actual", &old.actual, &new.actual),
        ("forecast", &old.forecast, &new.forecast),
        ("previous", &old.previous, &new.previous),
        ("description", &old.description, &new.description),
    ];

    for (field, old_val, new_val) in fields {
        if old_val != new_val {
            changes.push(EventChange {
                date: new.date.clone(),
                event: new.event.clone(),
                country: new.country.clone(),
                change_type: field.to_string(),
                field_changed: field.to_string(),
                old_value: old_val.clone(),
                new_value: new_val.clone(),
                detected_at: ts.clone(),
            });
        }
    }
    changes
}
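A minimal test sketch (invented values) of the change-detection contract these helpers define — the key stays stable across a revision, and only the revised field is reported:

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn forecast_revision_yields_one_change() {
        // Invented example: a forecast revision on a future event.
        let old = EconomicEvent {
            country: "USA".into(), date: "2025-06-12".into(), time: "14:30".into(),
            event: "CPI".into(), actual: "".into(), forecast: "3.1%".into(),
            previous: "3.4%".into(), importance: "High".into(), description: "".into(),
        };
        let mut new = old.clone();
        new.forecast = "3.2%".into();

        // Same date|time|event, so this counts as an update, not a new entry ...
        assert_eq!(event_key(&old), event_key(&new));
        // ... and exactly one change record is produced, for the forecast field.
        let changes = detect_changes(&old, &new, "2025-06-01");
        assert_eq!(changes.len(), 1);
        assert_eq!(changes[0].field_changed, "forecast");
    }
}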
src/economic/mod.rs (new file, 11 lines)
@@ -0,0 +1,11 @@
// src/economic/mod.rs
pub mod types;
pub mod scraper;
pub mod storage;
pub mod update;
pub mod helpers;

pub use types::*;
pub use scraper::*;
pub use update::run_full_update;
pub use helpers::*;
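For orientation, a sketch of how these re-exports get wired together. It assumes a WebDriver server (chromedriver/geckodriver) already listening on localhost:4444 and the Config type from crate::config that update.rs expects; the connection details are illustrative, not part of this commit.

// Illustrative wiring only — the endpoint and error handling are assumptions.
use crate::config::Config;
use crate::economic::{goto_and_prepare, run_full_update};

async fn scrape_economic_calendar(config: &Config) -> anyhow::Result<()> {
    let client = fantoccini::ClientBuilder::native()
        .connect("http://localhost:4444")
        .await?;

    goto_and_prepare(&client).await?;        // open calendar, dismiss overlays, pick 3-star tab
    run_full_update(&client, config).await?; // scrape in date windows, diff, persist chunks

    client.close().await?;
    Ok(())
}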
src/economic/scraper.rs (new file, 84 lines)
@@ -0,0 +1,84 @@
// src/economic/scraper.rs
use super::types::EconomicEvent;
use fantoccini::Client;
use tokio::time::{sleep, Duration};

const EXTRACTION_JS: &str = include_str!("extraction_script.js");

pub async fn goto_and_prepare(client: &Client) -> anyhow::Result<()> {
    client.goto("https://www.finanzen.net/termine/wirtschaftsdaten/").await?;
    dismiss_overlays(client).await?;

    // Switch to the three-star (high-importance) tab if it is present.
    if let Ok(tab) = client.find(fantoccini::Locator::Css(r#"div[data-sg-tab-item="teletrader-dates-three-stars"]"#)).await {
        tab.click().await?;
        println!("High importance tab selected");
        sleep(Duration::from_secs(2)).await;
    }
    Ok(())
}

/// Remove the consent iframe that blocks interaction; retry for up to ~5 s.
pub async fn dismiss_overlays(client: &Client) -> anyhow::Result<()> {
    for _ in 0..10 {
        let removed: bool = client
            .execute(
                r#"(() => {
                    const iframe = document.querySelector('iframe[title="Contentpass First Layer"]');
                    if (iframe && iframe.parentNode) {
                        iframe.parentNode.removeChild(iframe);
                        return true;
                    }
                    return false;
                })()"#,
                vec![],
            )
            .await?
            .as_bool()
            .unwrap_or(false);
        if removed { break; }
        sleep(Duration::from_millis(500)).await;
    }
    Ok(())
}

/// Fill the from/to date inputs and fire `change` so the page reloads the table.
pub async fn set_date_range(client: &Client, start: &str, end: &str) -> anyhow::Result<()> {
    let script = format!(
        r#"
        (() => {{
            const from = document.querySelector('#dtTeletraderFromDate');
            const to = document.querySelector('#dtTeletraderEndDate');
            if (from) {{ from.value = '{}'; from.dispatchEvent(new Event('change', {{bubbles: true}})); }}
            if (to) {{ to.value = '{}'; to.dispatchEvent(new Event('change', {{bubbles: true}})); }}
            return true;
        }})()
        "#,
        start, end
    );
    client.execute(&script, vec![]).await?;
    sleep(Duration::from_millis(1200)).await;
    Ok(())
}

pub async fn extract_events(client: &Client) -> anyhow::Result<Vec<EconomicEvent>> {
    let result = client.execute(EXTRACTION_JS, vec![]).await?;
    let array = result.as_array().ok_or_else(|| anyhow::anyhow!("Expected array"))?;

    let mut events = Vec::with_capacity(array.len());
    for val in array {
        if let Some(obj) = val.as_object() {
            events.push(EconomicEvent {
                country: obj["country"].as_str().unwrap_or("").to_string(),
                date: obj["date"].as_str().unwrap_or("").to_string(),
                time: obj["time"].as_str().unwrap_or("").to_string(),
                event: obj["event"].as_str().unwrap_or("").to_string(),
                actual: obj["actual"].as_str().unwrap_or("").to_string(),
                forecast: obj["forecast"].as_str().unwrap_or("").to_string(),
                previous: obj["previous"].as_str().unwrap_or("").to_string(),
                importance: "High".to_string(),
                description: obj["description"].as_str().unwrap_or("").to_string(),
            });
        }
    }
    println!("Extracted {} high-impact events", events.len());
    Ok(events)
}
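Since EconomicEvent derives Deserialize (see types.rs), the manual field mapping above could arguably collapse into a single serde_json::from_value call. A sketch, not the committed code, with the caveat that a missing key in the JS output then becomes a hard error rather than an empty string:

// Alternative sketch: let serde map the fields.
// Trade-off vs. the manual unwrap_or("") version: a missing key fails here.
pub async fn extract_events_serde(client: &Client) -> anyhow::Result<Vec<EconomicEvent>> {
    let result = client.execute(EXTRACTION_JS, vec![]).await?;
    Ok(serde_json::from_value(result)?)
}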
src/economic/storage.rs (new file, 113 lines)
@@ -0,0 +1,113 @@
// src/economic/storage.rs
use super::types::*;
use super::helpers::*;
use std::collections::HashMap;
use tokio::fs;
use chrono::{Local, NaiveDate, Datelike};

/// Find all chunk files on disk. Filenames follow
/// `chunk_<start>_<end>.json` with both dates as YYYY-MM-DD.
pub async fn scan_existing_chunks() -> anyhow::Result<Vec<ChunkInfo>> {
    let dir = std::path::Path::new("economic_events");
    let mut chunks = Vec::new();

    if dir.exists() {
        let mut entries = fs::read_dir(dir).await?;
        while let Some(entry) = entries.next_entry().await? {
            let path = entry.path();
            if path.extension().map(|e| e == "json").unwrap_or(false) {
                if let Some(name) = path.file_name().and_then(|n| n.to_str()) {
                    if name.starts_with("chunk_") {
                        if let Ok(content) = fs::read_to_string(&path).await {
                            if let Ok(events) = serde_json::from_str::<Vec<EconomicEvent>>(&content) {
                                // Byte offsets into "chunk_YYYY-MM-DD_YYYY-MM-DD.json".
                                let start = name[6..16].to_string();
                                let end = name[17..27].to_string();
                                chunks.push(ChunkInfo { start_date: start, end_date: end, path, event_count: events.len() });
                            }
                        }
                    }
                }
            }
        }
    }
    chunks.sort_by_key(|c| c.start_date.clone());
    Ok(chunks)
}

pub async fn load_existing_events(chunks: &[ChunkInfo]) -> anyhow::Result<HashMap<String, EconomicEvent>> {
    let mut map = HashMap::new();
    for chunk in chunks {
        let content = fs::read_to_string(&chunk.path).await?;
        let events: Vec<EconomicEvent> = serde_json::from_str(&content)?;
        for e in events {
            map.insert(event_key(&e), e);
        }
    }
    Ok(map)
}

/// Re-partition all events into chunks of at most 500 events or ~100 days.
pub async fn save_optimized_chunks(events: HashMap<String, EconomicEvent>) -> anyhow::Result<()> {
    let dir = std::path::Path::new("economic_events");
    fs::create_dir_all(dir).await?;

    let mut sorted: Vec<_> = events.into_values().collect();
    sorted.sort_by_key(|e| e.date.clone());

    let mut chunk = Vec::new();
    let mut start: Option<NaiveDate> = None;
    for e in sorted {
        let date = NaiveDate::parse_from_str(&e.date, "%Y-%m-%d")?;
        if let Some(s) = start {
            if (date - s).num_days() > 100 || chunk.len() >= 500 {
                save_chunk(&chunk, dir).await?;
                chunk.clear();
                start = Some(date);
            }
        } else {
            start = Some(date);
        }
        chunk.push(e);
    }
    if !chunk.is_empty() {
        save_chunk(&chunk, dir).await?;
    }
    Ok(())
}

async fn save_chunk(events: &[EconomicEvent], dir: &std::path::Path) -> anyhow::Result<()> {
    // Callers only pass non-empty slices, so min/max cannot fail here.
    let start = events.iter().map(|e| &e.date).min().unwrap().clone();
    let end = events.iter().map(|e| &e.date).max().unwrap().clone();
    let path = dir.join(format!("chunk_{}_{}.json", start, end));
    fs::write(&path, serde_json::to_string_pretty(events)?).await?;
    Ok(())
}

/// Append change records to one JSON file per calendar month (MM_YYYY).
pub async fn save_changes(changes: &[EventChange]) -> anyhow::Result<()> {
    if changes.is_empty() { return Ok(()); }
    let dir = std::path::Path::new("economic_event_changes");
    fs::create_dir_all(dir).await?;

    let mut by_month: HashMap<String, Vec<EventChange>> = HashMap::new();
    for c in changes {
        if let Ok(d) = NaiveDate::parse_from_str(&c.date, "%Y-%m-%d") {
            let key = format!("{:02}_{}", d.month(), d.year());
            by_month.entry(key).or_default().push(c.clone());
        }
    }

    for (month, list) in by_month {
        let path = dir.join(format!("event_changes_{}.json", month));
        let mut all: Vec<EventChange> = if path.exists() {
            let s = fs::read_to_string(&path).await?;
            serde_json::from_str(&s).unwrap_or_default()
        } else { vec![] };
        all.extend(list);
        fs::write(&path, serde_json::to_string_pretty(&all)?).await?;
    }
    Ok(())
}

pub fn target_end_date() -> String {
    let now = Local::now().naive_local().date();
    let future = now + chrono::Duration::days(90);
    future.format("%Y-%m-%d").to_string()
}
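The fixed byte offsets in scan_existing_chunks only work because save_chunk always emits dates in the ten-character YYYY-MM-DD form; a quick test sketch pinning that assumption down:

#[cfg(test)]
mod tests {
    #[test]
    fn chunk_filename_offsets() {
        // save_chunk writes e.g. "chunk_2025-01-02_2025-04-10.json";
        // scan_existing_chunks slices the two dates at fixed offsets.
        let name = "chunk_2025-01-02_2025-04-10.json";
        assert_eq!(&name[6..16], "2025-01-02");
        assert_eq!(&name[17..27], "2025-04-10");
    }
}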
src/economic/types.rs (new file, 41 lines)
@@ -0,0 +1,41 @@
// src/economic/types.rs
use serde::{Deserialize, Serialize};

#[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq, Hash)]
pub struct EconomicEvent {
    pub country: String,
    pub date: String, // YYYY-MM-DD
    pub time: String,
    pub event: String,
    pub actual: String,
    pub forecast: String,
    pub previous: String,
    pub importance: String,
    pub description: String,
}

#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct EventChange {
    pub date: String,
    pub event: String,
    pub country: String,
    pub change_type: String, // actual|forecast|previous|description|time|newly_added|removed
    pub field_changed: String,
    pub old_value: String,
    pub new_value: String,
    pub detected_at: String,
}

#[derive(Debug)]
pub struct ChunkInfo {
    pub start_date: String,
    pub end_date: String,
    pub path: std::path::PathBuf,
    pub event_count: usize,
}

#[derive(Debug)]
pub struct ScrapeResult {
    pub changes: Vec<EventChange>,
    pub removed_keys: std::collections::HashSet<String>,
}
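For reference, a sample record (invented values) showing the flat shape that chunk files, change logs, and the extraction script all share:

// Invented example; serde_json::to_string_pretty(&sample_event()) yields a
// JSON object whose keys match these field names exactly.
fn sample_event() -> EconomicEvent {
    EconomicEvent {
        country: "DE".into(),
        date: "2025-03-12".into(), // YYYY-MM-DD, as built by the extraction script
        time: "10:00".into(),
        event: "ZEW-Konjunkturerwartungen".into(),
        actual: "".into(),          // empty until the release is published
        forecast: "15.0".into(),
        previous: "13.1".into(),
        importance: "High".into(),  // only three-star events are scraped
        description: "".into(),
    }
}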
src/economic/update.rs (new file, 116 lines)
@@ -0,0 +1,116 @@
// src/economic/update.rs
use super::{scraper::*, storage::*, helpers::*, types::*};
use crate::config::Config;
use chrono::Local;

pub async fn run_full_update(client: &fantoccini::Client, config: &Config) -> anyhow::Result<()> {
    let today_str = Local::now().date_naive().format("%Y-%m-%d").to_string();
    let end_date = config.target_end_date();

    let chunks = scan_existing_chunks().await?;
    let mut events = load_existing_events(&chunks).await?;
    println!("Loaded {} events from {} chunks", events.len(), chunks.len());

    // Resume point: full history on first run, today if future events are
    // already stored, otherwise the day after the newest stored event.
    let start_date = if events.is_empty() {
        config.economic_start_date.clone()
    } else if events.values().any(|e| e.date >= today_str) {
        today_str.clone()
    } else {
        events.values()
            .filter_map(|e| chrono::NaiveDate::parse_from_str(&e.date, "%Y-%m-%d").ok())
            .max()
            .and_then(|d| d.succ_opt())
            .map(|d| d.format("%Y-%m-%d").to_string())
            .unwrap_or(today_str.clone())
    };

    println!("Scraping economic events: {} → {}", start_date, end_date);

    let mut current = start_date;
    let mut total_changes = 0;

    // Walk forward in windows: scrape, diff against stored state, then
    // advance to the day after the newest event in the batch.
    while current <= end_date {
        set_date_range(client, &current, &end_date).await?;
        tokio::time::sleep(tokio::time::Duration::from_secs(3)).await;

        let new_events = extract_events(client).await?;
        if new_events.is_empty() { break; }

        let result = process_batch(&new_events, &mut events, &today_str);
        total_changes += result.changes.len();
        save_changes(&result.changes).await?;

        let next = new_events.iter()
            .filter_map(|e| chrono::NaiveDate::parse_from_str(&e.date, "%Y-%m-%d").ok())
            .max()
            .and_then(|d| d.succ_opt())
            .map(|d| d.format("%Y-%m-%d").to_string())
            .unwrap_or(end_date.clone());

        if next > end_date { break; }
        current = next;
    }

    save_optimized_chunks(events).await?;
    println!("Economic update complete — {} changes detected", total_changes);
    Ok(())
}

pub fn process_batch(
    new_events: &[EconomicEvent],
    existing: &mut std::collections::HashMap<String, EconomicEvent>,
    today: &str,
) -> ScrapeResult {
    let mut changes = Vec::new();
    let mut removed = std::collections::HashSet::new();

    // Built for month-level re-matching; not consulted yet.
    let _identity_map = build_identity_lookup(existing);
    let date_map = build_date_event_lookup(existing);

    for new in new_events {
        let key = event_key(new);

        // Exact match on date|time|event: diff field by field, update in place.
        if let Some(old) = existing.get(&key) {
            changes.extend(detect_changes(old, new, today));
            existing.insert(key, new.clone());
            continue;
        }

        // Same country/event/date under a different key means the time moved.
        let date_key = format!("{}|{}|{}", new.country, new.event, new.date);
        if let Some(occurrences) = date_map.get(&date_key) {
            if let Some((old_key, old_event)) = occurrences.iter().find(|(k, _)| *k != key) {
                if new.date.as_str() > today {
                    changes.push(EventChange {
                        date: new.date.clone(),
                        event: new.event.clone(),
                        country: new.country.clone(),
                        change_type: "time".to_string(),
                        field_changed: "time".to_string(),
                        old_value: old_event.time.clone(),
                        new_value: new.time.clone(),
                        detected_at: Local::now().format("%Y-%m-%d %H:%M:%S").to_string(),
                    });
                }
                removed.insert(old_key.clone());
                existing.remove(old_key);
            }
        }

        // Otherwise it is a genuinely new entry (only reported for future dates).
        if new.date.as_str() > today {
            changes.push(EventChange {
                date: new.date.clone(),
                event: new.event.clone(),
                country: new.country.clone(),
                change_type: "newly_added".to_string(),
                field_changed: "new_event".to_string(),
                old_value: "".to_string(),
                new_value: format!("{} @ {}", new.date, new.time),
                detected_at: Local::now().format("%Y-%m-%d %H:%M:%S").to_string(),
            });
        }

        existing.insert(key, new.clone());
    }

    ScrapeResult { changes, removed_keys: removed }
}
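A small sanity-check sketch for process_batch (hypothetical test; the far-future date makes the event count as upcoming):

#[cfg(test)]
mod tests {
    use super::*;
    use std::collections::HashMap;

    #[test]
    fn new_future_event_is_reported_once() {
        let e = EconomicEvent {
            country: "USA".into(), date: "2099-01-15".into(), time: "14:30".into(),
            event: "Nonfarm Payrolls".into(), actual: "".into(), forecast: "".into(),
            previous: "".into(), importance: "High".into(), description: "".into(),
        };
        let mut existing = HashMap::new();

        // First sighting: one "newly_added" change, event stored under its key.
        let result = process_batch(&[e.clone()], &mut existing, "2025-01-01");
        assert_eq!(result.changes.len(), 1);
        assert_eq!(result.changes[0].change_type, "newly_added");
        assert!(existing.contains_key(&event_key(&e)));

        // A second pass with identical data produces no further changes.
        let result2 = process_batch(&[e], &mut existing, "2025-01-01");
        assert!(result2.changes.is_empty());
    }
}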