Compare commits: 645f7a546b ... main (29 commits)

2fe06a9d88, 9db4320c40, eeae94e041, e57a013224, bbc19f2110, 9cfcae84ea, 7b680f960f, 462f7ca672, cd3f47d91f, fc56ae5d82, 9d0d15f3f8, 0ea3fcc3b5, 71df92965f, 0ca53bf585, 4dec97ef63, 32ae002fc9, c56fcfdd72, 0af0c1e615, a44e22df0b, b8c98163da, 6302c8749a, 3df871f69f, 67ecc1e89a, a91447cace, d6e244c8d8, 0853124918, 59aad09f71, 2604caab0e, e6729b06b8
.gitignore (vendored) — 18 lines added

@@ -16,3 +16,21 @@ target/
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

# env
.env


# Added by cargo

/target

# /chromedriver-win64/*

# data folders
/economic_events*
/economic_event_changes*
/corporate_events*
/corporate_prices*
/corporate_event_changes*
/data*
Cargo.lock (generated, new file) — 3961 lines

File diff suppressed because it is too large.
Cargo.toml (new file) — 50 lines

@@ -0,0 +1,50 @@
[package]
name = "event_backtest_engine"
version = "0.1.0"
edition = "2021"
authors = ["Your Name <you@example.com>"]
description = "High-impact economic & corporate earnings data collector for short-event backtesting (overnight/weekend gaps)"
license = "MIT OR Apache-2.0"
repository = "https://github.com/yourname/event_backtest_engine"
keywords = ["finance", "earnings", "economic-calendar", "backtesting", "quant"]
categories = ["finance", "data-structures", "asynchronous"]

# ===================================================================
# Dependencies
# ===================================================================
[dependencies]
# Async runtime
tokio = { version = "1.38", features = ["full"] }

# Web scraping & HTTP
reqwest = { version = "0.12", features = ["json", "gzip", "brotli", "deflate", "blocking"] }
scraper = "0.19"                                               # HTML parsing for Yahoo earnings pages
fantoccini = { version = "0.20", features = ["rustls-tls"] }   # Headless Chrome for finanzen.net
yfinance-rs = "0.7.2"

# Serialization
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
csv = "1.3"
zip = "6.0.0"
flate2 = "1.1.5"

# Generating
rand = "0.9.2"

# Environment handling
dotenvy = "0.15"

# Date & time
chrono = { version = "0.4", features = ["serde"] }

# Error handling
anyhow = "1.0"

# Logging (optional but recommended)
tracing = "0.1"
tracing-subscriber = { version = "0.3", features = ["fmt", "env-filter"] }

# Parallel processing (for batch tickers)
futures = "0.3"
rayon = "1.10"   # optional: for parallel price downloads
README.md — 251 lines

@@ -1,2 +1,251 @@
# WebScraper
# WebScraper — Economic Calendar Data Extraction

A powerful web scraper written in **Rust** that extracts and analyses high-impact economic events from **finanzen.net**.

---

## 📋 Project Overview

This tool automates the extraction of economic data from the finanzen.net economic calendar, focusing on high-impact events (3 yellow stars). The extracted data is stored in structured JSON format and includes extensive metadata for further analysis.

---

## ✨ Main Features

* **Selective extraction:** Focuses exclusively on high-impact economic events (3 yellow stars).
* **Intelligent chunking:** Automatically splits large date ranges into manageable blocks.
* **Robust date handling:** Supports German and international date formats.
* **Data consistency checks:** Comprehensive validation of the extracted data.
* **Duplicate detection:** Automatically detects and removes duplicate entries.
* **Graceful shutdown:** Clean handling of interrupt signals (Ctrl+C).
* **Real-time export:** Intermediate and final results are saved in parallel.

---

## 🛠 Technology Stack

* **Language:** Rust
* **Web automation:** Fantoccini (WebDriver client)
* **Date/time:** Chrono
* **JSON processing:** Serde, serde_json
* **Async runtime:** Tokio
* **Browser driver:** ChromeDriver

---

## 📁 Project Structure

```
WebScraper/
├── src/
│   └── main.rs            # Main application logic
├── chromedriver-win64/    # ChromeDriver binary
├── Cargo.toml             # Rust dependencies
├── Cargo.lock             # Version locks
├── countries.json         # Country reference data
├── continents.json        # Continent reference data
└── README.md              # This file
```

---

## 📊 Data Model

The scraper extracts `EconomicEvent` structs with the following fields:

```rust
struct EconomicEvent {
    country: String,      // Country of origin
    date: String,         // Date (ISO format)
    time: String,         // Time of day
    event: String,        // Event name
    actual: String,       // Actual value
    forecast: String,     // Forecast value
    previous: String,     // Previous value
    importance: String,   // Importance (e.g. "High")
    description: String   // Description
}
```
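Because the project already depends on serde and serde_json, a record in the export format shown later in this README can be deserialized straight into this struct. The following is a minimal, self-contained sketch (field values are illustrative; it assumes the struct derives `Serialize`/`Deserialize`):

```rust
use serde::{Deserialize, Serialize};

#[derive(Debug, Serialize, Deserialize)]
struct EconomicEvent {
    country: String,
    date: String,
    time: String,
    event: String,
    actual: String,
    forecast: String,
    previous: String,
    importance: String,
    description: String,
}

fn main() -> serde_json::Result<()> {
    // Hypothetical input matching the sample entry shown in the output section.
    let raw = r#"{"country":"USA","date":"2024-01-15","time":"14:30",
        "event":"Verbraucherpreisindex (CPI)","actual":"3.4%","forecast":"3.2%",
        "previous":"3.1%","importance":"High","description":"Monatliche Inflationsdaten"}"#;

    let event: EconomicEvent = serde_json::from_str(raw)?;
    println!("{} on {} at {}", event.event, event.date, event.time);
    Ok(())
}
```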

---

## 🚀 Installation & Setup

### Prerequisites

* Install the **Rust toolchain**:

```bash
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs/ | sh
```

* Download **ChromeDriver**:

  * Website: `https://chromedriver.storage.googleapis.com/index.html`
  * Or: `https://googlechromelabs.github.io/chrome-for-testing/`
  * Unpack it into the `chromedriver-win64/` directory

* A **Chrome browser** must be installed.

### Build & Run

```bash
# Clone/create the project
git clone <repository-url>
cd WebScraper

# Download dependencies
cargo fetch

# Compile and run the project
cargo run --release
```

---

## ⚙️ Configuration

### Date Range

By default the scraper extracts data between configurable bounds. Example call in `main()`:

```rust
let events = scrape_all_events_with_chunking(&client, "2007-02-13", "2025-12-01").await?;
```

### Chrome Options

Chrome behaviour can be adjusted via the capabilities, e.g.:

```json
"args": [
    "--disable-gpu",
    "--disable-notifications",
    "--disable-popup-blocking",
    "--disable-blink-features=AutomationControlled"
]
```

> Note: depending on the use case, the `--headless` flag can be added or removed for headless or headful execution.
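For reference, a minimal sketch of how such arguments could be passed to Fantoccini when the session is created. It assumes fantoccini 0.20 with the `rustls-tls` feature (as in Cargo.toml) and a ChromeDriver already listening on `localhost:9515`; the exact argument list and port are assumptions, not project settings:

```rust
use fantoccini::ClientBuilder;
use serde_json::{json, Map};

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    // Build Chrome capabilities with the args shown above.
    let mut caps = Map::new();
    caps.insert(
        "goog:chromeOptions".to_string(),
        json!({ "args": ["--disable-gpu", "--disable-notifications",
                         "--disable-popup-blocking",
                         "--disable-blink-features=AutomationControlled"] }),
    );

    // Assumes a ChromeDriver instance is already running on port 9515.
    let client = ClientBuilder::rustls()
        .capabilities(caps)
        .connect("http://localhost:9515")
        .await?;

    client.goto("https://www.finanzen.net/termine/wirtschaftsdaten/").await?;
    client.close().await?;
    Ok(())
}
```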

---

## 📈 Execution Flow

1. **Initialization:** Start ChromeDriver and open a browser session
2. **Navigation:** Open the target page (`https://www.finanzen.net/termine/wirtschaftsdaten/`)
3. **Overlay handling:** Remove or bypass interfering elements (cookie/consent banners)
4. **Tab selection:** Select high-impact events (3 stars)
5. **Chunked extraction:**

   * Split the date range into blocks (see the sketch after this list)
   * JavaScript-based data extraction
   * Automatic pagination / "Load more"
6. **Data validation:** Consistency and quality checks
7. **Export:** Generate timestamped JSON files
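A minimal sketch of the date-range splitting used in step 5, based on the chrono crate already in the dependency list (the 90-day chunk size is an illustrative assumption, not the scraper's actual setting):

```rust
use chrono::{Duration, NaiveDate};

/// Split [start, end] into consecutive chunks of at most `chunk_days` days.
fn split_into_chunks(start: &str, end: &str, chunk_days: i64) -> Vec<(NaiveDate, NaiveDate)> {
    let start = NaiveDate::parse_from_str(start, "%Y-%m-%d").expect("bad start date");
    let end = NaiveDate::parse_from_str(end, "%Y-%m-%d").expect("bad end date");

    let mut chunks = Vec::new();
    let mut cursor = start;
    while cursor <= end {
        // Each chunk covers chunk_days calendar days, clipped to the overall end.
        let chunk_end = (cursor + Duration::days(chunk_days - 1)).min(end);
        chunks.push((cursor, chunk_end));
        cursor = chunk_end + Duration::days(1);
    }
    chunks
}

fn main() {
    for (from, to) in split_into_chunks("2007-02-13", "2025-12-01", 90) {
        println!("chunk: {from} → {to}");
    }
}
```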

---

## 🔍 Data Quality Checks

The scraper performs the following checks:

* **Duplicate detection:** Identical events are removed
* **Time format validation:** Correct `HH:MM` formatting (see the sketch below)
* **Date range check:** Extracted events fall within the target period
* **Completeness check:** Critical fields must be present
* **Description coverage:** Checks whether descriptions exist for events
* **Country/month distribution:** Statistical summary
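One way the `HH:MM` check can be expressed with chrono (a sketch; the project's actual validation code lives in `main.rs` and may differ):

```rust
use chrono::NaiveTime;

/// Returns true if `s` is a valid 24-hour "HH:MM" time such as "14:30".
fn is_valid_hhmm(s: &str) -> bool {
    NaiveTime::parse_from_str(s, "%H:%M").is_ok()
}

fn main() {
    assert!(is_valid_hhmm("14:30"));
    assert!(!is_valid_hhmm("25:61"));
    assert!(!is_valid_hhmm("14:30:00")); // seconds are not part of the expected format
}
```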

---

## 📤 Output Formats

**Main export**

* `economic_events_YYYYMMDD_HHMMSS_combined.json` — complete data set

**Chunk exports**

* `economic_events_YYYYMMDD_HHMMSS_chunk_X.json` — intermediate results per block

### Sample Entry (JSON)

```json
{
    "country": "USA",
    "date": "2024-01-15",
    "time": "14:30",
    "event": "Verbraucherpreisindex (CPI)",
    "actual": "3.4%",
    "forecast": "3.2%",
    "previous": "3.1%",
    "importance": "High",
    "description": "Monatliche Inflationsdaten für die USA"
}
```

---

## 🛡️ Error Handling

* **Automatic retry:** For failed extractions
* **Graceful degradation:** Fallback logic for date parsing
* **Timeout management:** Appropriate waits between interactions
* **Resource cleanup:** Browser and driver are shut down correctly

---

## 📊 Performance Characteristics

* **Parallel processing:** Asynchronous operations with Tokio
* **Memory efficiency:** Chunk-based processing of large data sets
* **Network optimization:** Intelligent delays between requests
* **Robustness:** Resilient to page changes

---

## 🔧 Development

**Add dependencies**

```bash
cargo add <crate-name>
```

**Debug mode**

```bash
cargo run
```

**Release build**

```bash
cargo build --release
```

**Run tests**

```bash
cargo test
```

---

## 🌐 Country Coverage

The scraper supports 52 countries and regions (see `countries.json`), including:

* USA, China, Germany, Japan, UK
* Eurozone, Switzerland, Canada, Australia
* and many other major economies

---

## chromedriver Download

https://chromedriver.storage.googleapis.com/index.html
https://googlechromelabs.github.io/chrome-for-testing/
chromedriver-win64/LICENSE.chromedriver (new file) — 27 lines

@@ -0,0 +1,27 @@
// Copyright 2015 The Chromium Authors
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//    * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//    * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//    * Neither the name of Google LLC nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
chromedriver-win64/THIRD_PARTY_NOTICES.chromedriver (new file) — 14682 lines

File diff suppressed because it is too large.
chromedriver-win64/chromedriver.exe (new binary file)

Binary file not shown.
fx_rates.json (new file) — 46 lines

@@ -0,0 +1,46 @@
{
  "CHF": [0.808996035919424, "2025-11-25"],
  "JPY": [0.0064, "2025-11-25"],
  "INR": [89.28571428571429, "2025-11-25"],
  "GBp": [0.7603406326034063, "2025-11-25"],
  "AUD": [1.5463120457708364, "2025-11-25"],
  "SAR": [3.750937734433609, "2025-11-25"],
  "TWD": [31.446540880503143, "2025-11-25"],
  "CNY": [7.087172218284904, "2025-11-25"],
  "HKD": [7.776049766718508, "2025-11-25"],
  "CAD": [1.4110342881332016, "2025-11-25"],
  "EUR": [0.8649022660439372, "2025-11-25"]
}
src/config.rs (new file) — 30 lines

@@ -0,0 +1,30 @@
// src/config.rs
#[derive(Debug, Clone)]
pub struct Config {
    // Economic calendar start (usually the earliest available on finanzen.net)
    pub economic_start_date: String, // e.g. "2007-02-13"

    // Corporate earnings & price history start
    pub corporate_start_date: String, // e.g. "2000-01-01" or "2010-01-01"

    // How far into the future we scrape economic events
    pub economic_lookahead_months: u32, // default: 3
}

impl Default for Config {
    fn default() -> Self {
        Self {
            economic_start_date: "2007-02-13".to_string(),
            corporate_start_date: "2010-01-01".to_string(),
            economic_lookahead_months: 3,
        }
    }
}

impl Config {
    pub fn target_end_date(&self) -> String {
        let now = chrono::Local::now().naive_local().date();
        let future = now + chrono::Duration::days(30 * self.economic_lookahead_months as i64);
        future.format("%Y-%m-%d").to_string()
    }
}
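A quick usage sketch for `target_end_date`, assuming the module is reachable as `crate::config` (it is imported that way elsewhere in this diff). Note that the lookahead is approximated as 30 days per month rather than calendar months:

```rust
use crate::config::Config;

// Illustrative only: prints the rolling end date used for economic-event scraping.
fn example() {
    let cfg = Config::default();
    // With the default lookahead of 3 months this is roughly 90 days from today.
    println!("scraping economic events up to {}", cfg.target_end_date());
}
```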
src/corporate/aggregation.rs (new file) — 194 lines

@@ -0,0 +1,194 @@
|
||||
// src/corporate/aggregation.rs
|
||||
use super::types::CompanyPrice;
|
||||
use super::storage::*;
|
||||
use tokio::fs;
|
||||
use std::collections::HashMap;
|
||||
|
||||
#[derive(Debug)]
|
||||
struct DayData {
|
||||
sources: Vec<(CompanyPrice, String)>, // (price, source_ticker)
|
||||
total_volume: u64,
|
||||
vwap: f64,
|
||||
open: f64,
|
||||
high: f64,
|
||||
low: f64,
|
||||
close: f64,
|
||||
}
|
||||
|
||||
/// Aggregate price data from multiple exchanges, converting all to USD
|
||||
pub async fn aggregate_best_price_data(lei: &str) -> anyhow::Result<()> {
|
||||
let company_dir = get_company_dir(lei);
|
||||
|
||||
for timeframe in ["daily", "5min"].iter() {
|
||||
let source_dir = company_dir.join(timeframe);
|
||||
if !source_dir.exists() {
|
||||
continue;
|
||||
}
|
||||
|
||||
let mut all_prices: Vec<(CompanyPrice, String)> = Vec::new();
|
||||
let mut by_date_time: HashMap<String, DayData> = HashMap::new();
|
||||
|
||||
// Load all sources with their ticker names
|
||||
let mut entries = tokio::fs::read_dir(&source_dir).await?;
|
||||
let mut source_count = 0;
|
||||
let mut sources_used = std::collections::HashSet::new();
|
||||
|
||||
while let Some(entry) = entries.next_entry().await? {
|
||||
let source_dir_path = entry.path();
|
||||
if !source_dir_path.is_dir() { continue; }
|
||||
|
||||
let source_ticker = source_dir_path
|
||||
.file_name()
|
||||
.and_then(|n| n.to_str())
|
||||
.unwrap_or("unknown")
|
||||
.to_string();
|
||||
|
||||
let prices_path = source_dir_path.join("prices.json");
|
||||
if !prices_path.exists() { continue; }
|
||||
|
||||
let content = tokio::fs::read_to_string(&prices_path).await?;
|
||||
let mut prices: Vec<CompanyPrice> = serde_json::from_str(&content)?;
|
||||
|
||||
if !prices.is_empty() {
|
||||
sources_used.insert(source_ticker.clone());
|
||||
source_count += 1;
|
||||
}
|
||||
|
||||
for price in prices {
|
||||
all_prices.push((price, source_ticker.clone()));
|
||||
}
|
||||
}
|
||||
|
||||
if all_prices.is_empty() {
|
||||
continue;
|
||||
}
|
||||
|
||||
println!(" Aggregating from {} exchanges: {}",
|
||||
sources_used.len(),
|
||||
sources_used.iter()
|
||||
.map(|s| s.as_str())
|
||||
.collect::<Vec<_>>()
|
||||
.join(", ")
|
||||
);
|
||||
|
||||
// Group by date + time (for 5min) or just date
|
||||
for (p, source) in all_prices {
|
||||
let key = if timeframe == &"5min" && !p.time.is_empty() {
|
||||
format!("{}_{}", p.date, p.time)
|
||||
} else {
|
||||
p.date.clone()
|
||||
};
|
||||
|
||||
// Convert to USD immediately
|
||||
let usd_rate = super::fx::get_usd_rate(&p.currency).await.unwrap_or(1.0);
|
||||
|
||||
let mut p_usd = p.clone();
|
||||
p_usd.open *= usd_rate;
|
||||
p_usd.high *= usd_rate;
|
||||
p_usd.low *= usd_rate;
|
||||
p_usd.close *= usd_rate;
|
||||
p_usd.adj_close *= usd_rate;
|
||||
p_usd.currency = "USD".to_string();
|
||||
|
||||
let entry = by_date_time.entry(key.clone()).or_insert(DayData {
|
||||
sources: vec![],
|
||||
total_volume: 0,
|
||||
vwap: 0.0,
|
||||
open: p_usd.open,
|
||||
high: p_usd.high,
|
||||
low: p_usd.low,
|
||||
close: p_usd.close,
|
||||
});
|
||||
|
||||
let volume = p.volume.max(1); // avoid div0
|
||||
let vwap_contrib = p_usd.close * volume as f64;
|
||||
|
||||
entry.sources.push((p_usd.clone(), source));
|
||||
entry.total_volume += volume;
|
||||
entry.vwap += vwap_contrib;
|
||||
|
||||
// Use first open, last close, max high, min low
|
||||
if entry.sources.len() == 1 {
|
||||
entry.open = p_usd.open;
|
||||
}
|
||||
entry.close = p_usd.close;
|
||||
entry.high = entry.high.max(p_usd.high);
|
||||
entry.low = entry.low.min(p_usd.low);
|
||||
}
|
||||
|
||||
// Finalize aggregated data
|
||||
let mut aggregated: Vec<CompanyPrice> = Vec::new();
|
||||
|
||||
for (key, data) in by_date_time {
|
||||
let vwap = data.vwap / data.total_volume as f64;
|
||||
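// Worked example (illustrative): two bars for the same day from different exchanges,
//   close 10.0 USD with volume 100  -> contributes 1000 to the running sum
//   close 10.2 USD with volume 300  -> contributes 3060
// total_volume = 400, so the volume-weighted price is 4060 / 400 = 10.15 USD,
// which is what ends up in `adj_close` of the aggregated bar below.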
|
||||
let (date, time) = if key.contains('_') {
|
||||
let parts: Vec<&str> = key.split('_').collect();
|
||||
(parts[0].to_string(), parts[1].to_string())
|
||||
} else {
|
||||
(key, "".to_string())
|
||||
};
|
||||
|
||||
// Track which exchange contributed most volume
|
||||
let best_source = data.sources.iter()
|
||||
.max_by_key(|(p, _)| p.volume)
|
||||
.map(|(_, src)| src.clone())
|
||||
.unwrap_or_else(|| "unknown".to_string());
|
||||
|
||||
aggregated.push(CompanyPrice {
|
||||
ticker: format!("{lei}@agg"), // Mark as aggregated
|
||||
date,
|
||||
time,
|
||||
open: data.open,
|
||||
high: data.high,
|
||||
low: data.low,
|
||||
close: data.close,
|
||||
adj_close: vwap,
|
||||
volume: data.total_volume,
|
||||
currency: "USD".to_string(),
|
||||
});
|
||||
}
|
||||
|
||||
aggregated.sort_by_key(|p| (p.date.clone(), p.time.clone()));
|
||||
|
||||
// Save aggregated result
|
||||
let agg_dir = company_dir.join("aggregated").join(timeframe);
|
||||
fs::create_dir_all(&agg_dir).await?;
|
||||
let path = agg_dir.join("prices.json");
|
||||
fs::write(&path, serde_json::to_string_pretty(&aggregated)?).await?;
|
||||
|
||||
// Save aggregation metadata
|
||||
let meta = AggregationMetadata {
|
||||
lei: lei.to_string(),
|
||||
timeframe: timeframe.to_string(),
|
||||
sources: sources_used.into_iter().collect(),
|
||||
total_bars: aggregated.len(),
|
||||
date_range: (
|
||||
aggregated.first().map(|p| p.date.clone()).unwrap_or_default(),
|
||||
aggregated.last().map(|p| p.date.clone()).unwrap_or_default(),
|
||||
),
|
||||
aggregated_at: chrono::Local::now().format("%Y-%m-%d %H:%M:%S").to_string(),
|
||||
};
|
||||
|
||||
let meta_path = agg_dir.join("metadata.json");
|
||||
fs::write(&meta_path, serde_json::to_string_pretty(&meta)?).await?;
|
||||
|
||||
println!(" ✓ {} {} bars from {} sources (USD)",
|
||||
aggregated.len(),
|
||||
timeframe,
|
||||
source_count
|
||||
);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[derive(Debug, serde::Serialize, serde::Deserialize)]
|
||||
struct AggregationMetadata {
|
||||
lei: String,
|
||||
timeframe: String,
|
||||
sources: Vec<String>,
|
||||
total_bars: usize,
|
||||
date_range: (String, String),
|
||||
aggregated_at: String,
|
||||
}
|
||||
src/corporate/fx.rs (new file) — 51 lines

@@ -0,0 +1,51 @@
|
||||
// src/corporate/fx.rs
|
||||
use std::collections::HashMap;
|
||||
use reqwest;
|
||||
use serde_json::Value;
|
||||
use tokio::fs;
|
||||
use std::path::Path;
|
||||
|
||||
static FX_CACHE_PATH: &str = "fx_rates.json";
|
||||
|
||||
pub async fn get_usd_rate(currency: &str) -> anyhow::Result<f64> {
|
||||
if currency == "USD" {
|
||||
return Ok(1.0);
|
||||
}
|
||||
|
||||
let mut cache: HashMap<String, (f64, String)> = if Path::new(FX_CACHE_PATH).exists() {
|
||||
let content = fs::read_to_string(FX_CACHE_PATH).await?;
|
||||
serde_json::from_str(&content).unwrap_or_default()
|
||||
} else {
|
||||
HashMap::new()
|
||||
};
|
||||
|
||||
let today = chrono::Local::now().format("%Y-%m-%d").to_string();
|
||||
if let Some((rate, date)) = cache.get(currency) {
|
||||
if date == &today {
|
||||
return Ok(*rate);
|
||||
}
|
||||
}
|
||||
|
||||
let symbol = format!("{}USD=X", currency);
|
||||
let url = format!("https://query1.finance.yahoo.com/v8/finance/chart/{}?range=1d&interval=1d", symbol);
|
||||
|
||||
let json: Value = reqwest::Client::new()
|
||||
.get(&url)
|
||||
.header("User-Agent", "Mozilla/5.0")
|
||||
.send()
|
||||
.await?
|
||||
.json()
|
||||
.await?;
|
||||
|
||||
let close = json["chart"]["result"][0]["meta"]["regularMarketPrice"]
|
||||
.as_f64()
|
||||
.or_else(|| json["chart"]["result"][0]["indicators"]["quote"][0]["close"][0].as_f64())
|
||||
.unwrap_or(1.0);
|
||||
|
||||
let rate = if currency == "JPY" || currency == "KRW" { close } else { 1.0 / close }; // inverse pairs
|
||||
|
||||
cache.insert(currency.to_string(), (rate, today.clone()));
|
||||
let _ = fs::write(FX_CACHE_PATH, serde_json::to_string_pretty(&cache)?).await;
|
||||
|
||||
Ok(rate)
|
||||
}
|
||||
src/corporate/helpers.rs (new file) — 70 lines

@@ -0,0 +1,70 @@
|
||||
// src/corporate/helpers.rs
|
||||
use super::types::*;
|
||||
use chrono::{Local, NaiveDate};
|
||||
use std::collections::{HashMap, HashSet};
|
||||
|
||||
pub fn event_key(e: &CompanyEvent) -> String {
|
||||
format!("{}|{}|{}", e.ticker, e.date, e.time)
|
||||
}
|
||||
|
||||
pub fn detect_changes(old: &CompanyEvent, new: &CompanyEvent, today: &str) -> Vec<CompanyEventChange> {
|
||||
let mut changes = Vec::new();
|
||||
let ts = Local::now().format("%Y-%m-%d %H:%M:%S").to_string();
|
||||
|
||||
if new.date.as_str() <= today { return changes; }
|
||||
|
||||
if old.time != new.time {
|
||||
changes.push(CompanyEventChange {
|
||||
ticker: new.ticker.clone(),
|
||||
date: new.date.clone(),
|
||||
field_changed: "time".to_string(),
|
||||
old_value: old.time.clone(),
|
||||
new_value: new.time.clone(),
|
||||
detected_at: ts.clone(),
|
||||
});
|
||||
}
|
||||
|
||||
if old.eps_forecast != new.eps_forecast {
|
||||
changes.push(CompanyEventChange {
|
||||
ticker: new.ticker.clone(),
|
||||
date: new.date.clone(),
|
||||
field_changed: "eps_forecast".to_string(),
|
||||
old_value: format!("{:?}", old.eps_forecast),
|
||||
new_value: format!("{:?}", new.eps_forecast),
|
||||
detected_at: ts.clone(),
|
||||
});
|
||||
}
|
||||
|
||||
if old.eps_actual != new.eps_actual {
|
||||
changes.push(CompanyEventChange {
|
||||
ticker: new.ticker.clone(),
|
||||
date: new.date.clone(),
|
||||
field_changed: "eps_actual".to_string(),
|
||||
old_value: format!("{:?}", old.eps_actual),
|
||||
new_value: format!("{:?}", new.eps_actual),
|
||||
detected_at: ts.clone(),
|
||||
});
|
||||
}
|
||||
|
||||
// Add similar for revenue if applicable
|
||||
|
||||
changes
|
||||
}
|
||||
|
||||
pub fn price_key(p: &CompanyPrice) -> String {
|
||||
if p.time.is_empty() {
|
||||
format!("{}|{}", p.ticker, p.date)
|
||||
} else {
|
||||
format!("{}|{}|{}", p.ticker, p.date, p.time)
|
||||
}
|
||||
}
|
||||
|
||||
pub fn parse_float(s: &str) -> Option<f64> {
|
||||
s.replace("--", "").replace(",", "").parse::<f64>().ok()
|
||||
}
|
||||
|
||||
pub fn parse_yahoo_date(s: &str) -> anyhow::Result<NaiveDate> {
|
||||
NaiveDate::parse_from_str(s, "%B %d, %Y")
|
||||
.or_else(|_| NaiveDate::parse_from_str(s, "%b %d, %Y"))
|
||||
.map_err(|_| anyhow::anyhow!("Bad date: {s}"))
|
||||
}
|
||||
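A short sketch of how the two parsers above behave (illustrative values, assuming the module path `crate::corporate::helpers` as declared in mod.rs): `parse_float` strips thousands separators and the "--" placeholder, and `parse_yahoo_date` accepts both long and abbreviated month names.

```rust
use crate::corporate::helpers::{parse_float, parse_yahoo_date};

fn example() {
    assert_eq!(parse_float("1,234.56"), Some(1234.56));
    assert_eq!(parse_float("--"), None); // placeholder for "no value"

    let d = parse_yahoo_date("January 5, 2024").unwrap();
    assert_eq!(d.format("%Y-%m-%d").to_string(), "2024-01-05");
}
```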
src/corporate/mod.rs (new file) — 12 lines

@@ -0,0 +1,12 @@
// src/corporate/mod.rs
pub mod types;
pub mod scraper;
pub mod storage;
pub mod update;
pub mod helpers;
pub mod aggregation;
pub mod fx;
pub mod openfigi;

pub use types::*;
pub use update::run_full_update;
src/corporate/openfigi.rs (new file) — 172 lines

@@ -0,0 +1,172 @@
|
||||
// src/corporate/openfigi.rs
|
||||
use super::{types::*};
|
||||
use reqwest::Client as HttpClient;
|
||||
use reqwest::header::{HeaderMap, HeaderValue};
|
||||
use serde_json::{json, Value};
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use tokio::time::{sleep, Duration};
|
||||
use anyhow::Context;
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct OpenFigiClient {
|
||||
client: HttpClient,
|
||||
api_key: Option<String>,
|
||||
has_key: bool,
|
||||
}
|
||||
|
||||
impl OpenFigiClient {
|
||||
pub fn new() -> anyhow::Result<Self> {
|
||||
let api_key = dotenvy::var("OPENFIGI_API_KEY").ok();
|
||||
let has_key = api_key.is_some();
|
||||
|
||||
let mut builder = HttpClient::builder()
|
||||
.user_agent("Mozilla/5.0 (compatible; OpenFIGI-Rust/1.0)")
|
||||
.timeout(Duration::from_secs(30));
|
||||
|
||||
if let Some(key) = &api_key {
|
||||
let mut headers = HeaderMap::new();
|
||||
headers.insert("X-OPENFIGI-APIKEY", HeaderValue::from_str(key)?);
|
||||
builder = builder.default_headers(headers);
|
||||
}
|
||||
|
||||
let client = builder.build().context("Failed to build HTTP client")?;
|
||||
|
||||
println!(
|
||||
"OpenFIGI client initialized: {}",
|
||||
if has_key { "with API key" } else { "no key (limited mode)" }
|
||||
);
|
||||
|
||||
Ok(Self { client, api_key, has_key })
|
||||
}
|
||||
|
||||
/// Batch-map ISINs to FIGI, filtering equities only
|
||||
pub async fn map_isins_to_figi(&self, isins: &[String]) -> anyhow::Result<Vec<String>> {
|
||||
if isins.is_empty() { return Ok(vec![]); }
|
||||
|
||||
let mut all_figis = Vec::new();
|
||||
let chunk_size = if self.has_key { 100 } else { 5 };
|
||||
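// Note (assumption based on OpenFIGI's published limits): with an API key the
// mapping endpoint accepts up to 100 jobs per request; without a key the cap is
// much lower (around 10), so a chunk size of 5 leaves headroom under the anonymous limit.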
|
||||
for chunk in isins.chunks(chunk_size) {
|
||||
let jobs: Vec<Value> = chunk.iter()
|
||||
.map(|isin| json!({
|
||||
"idType": "ID_ISIN",
|
||||
"idValue": isin,
|
||||
"marketSecDes": "Equity", // Pre-filter to equities
|
||||
}))
|
||||
.collect();
|
||||
|
||||
let resp = self.client
|
||||
.post("https://api.openfigi.com/v3/mapping")
|
||||
.header("Content-Type", "application/json")
|
||||
.json(&jobs)
|
||||
.send()
|
||||
.await?;
|
||||
|
||||
let status = resp.status();
|
||||
let headers = resp.headers().clone();
|
||||
let body = resp.text().await.unwrap_or_default();
|
||||
|
||||
if status.is_client_error() || status.is_server_error() {
|
||||
if status == 401 {
|
||||
return Err(anyhow::anyhow!("Invalid OpenFIGI API key: {}", body));
|
||||
} else if status == 413 {
|
||||
return Err(anyhow::anyhow!("Payload too large—reduce chunk size: {}", body));
|
||||
} else if status == 429 {
|
||||
let reset = headers
|
||||
.get("ratelimit-reset")
|
||||
.and_then(|v| v.to_str().ok())
|
||||
.unwrap_or("10")
|
||||
.parse::<u64>()
|
||||
.unwrap_or(10);
|
||||
|
||||
println!("Rate limited—backing off {}s", reset);
|
||||
sleep(Duration::from_secs(reset.max(10))).await;
|
||||
continue;
|
||||
}
|
||||
|
||||
return Err(anyhow::anyhow!("OpenFIGI error {}: {}", status, body));
|
||||
}
|
||||
|
||||
// Parse the JSON from the body string
|
||||
let results: Vec<Value> = serde_json::from_str(&body)?;
|
||||
for (job, result) in chunk.iter().zip(results) {
|
||||
if let Some(data) = result["data"].as_array() {
|
||||
for item in data {
|
||||
let sec_type = item["securityType"].as_str().unwrap_or("");
|
||||
let market_sec = item["marketSector"].as_str().unwrap_or("");
|
||||
if market_sec == "Equity" &&
|
||||
(sec_type.contains("Stock") || sec_type.contains("Share") || sec_type.contains("Equity") ||
|
||||
sec_type.contains("Common") || sec_type.contains("Preferred") || sec_type == "ADR" || sec_type == "GDR") {
|
||||
if let Some(figi) = item["figi"].as_str() {
|
||||
all_figis.push(figi.to_string());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Rate limit respect: 6s between requests with key
|
||||
if self.has_key {
|
||||
sleep(Duration::from_secs(6)).await;
|
||||
} else {
|
||||
sleep(Duration::from_millis(500)).await; // Slower without key
|
||||
}
|
||||
}
|
||||
|
||||
all_figis.dedup(); // Unique FIGIs per LEI
|
||||
Ok(all_figis)
|
||||
}
|
||||
}
|
||||
|
||||
/// Build FIGI → LEI map from CSV, filtering equities via OpenFIGI
|
||||
pub async fn build_figi_to_lei_map(lei_to_isins: &HashMap<String, Vec<String>>) -> anyhow::Result<HashMap<String, String>> {
|
||||
let client = OpenFigiClient::new()?;
|
||||
if !client.has_key {
|
||||
println!("No API key—skipping FIGI mapping (using empty map)");
|
||||
return Ok(HashMap::new());
|
||||
}
|
||||
|
||||
let mut figi_to_lei: HashMap<String, String> = HashMap::new();
|
||||
let mut processed = 0;
|
||||
|
||||
for (lei, isins) in lei_to_isins {
|
||||
let unique_isins: Vec<_> = isins.iter().cloned().collect::<HashSet<_>>().into_iter().collect();
|
||||
let equity_figis = client.map_isins_to_figi(&unique_isins).await?;
|
||||
|
||||
for figi in equity_figis {
|
||||
figi_to_lei.insert(figi, lei.clone());
|
||||
}
|
||||
|
||||
processed += 1;
|
||||
if processed % 100 == 0 {
|
||||
println!("Processed {} LEIs → {} total equity FIGIs", processed, figi_to_lei.len());
|
||||
}
|
||||
|
||||
// Throttle per-LEI (heavy LEIs have 100s of ISINs)
|
||||
sleep(Duration::from_millis(100)).await;
|
||||
}
|
||||
|
||||
// Save full map
|
||||
let data_dir = std::path::Path::new("data");
|
||||
tokio::fs::create_dir_all(data_dir).await?;
|
||||
tokio::fs::write("data/figi_to_lei.json", serde_json::to_string_pretty(&figi_to_lei)?).await?;
|
||||
|
||||
println!("Built FIGI→LEI map: {} mappings (equity-only)", figi_to_lei.len());
|
||||
Ok(figi_to_lei)
|
||||
}
|
||||
|
||||
/// Load/build companies using FIGI as key (enriched with LEI via map)
|
||||
pub async fn load_or_build_companies_figi(
|
||||
lei_to_isins: &HashMap<String, Vec<String>>,
|
||||
figi_to_lei: &HashMap<String, String>,
|
||||
) -> anyhow::Result<Vec<CompanyMetadata>> {
|
||||
let data_dir = std::path::Path::new("data/companies_by_figi");
|
||||
tokio::fs::create_dir_all(data_dir).await?;
|
||||
|
||||
let mut companies = Vec::new();
|
||||
|
||||
|
||||
|
||||
println!("Built {} FIGI-keyed companies.", companies.len());
|
||||
Ok(companies)
|
||||
}
|
||||
src/corporate/scraper.rs (new file) — 691 lines

@@ -0,0 +1,691 @@
|
||||
// src/corporate/scraper.rs
|
||||
use super::{types::*, helpers::*};
|
||||
use csv::ReaderBuilder;
|
||||
use fantoccini::{Client, Locator};
|
||||
use scraper::{Html, Selector};
|
||||
use chrono::{DateTime, Duration, NaiveDate, Timelike, Utc};
|
||||
use tokio::{time::{Duration as TokioDuration, sleep}};
|
||||
use reqwest::Client as HttpClient;
|
||||
use serde_json::Value;
|
||||
use zip::ZipArchive;
|
||||
use std::fs::File;
|
||||
use std::{collections::HashMap};
|
||||
use std::io::{Read, BufReader};
|
||||
|
||||
const USER_AGENT: &str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36";
|
||||
|
||||
/// Discover all exchanges where this ISIN trades by querying Yahoo Finance
|
||||
pub async fn discover_available_exchanges(isin: &str, known_ticker: &str) -> anyhow::Result<Vec<TickerInfo>> {
|
||||
println!(" Discovering exchanges for ISIN {}", isin);
|
||||
|
||||
let mut discovered_tickers = Vec::new();
|
||||
|
||||
// Try the primary ticker first
|
||||
if let Ok(info) = check_ticker_exists(known_ticker).await {
|
||||
discovered_tickers.push(info);
|
||||
}
|
||||
|
||||
// Search for ISIN directly on Yahoo to find other listings
|
||||
let search_url = format!(
|
||||
"https://query2.finance.yahoo.com/v1/finance/search?q={}"esCount=20&newsCount=0",
|
||||
isin
|
||||
);
|
||||
|
||||
match HttpClient::new()
|
||||
.get(&search_url)
|
||||
.header("User-Agent", USER_AGENT)
|
||||
.send()
|
||||
.await
|
||||
{
|
||||
Ok(resp) => {
|
||||
if let Ok(json) = resp.json::<Value>().await {
|
||||
if let Some(quotes) = json["quotes"].as_array() {
|
||||
for quote in quotes {
|
||||
// First: filter by quoteType directly from search results (faster rejection)
|
||||
let quote_type = quote["quoteType"].as_str().unwrap_or("");
|
||||
if quote_type.to_uppercase() != "EQUITY" {
|
||||
continue; // Skip bonds, ETFs, mutual funds, options, etc.
|
||||
}
|
||||
|
||||
if let Some(symbol) = quote["symbol"].as_str() {
|
||||
// Avoid duplicates
|
||||
if discovered_tickers.iter().any(|t: &TickerInfo| t.ticker == symbol) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Double-check with full quote data (some search results are misleading)
|
||||
match check_ticker_exists(symbol).await {
|
||||
Ok(info) => {
|
||||
println!(" Found equity listing: {} on {} ({})",
|
||||
symbol, info.exchange_mic, info.currency);
|
||||
discovered_tickers.push(info);
|
||||
}
|
||||
Err(e) => {
|
||||
// Most common: it's not actually equity or not tradable
|
||||
// println!(" Rejected {}: {}", symbol, e);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// Be respectful to Yahoo
|
||||
sleep(TokioDuration::from_millis(120)).await;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(e) => println!(" Search API error: {}", e),
|
||||
}
|
||||
|
||||
// Also try common exchange suffixes for the base ticker
|
||||
if let Some(base) = known_ticker.split('.').next() {
|
||||
let suffixes = vec![
|
||||
"", // US
|
||||
".L", // London
|
||||
".DE", // Frankfurt/XETRA
|
||||
".PA", // Paris
|
||||
".AS", // Amsterdam
|
||||
".MI", // Milan
|
||||
".SW", // Switzerland
|
||||
".T", // Tokyo
|
||||
".HK", // Hong Kong
|
||||
".SS", // Shanghai
|
||||
".SZ", // Shenzhen
|
||||
".TO", // Toronto
|
||||
".AX", // Australia
|
||||
".SA", // Brazil
|
||||
".MC", // Madrid
|
||||
".BO", // Bombay
|
||||
".NS", // National Stock Exchange India
|
||||
];
|
||||
|
||||
for suffix in suffixes {
|
||||
let test_ticker = format!("{}{}", base, suffix);
|
||||
|
||||
// Skip if already found
|
||||
if discovered_tickers.iter().any(|t| t.ticker == test_ticker) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if let Ok(info) = check_ticker_exists(&test_ticker).await {
|
||||
discovered_tickers.push(info);
|
||||
sleep(TokioDuration::from_millis(100)).await;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
println!(" Found {} tradable exchanges", discovered_tickers.len());
|
||||
Ok(discovered_tickers)
|
||||
}
|
||||
|
||||
/// Check if a ticker exists and get its exchange/currency info
|
||||
async fn check_ticker_exists(ticker: &str) -> anyhow::Result<TickerInfo> {
|
||||
let url = format!(
|
||||
"https://query1.finance.yahoo.com/v10/finance/quoteSummary/{}?modules=price",
|
||||
ticker
|
||||
);
|
||||
|
||||
let resp = HttpClient::new()
|
||||
.get(&url)
|
||||
.header("User-Agent", USER_AGENT)
|
||||
.send()
|
||||
.await?;
|
||||
|
||||
let json: Value = resp.json().await?;
|
||||
|
||||
if let Some(result) = json["quoteSummary"]["result"].as_array() {
|
||||
if result.is_empty() {
|
||||
return Err(anyhow::anyhow!("No quote data for {}", ticker));
|
||||
}
|
||||
|
||||
let quote = &result[0]["price"];
|
||||
|
||||
// CRITICAL: Only accept EQUITY securities
|
||||
let quote_type = quote["quoteType"]
|
||||
.as_str()
|
||||
.unwrap_or("")
|
||||
.to_uppercase();
|
||||
|
||||
if quote_type != "EQUITY" {
|
||||
// Optional: debug what was filtered
|
||||
println!(" → Skipping {} (quoteType: {})", ticker, quote_type);
|
||||
return Err(anyhow::anyhow!("Not an equity: {}", quote_type));
|
||||
}
|
||||
|
||||
let exchange = quote["exchange"].as_str().unwrap_or("");
|
||||
let currency = quote["currency"].as_str().unwrap_or("USD");
|
||||
let short_name = quote["shortName"].as_str().unwrap_or("");
|
||||
|
||||
// Optional: extra sanity — make sure it's not a bond masquerading as equity
|
||||
if short_name.to_uppercase().contains("BOND") ||
|
||||
short_name.to_uppercase().contains("NOTE") ||
|
||||
short_name.to_uppercase().contains("DEBENTURE") {
|
||||
return Err(anyhow::anyhow!("Name suggests debt security"));
|
||||
}
|
||||
|
||||
if !exchange.is_empty() {
|
||||
return Ok(TickerInfo {
|
||||
ticker: ticker.to_string(),
|
||||
exchange_mic: exchange.to_string(),
|
||||
currency: currency.to_string(),
|
||||
primary: false,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
Err(anyhow::anyhow!("Invalid or missing data for {}", ticker))
|
||||
}
|
||||
|
||||
/// Convert Yahoo's exchange name to MIC code (best effort)
|
||||
fn exchange_name_to_mic(name: &str) -> String {
|
||||
match name {
|
||||
"NMS" | "NasdaqGS" | "NASDAQ" => "XNAS",
|
||||
"NYQ" | "NYSE" => "XNYS",
|
||||
"LSE" | "London" => "XLON",
|
||||
"FRA" | "Frankfurt" | "GER" | "XETRA" => "XFRA",
|
||||
"PAR" | "Paris" => "XPAR",
|
||||
"AMS" | "Amsterdam" => "XAMS",
|
||||
"MIL" | "Milan" => "XMIL",
|
||||
"JPX" | "Tokyo" => "XJPX",
|
||||
"HKG" | "Hong Kong" => "XHKG",
|
||||
"SHH" | "Shanghai" => "XSHG",
|
||||
"SHZ" | "Shenzhen" => "XSHE",
|
||||
"TOR" | "Toronto" => "XTSE",
|
||||
"ASX" | "Australia" => "XASX",
|
||||
"SAU" | "Saudi" => "XSAU",
|
||||
"SWX" | "Switzerland" => "XSWX",
|
||||
"BSE" | "Bombay" => "XBSE",
|
||||
"NSE" | "NSI" => "XNSE",
|
||||
"TAI" | "Taiwan" => "XTAI",
|
||||
"SAO" | "Sao Paulo" => "BVMF",
|
||||
"MCE" | "Madrid" => "XMAD",
|
||||
_ => name, // Fallback to name itself
|
||||
}.to_string()
|
||||
}
|
||||
|
||||
pub async fn dismiss_yahoo_consent(client: &Client) -> anyhow::Result<()> {
|
||||
let script = r#"
|
||||
(() => {
|
||||
const agree = document.querySelector('button[name="agree"]');
|
||||
if (agree) {
|
||||
agree.click();
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
})()
|
||||
"#;
|
||||
|
||||
for _ in 0..10 {
|
||||
let done: bool = client.execute(script, vec![]).await?.as_bool().unwrap_or(false);
|
||||
if done {
|
||||
break;
|
||||
}
|
||||
sleep(TokioDuration::from_millis(500)).await;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn fetch_earnings_history(client: &Client, ticker: &str) -> anyhow::Result<Vec<CompanyEvent>> {
|
||||
let url = format!("https://finance.yahoo.com/calendar/earnings?symbol={}&offset=0&size=100", ticker);
|
||||
client.goto(&url).await?;
|
||||
dismiss_yahoo_consent(client).await?;
|
||||
|
||||
loop {
|
||||
match client.find(Locator::XPath(r#"//button[contains(text(), 'Show More')]"#)).await {
|
||||
Ok(btn) => {
|
||||
btn.click().await?;
|
||||
sleep(TokioDuration::from_secs(2)).await;
|
||||
}
|
||||
Err(_) => break,
|
||||
}
|
||||
}
|
||||
|
||||
let html = client.source().await?;
|
||||
let document = Html::parse_document(&html);
|
||||
let row_sel = Selector::parse("table tbody tr").unwrap();
|
||||
let mut events = Vec::new();
|
||||
|
||||
for row in document.select(&row_sel) {
|
||||
let cols: Vec<String> = row.select(&Selector::parse("td").unwrap())
|
||||
.map(|td| td.text().collect::<Vec<_>>().join(" ").trim().to_string())
|
||||
.collect();
|
||||
if cols.len() < 6 { continue; }
|
||||
|
||||
let full_date = &cols[2];
|
||||
let parts: Vec<&str> = full_date.split(" at ").collect();
|
||||
let raw_date = parts[0].trim();
|
||||
let time_str = if parts.len() > 1 { parts[1].trim() } else { "" };
|
||||
|
||||
let date = match parse_yahoo_date(raw_date) {
|
||||
Ok(d) => d,
|
||||
Err(_) => continue,
|
||||
};
|
||||
|
||||
let eps_forecast = parse_float(&cols[3]);
|
||||
let eps_actual = if cols[4] == "-" { None } else { parse_float(&cols[4]) };
|
||||
|
||||
let surprise_pct = if let (Some(f), Some(a)) = (eps_forecast, eps_actual) {
|
||||
if f.abs() > 0.001 { Some((a - f) / f.abs() * 100.0) } else { None }
|
||||
} else { None };
|
||||
|
||||
let time = if time_str.contains("PM") {
|
||||
"AMC".to_string()
|
||||
} else if time_str.contains("AM") {
|
||||
"BMO".to_string()
|
||||
} else {
|
||||
"".to_string()
|
||||
};
|
||||
|
||||
events.push(CompanyEvent {
|
||||
ticker: ticker.to_string(),
|
||||
date: date.format("%Y-%m-%d").to_string(),
|
||||
time,
|
||||
period: "".to_string(),
|
||||
eps_forecast,
|
||||
eps_actual,
|
||||
revenue_forecast: None,
|
||||
revenue_actual: None,
|
||||
surprise_pct,
|
||||
source: "Yahoo".to_string(),
|
||||
});
|
||||
}
|
||||
|
||||
Ok(events)
|
||||
}
|
||||
|
||||
fn parse_price(v: Option<&Value>) -> f64 {
|
||||
v.and_then(|x| x.as_str())
|
||||
.and_then(|s| s.replace('$', "").replace(',', "").parse::<f64>().ok())
|
||||
.or_else(|| v.and_then(|x| x.as_f64()))
|
||||
.unwrap_or(0.0)
|
||||
}
|
||||
|
||||
fn parse_volume(v: Option<&Value>) -> u64 {
|
||||
v.and_then(|x| x.as_str())
|
||||
.and_then(|s| s.replace(',', "").parse::<u64>().ok())
|
||||
.or_else(|| v.and_then(|x| x.as_u64()))
|
||||
.unwrap_or(0)
|
||||
}
|
||||
|
||||
pub async fn fetch_daily_price_history(
|
||||
ticker: &str,
|
||||
start_str: &str,
|
||||
end_str: &str,
|
||||
) -> anyhow::Result<Vec<CompanyPrice>> {
|
||||
let start = NaiveDate::parse_from_str(start_str, "%Y-%m-%d")?;
|
||||
let end = NaiveDate::parse_from_str(end_str, "%Y-%m-%d")? + Duration::days(1);
|
||||
|
||||
let mut all_prices = Vec::new();
|
||||
let mut current = start;
|
||||
|
||||
while current < end {
|
||||
let chunk_end = current + Duration::days(730);
|
||||
let actual_end = chunk_end.min(end);
|
||||
|
||||
let period1 = current.and_hms_opt(0, 0, 0).unwrap().and_utc().timestamp();
|
||||
let period2 = actual_end.and_hms_opt(0, 0, 0).unwrap().and_utc().timestamp();
|
||||
|
||||
println!(" Fetching {ticker} {} → {}", current, actual_end - Duration::days(1));
|
||||
|
||||
let url = format!(
|
||||
"https://query1.finance.yahoo.com/v8/finance/chart/{ticker}?period1={period1}&period2={period2}&interval=1d&includeAdjustedClose=true"
|
||||
);
|
||||
|
||||
let json: Value = HttpClient::new()
|
||||
.get(&url)
|
||||
.header("User-Agent", USER_AGENT)
|
||||
.send()
|
||||
.await?
|
||||
.json()
|
||||
.await?;
|
||||
|
||||
let result = &json["chart"]["result"][0];
|
||||
let timestamps = result["timestamp"].as_array().ok_or_else(|| anyhow::anyhow!("No timestamps"))?;
|
||||
let quote = &result["indicators"]["quote"][0];
|
||||
let meta = &result["meta"];
|
||||
let currency = meta["currency"].as_str().unwrap_or("USD").to_string();
|
||||
|
||||
let opens = quote["open"].as_array();
|
||||
let highs = quote["high"].as_array();
|
||||
let lows = quote["low"].as_array();
|
||||
let closes = quote["close"].as_array();
|
||||
let adj_closes = result["indicators"]["adjclose"][0]["adjclose"].as_array()
|
||||
.or_else(|| closes);
|
||||
let volumes = quote["volume"].as_array();
|
||||
|
||||
for (i, ts_val) in timestamps.iter().enumerate() {
|
||||
let ts = ts_val.as_i64().unwrap_or(0);
|
||||
let dt: DateTime<Utc> = DateTime::from_timestamp(ts, 0).unwrap_or_default();
|
||||
let date_str = dt.format("%Y-%m-%d").to_string();
|
||||
|
||||
if date_str < start_str.to_string() || date_str > end_str.to_string() {
|
||||
continue;
|
||||
}
|
||||
|
||||
let open = parse_price(opens.and_then(|a| a.get(i)));
|
||||
let high = parse_price(highs.and_then(|a| a.get(i)));
|
||||
let low = parse_price(lows.and_then(|a| a.get(i)));
|
||||
let close = parse_price(closes.and_then(|a| a.get(i)));
|
||||
let adj_close = parse_price(adj_closes.and_then(|a| a.get(i)));
|
||||
let volume = parse_volume(volumes.and_then(|a| a.get(i)));
|
||||
|
||||
all_prices.push(CompanyPrice {
|
||||
ticker: ticker.to_string(),
|
||||
date: date_str,
|
||||
time: "".to_string(),
|
||||
open,
|
||||
high,
|
||||
low,
|
||||
close,
|
||||
adj_close,
|
||||
volume,
|
||||
currency: currency.clone(),
|
||||
});
|
||||
}
|
||||
|
||||
sleep(TokioDuration::from_millis(200)).await;
|
||||
current = actual_end;
|
||||
}
|
||||
|
||||
all_prices.sort_by_key(|p| (p.date.clone(), p.time.clone()));
|
||||
all_prices.dedup_by(|a, b| a.date == b.date && a.time == b.time);
|
||||
|
||||
println!(" Got {} daily bars for {ticker}", all_prices.len());
|
||||
Ok(all_prices)
|
||||
}
|
||||
|
||||
pub async fn fetch_price_history_5min(
|
||||
ticker: &str,
|
||||
_start: &str,
|
||||
_end: &str,
|
||||
) -> anyhow::Result<Vec<CompanyPrice>> {
|
||||
let now = Utc::now().timestamp();
|
||||
let period1 = now - 5184000;
|
||||
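// 5_184_000 seconds = 60 days. Yahoo only serves 5-minute bars for roughly the
// last 60 days (assumption based on observed API behaviour), which is why the
// _start/_end arguments are ignored for this timeframe.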
let period2 = now;
|
||||
|
||||
let url = format!(
|
||||
"https://query1.finance.yahoo.com/v8/finance/chart/{ticker}?period1={period1}&period2={period2}&interval=5m&includeAdjustedClose=true"
|
||||
);
|
||||
|
||||
let json: Value = HttpClient::new()
|
||||
.get(&url)
|
||||
.header("User-Agent", USER_AGENT)
|
||||
.send()
|
||||
.await?
|
||||
.json()
|
||||
.await?;
|
||||
|
||||
let result = &json["chart"]["result"][0];
|
||||
let timestamps = result["timestamp"].as_array().ok_or_else(|| anyhow::anyhow!("No timestamps"))?;
|
||||
let quote = &result["indicators"]["quote"][0];
|
||||
let meta = &result["meta"];
|
||||
let currency = meta["currency"].as_str().unwrap_or("USD").to_string();
|
||||
|
||||
let mut prices = Vec::new();
|
||||
|
||||
for (i, ts_val) in timestamps.iter().enumerate() {
|
||||
let ts = ts_val.as_i64().unwrap_or(0);
|
||||
let dt: DateTime<Utc> = DateTime::from_timestamp(ts, 0).unwrap_or_default();
|
||||
let date_str = dt.format("%Y-%m-%d").to_string();
|
||||
let time_str = dt.format("%H:%M:%S").to_string();
|
||||
|
||||
let open = parse_price(quote["open"].as_array().and_then(|a| a.get(i)));
|
||||
let high = parse_price(quote["high"].as_array().and_then(|a| a.get(i)));
|
||||
let low = parse_price(quote["low"].as_array().and_then(|a| a.get(i)));
|
||||
let close = parse_price(quote["close"].as_array().and_then(|a| a.get(i)));
|
||||
let volume = parse_volume(quote["volume"].as_array().and_then(|a| a.get(i)));
|
||||
|
||||
prices.push(CompanyPrice {
|
||||
ticker: ticker.to_string(),
|
||||
date: date_str,
|
||||
time: time_str,
|
||||
open,
|
||||
high,
|
||||
low,
|
||||
close,
|
||||
adj_close: close,
|
||||
volume,
|
||||
currency: currency.clone(),
|
||||
});
|
||||
}
|
||||
|
||||
prices.sort_by_key(|p| (p.date.clone(), p.time.clone()));
|
||||
Ok(prices)
|
||||
}
|
||||
|
||||
/// Fetch the URL of the latest ISIN↔LEI mapping CSV from GLEIF
|
||||
/// Overengineered; we could just use the static URL, but this shows how to scrape if needed
|
||||
pub async fn _fetch_latest_gleif_isin_lei_mapping_url(client: &Client) -> anyhow::Result<String> {
|
||||
let url = format!("https://www.gleif.org/de/lei-data/lei-mapping/download-isin-to-lei-relationship-files");
|
||||
client.goto(&url).await?;
|
||||
|
||||
let html = client.source().await?;
|
||||
let _document = Html::parse_document(&html);
|
||||
let _row_sel = Selector::parse("table tbody tr").unwrap();
|
||||
let isin_lei = "".to_string();
|
||||
|
||||
Ok(isin_lei)
|
||||
}
|
||||
|
||||
pub async fn download_isin_lei_csv() -> anyhow::Result<Option<String>> {
|
||||
let url = "https://mapping.gleif.org/api/v2/isin-lei/9315e3e3-305a-4e71-b062-46714740fa8d/download";
|
||||
let zip_path = "data/isin_lei.zip";
|
||||
let csv_path = "data/isin_lei.csv";
|
||||
|
||||
if let Err(e) = std::fs::create_dir_all("data") {
|
||||
println!("Failed to create data directory: {e}");
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
// Download ZIP
|
||||
let bytes = match reqwest::Client::builder()
|
||||
.user_agent(USER_AGENT)
|
||||
.timeout(std::time::Duration::from_secs(30))
|
||||
.build()
|
||||
.and_then(|c| Ok(c))
|
||||
{
|
||||
Ok(client) => match client.get(url).send().await {
|
||||
Ok(resp) if resp.status().is_success() => match resp.bytes().await {
|
||||
Ok(b) => b,
|
||||
Err(e) => {
|
||||
println!("Failed to read ZIP bytes: {e}");
|
||||
return Ok(None);
|
||||
}
|
||||
},
|
||||
Ok(resp) => {
|
||||
println!("Server returned HTTP {}", resp.status());
|
||||
return Ok(None);
|
||||
}
|
||||
Err(e) => {
|
||||
println!("Failed to download ISIN/LEI ZIP: {e}");
|
||||
return Ok(None);
|
||||
}
|
||||
},
|
||||
Err(e) => {
|
||||
println!("Failed to create HTTP client: {e}");
|
||||
return Ok(None);
|
||||
}
|
||||
};
|
||||
|
||||
if let Err(e) = tokio::fs::write(zip_path, &bytes).await {
|
||||
println!("Failed to write ZIP file: {e}");
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
// Extract CSV
|
||||
let archive = match std::fs::File::open(zip_path)
|
||||
.map(ZipArchive::new)
|
||||
{
|
||||
Ok(Ok(a)) => a,
|
||||
Ok(Err(e)) => {
|
||||
println!("Invalid ZIP: {e}");
|
||||
return Ok(None);
|
||||
}
|
||||
Err(e) => {
|
||||
println!("Cannot open ZIP file: {e}");
|
||||
return Ok(None);
|
||||
}
|
||||
};
|
||||
|
||||
let mut archive = archive;
|
||||
|
||||
let idx = match (0..archive.len()).find(|&i| {
|
||||
archive.by_index(i)
|
||||
.map(|f| f.name().ends_with(".csv"))
|
||||
.unwrap_or(false)
|
||||
}) {
|
||||
Some(i) => i,
|
||||
None => {
|
||||
println!("ZIP did not contain a CSV file");
|
||||
return Ok(None);
|
||||
}
|
||||
};
|
||||
|
||||
let mut csv_file = match archive.by_index(idx) {
|
||||
Ok(f) => f,
|
||||
Err(e) => {
|
||||
println!("Failed to read CSV entry: {e}");
|
||||
return Ok(None);
|
||||
}
|
||||
};
|
||||
|
||||
let mut csv_bytes = Vec::new();
|
||||
if let Err(e) = csv_file.read_to_end(&mut csv_bytes) {
|
||||
println!("Failed to extract CSV: {e}");
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
if let Err(e) = tokio::fs::write(csv_path, &csv_bytes).await {
|
||||
println!("Failed to save CSV file: {e}");
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
Ok(Some(csv_path.to_string()))
|
||||
}
|
||||
|
||||
|
||||
pub async fn load_isin_lei_csv() -> anyhow::Result<HashMap<String, Vec<String>>> {
|
||||
// 1. Download + extract the CSV (this is now async)
|
||||
let csv_path = match download_isin_lei_csv().await? {
|
||||
Some(p) => p,
|
||||
None => {
|
||||
println!("ISIN/LEI download failed; continuing with empty map");
|
||||
return Ok(HashMap::new());
|
||||
}
|
||||
};
|
||||
|
||||
// 2. Open and parse the CSV synchronously (fast enough, ~8M lines is fine)
|
||||
let file = match std::fs::File::open(&csv_path) {
|
||||
Ok(f) => f,
|
||||
Err(e) => {
|
||||
println!("Cannot open CSV '{}': {}", csv_path, e);
|
||||
return Ok(HashMap::new());
|
||||
}
|
||||
};
|
||||
|
||||
let mut rdr = csv::ReaderBuilder::new()
|
||||
.has_headers(false)
|
||||
.from_reader(std::io::BufReader::new(file));
|
||||
|
||||
let mut map: HashMap<String, Vec<String>> = HashMap::new();
|
||||
|
||||
for result in rdr.records() {
|
||||
let record = match result {
|
||||
Ok(r) => r,
|
||||
Err(e) => {
|
||||
println!("CSV parse error: {}", e);
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
if record.len() < 2 { continue; }
|
||||
|
||||
let lei = record[0].to_string();
|
||||
let isin = record[1].to_string();
|
||||
map.entry(lei).or_default().push(isin);
|
||||
}
|
||||
|
||||
println!("Loaded ISIN↔LEI map with {} LEIs and {} total ISINs",
|
||||
map.len(),
|
||||
map.values().map(|v| v.len()).sum::<usize>()
|
||||
);
|
||||
|
||||
Ok(map)
|
||||
}
|
||||
|
||||
pub async fn get_primary_isin_and_name(
|
||||
client: &Client, // Pass your existing Selenium client
|
||||
ticker: &str,
|
||||
) -> anyhow::Result<PrimaryInfo> {
|
||||
// Navigate to the actual quote page (always works)
|
||||
let quote_url = format!("https://finance.yahoo.com/quote/{}", ticker);
|
||||
client.goto(&quote_url).await?;
|
||||
|
||||
// Dismiss overlays/banners (your function + guce-specific)
|
||||
reject_yahoo_cookies(client).await?;
|
||||
|
||||
// Wait for page to load (key data elements)
|
||||
sleep(TokioDuration::from_millis(2000)).await;
|
||||
|
||||
// Get page HTML and parse
|
||||
let html = client.source().await?;
|
||||
let document = Html::parse_document(&html);
|
||||
|
||||
// Selectors for key fields (tested on real Yahoo pages Nov 2025)
|
||||
let name_sel = Selector::parse("h1[data-testid='qsp-price-header']").unwrap_or_else(|_| Selector::parse("h1").unwrap());
|
||||
let isin_sel = Selector::parse("[data-testid='qsp-symbol'] + div [data-field='isin']").unwrap_or_else(|_| Selector::parse("[data-field='isin']").unwrap());
|
||||
let exchange_sel = Selector::parse("[data-testid='qsp-market'] span").unwrap_or_else(|_| Selector::parse(".TopNav__Exchange").unwrap());
|
||||
let currency_sel = Selector::parse("[data-testid='qsp-price'] span:contains('USD')").unwrap_or_else(|_| Selector::parse(".TopNav__Currency").unwrap()); // Adjust for dynamic
|
||||
|
||||
let name_elem = document.select(&name_sel).next().map(|e| e.text().collect::<String>().trim().to_string());
|
||||
let isin_elem = document.select(&isin_sel).next().map(|e| e.text().collect::<String>().trim().to_uppercase());
|
||||
let exchange_elem = document.select(&exchange_sel).next().map(|e| e.text().collect::<String>().trim().to_string());
|
||||
let currency_elem = document.select(&currency_sel).next().map(|e| e.text().collect::<String>().trim().to_string());
|
||||
|
||||
let name = name_elem.unwrap_or_else(|| ticker.to_string());
|
||||
let isin = isin_elem.unwrap_or_default();
|
||||
let exchange_mic = exchange_elem.unwrap_or_default();
|
||||
let currency = currency_elem.unwrap_or_else(|| "USD".to_string());
|
||||
|
||||
// Validate ISIN
|
||||
let valid_isin = if isin.len() == 12 && isin.chars().all(|c| c.is_alphanumeric()) {
|
||||
isin
|
||||
} else {
|
||||
"".to_string()
|
||||
};
|
||||
|
||||
println!(" → Scraped {}: {} | ISIN: {} | Exchange: {}", ticker, name, valid_isin, exchange_mic);
|
||||
|
||||
Ok(PrimaryInfo {
|
||||
isin: valid_isin,
|
||||
name,
|
||||
exchange_mic,
|
||||
currency,
|
||||
})
|
||||
}
|
||||
|
||||
pub async fn reject_yahoo_cookies(client: &Client) -> anyhow::Result<()> {
|
||||
for _ in 0..10 {
|
||||
let clicked: bool = client
|
||||
.execute(
|
||||
r#"(() => {
|
||||
const btn = document.querySelector('#consent-page .reject-all');
|
||||
if (btn) {
|
||||
btn.click();
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
})()"#,
|
||||
vec![],
|
||||
)
|
||||
.await?
|
||||
.as_bool()
|
||||
.unwrap_or(false);
|
||||
|
||||
if clicked { break; }
|
||||
sleep(TokioDuration::from_millis(500)).await;
|
||||
}
|
||||
|
||||
println!("Rejected Yahoo cookies if button existed");
|
||||
Ok(())
|
||||
}
|
||||
src/corporate/storage.rs (new file) — 252 lines

@@ -0,0 +1,252 @@
|
||||
// src/corporate/storage.rs
|
||||
use super::{types::*, helpers::*, scraper::get_primary_isin_and_name};
|
||||
use crate::config;
|
||||
|
||||
use tokio::fs;
|
||||
use chrono::{Datelike, NaiveDate};
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
pub async fn load_existing_events() -> anyhow::Result<HashMap<String, CompanyEvent>> {
|
||||
let mut map = HashMap::new();
|
||||
let dir = std::path::Path::new("corporate_events");
|
||||
if !dir.exists() {
|
||||
return Ok(map);
|
||||
}
|
||||
|
||||
let mut entries = fs::read_dir(dir).await?;
|
||||
while let Some(entry) = entries.next_entry().await? {
|
||||
let path = entry.path();
|
||||
if path.extension().and_then(|s| s.to_str()) == Some("json") {
|
||||
let name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
|
||||
if name.starts_with("events_") && name.len() == 17 {
|
||||
let content = fs::read_to_string(&path).await?;
|
||||
let events: Vec<CompanyEvent> = serde_json::from_str(&content)?;
|
||||
for event in events {
|
||||
map.insert(event_key(&event), event);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(map)
|
||||
}
|
||||
|
||||
pub async fn save_optimized_events(events: HashMap<String, CompanyEvent>) -> anyhow::Result<()> {
|
||||
let dir = std::path::Path::new("corporate_events");
|
||||
fs::create_dir_all(dir).await?;
|
||||
|
||||
let mut entries = fs::read_dir(dir).await?;
|
||||
while let Some(entry) = entries.next_entry().await? {
|
||||
let path = entry.path();
|
||||
let name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
|
||||
if name.starts_with("events_") && path.extension().map(|e| e == "json").unwrap_or(false) {
|
||||
fs::remove_file(&path).await?;
|
||||
}
|
||||
}
|
||||
|
||||
let mut sorted: Vec<_> = events.into_values().collect();
|
||||
sorted.sort_by_key(|e| (e.ticker.clone(), e.date.clone()));
|
||||
|
||||
let mut by_month: HashMap<String, Vec<CompanyEvent>> = HashMap::new();
|
||||
for e in sorted {
|
||||
if let Ok(d) = NaiveDate::parse_from_str(&e.date, "%Y-%m-%d") {
|
||||
let key = format!("{}-{:02}", d.year(), d.month());
|
||||
by_month.entry(key).or_default().push(e);
|
||||
}
|
||||
}
|
||||
|
||||
for (month, list) in by_month {
|
||||
let path = dir.join(format!("events_{}.json", month));
|
||||
fs::write(&path, serde_json::to_string_pretty(&list)?).await?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn save_changes(changes: &[CompanyEventChange]) -> anyhow::Result<()> {
|
||||
if changes.is_empty() { return Ok(()); }
|
||||
let dir = std::path::Path::new("corporate_event_changes");
|
||||
fs::create_dir_all(dir).await?;
|
||||
|
||||
let mut by_month: HashMap<String, Vec<CompanyEventChange>> = HashMap::new();
|
||||
for c in changes {
|
||||
if let Ok(d) = NaiveDate::parse_from_str(&c.date, "%Y-%m-%d") {
|
||||
let key = format!("{}-{:02}", d.year(), d.month());
|
||||
by_month.entry(key).or_default().push(c.clone());
|
||||
}
|
||||
}
|
||||
|
||||
for (month, list) in by_month {
|
||||
let path = dir.join(format!("changes_{}.json", month));
|
||||
let mut all = if path.exists() {
|
||||
let s = fs::read_to_string(&path).await?;
|
||||
serde_json::from_str(&s).unwrap_or_default()
|
||||
} else { vec![] };
|
||||
all.extend(list);
|
||||
fs::write(&path, serde_json::to_string_pretty(&all)?).await?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn save_prices_for_ticker(ticker: &str, timeframe: &str, mut prices: Vec<CompanyPrice>) -> anyhow::Result<()> {
|
||||
let base_dir = Path::new("corporate_prices");
|
||||
let company_dir = base_dir.join(ticker.replace(".", "_"));
|
||||
let timeframe_dir = company_dir.join(timeframe);
|
||||
|
||||
fs::create_dir_all(&timeframe_dir).await?;
|
||||
let path = timeframe_dir.join("prices.json");
|
||||
|
||||
prices.sort_by_key(|p| (p.date.clone(), p.time.clone()));
|
||||
|
||||
let json = serde_json::to_string_pretty(&prices)?;
|
||||
fs::write(&path, json).await?;
|
||||
Ok(())
|
||||
}
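// Hedged sketch (not part of the diff): the on-disk layout produced by
// save_prices_for_ticker, e.g. for ticker "7203.T" and timeframe "5min":
//
//   corporate_prices/7203_T/5min/prices.json   // '.' in the ticker becomes '_'
//
// Rows inside prices.json are sorted by (date, time) before being written.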
|
||||
|
||||
pub async fn _load_companies() -> Result<Vec<CompanyMetadata>, anyhow::Error> {
|
||||
let path = Path::new("src/data/companies.json");
|
||||
if !path.exists() {
|
||||
println!("Missing companies.json file at src/data/companies.json");
|
||||
return Ok(vec![]);
|
||||
}
|
||||
let content = fs::read_to_string(path).await?;
|
||||
let companies: Vec<CompanyMetadata> = serde_json::from_str(&content)?;
|
||||
Ok(companies)
|
||||
}
|
||||
|
||||
pub fn get_company_dir(lei: &str) -> PathBuf {
|
||||
PathBuf::from("corporate_prices").join(lei)
|
||||
}
|
||||
|
||||
pub async fn ensure_company_dirs(isin: &str) -> anyhow::Result<()> {
|
||||
let base = get_company_dir(isin);
|
||||
let paths = [
|
||||
base.clone(),
|
||||
base.join("5min"),
|
||||
base.join("daily"),
|
||||
base.join("aggregated").join("5min"),
|
||||
base.join("aggregated").join("daily"),
|
||||
];
|
||||
for p in paths {
|
||||
fs::create_dir_all(&p).await?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn save_company_metadata(company: &CompanyMetadata) -> anyhow::Result<()> {
|
||||
let dir = get_company_dir(&company.lei);
|
||||
fs::create_dir_all(&dir).await?;
|
||||
let path = dir.join("metadata.json");
|
||||
fs::write(&path, serde_json::to_string_pretty(company)?).await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn load_company_metadata(lei: &str) -> anyhow::Result<CompanyMetadata> {
|
||||
let path = get_company_dir(lei).join("metadata.json");
|
||||
let content = fs::read_to_string(path).await?;
|
||||
Ok(serde_json::from_str(&content)?)
|
||||
}
|
||||
|
||||
pub async fn save_available_exchanges(isin: &str, exchanges: Vec<AvailableExchange>) -> anyhow::Result<()> {
|
||||
let dir = get_company_dir(isin);
|
||||
fs::create_dir_all(&dir).await?;
|
||||
let path = dir.join("available_exchanges.json");
|
||||
fs::write(&path, serde_json::to_string_pretty(&exchanges)?).await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn load_available_exchanges(lei: &str) -> anyhow::Result<Vec<AvailableExchange>> {
|
||||
let path = get_company_dir(lei).join("available_exchanges.json");
|
||||
if path.exists() {
|
||||
let content = fs::read_to_string(&path).await?;
|
||||
Ok(serde_json::from_str(&content)?)
|
||||
} else {
|
||||
Ok(vec![])
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn save_prices_by_source(
|
||||
lei: &str,
|
||||
source_ticker: &str,
|
||||
timeframe: &str,
|
||||
prices: Vec<CompanyPrice>,
|
||||
) -> anyhow::Result<()> {
|
||||
let source_safe = source_ticker.replace(".", "_").replace("/", "_");
|
||||
let dir = get_company_dir(lei).join(timeframe).join(&source_safe);
|
||||
fs::create_dir_all(&dir).await?;
|
||||
let path = dir.join("prices.json");
|
||||
let mut prices = prices;
|
||||
prices.sort_by_key(|p| (p.date.clone(), p.time.clone()));
|
||||
fs::write(&path, serde_json::to_string_pretty(&prices)?).await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Update available_exchanges.json with fetch results
|
||||
pub async fn update_available_exchange(
|
||||
isin: &str,
|
||||
ticker: &str,
|
||||
exchange_mic: &str,
|
||||
has_daily: bool,
|
||||
has_5min: bool,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut exchanges = load_available_exchanges(isin).await?;
|
||||
|
||||
if let Some(entry) = exchanges.iter_mut().find(|e| e.ticker == ticker) {
|
||||
// Update existing entry
|
||||
entry.record_success(has_daily, has_5min);
|
||||
} else {
|
||||
// Create new entry - need to get currency from somewhere
|
||||
// Try to infer from the ticker or use a default
|
||||
let currency = infer_currency_from_ticker(ticker);
|
||||
let mut new_entry = AvailableExchange::new(
|
||||
ticker.to_string(),
|
||||
exchange_mic.to_string(),
|
||||
currency,
|
||||
);
|
||||
new_entry.record_success(has_daily, has_5min);
|
||||
exchanges.push(new_entry);
|
||||
}
|
||||
|
||||
save_available_exchanges(isin, exchanges).await
|
||||
}
|
||||
|
||||
/// Add a newly discovered exchange before fetching
|
||||
pub async fn add_discovered_exchange(
|
||||
isin: &str,
|
||||
ticker_info: &TickerInfo,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut exchanges = load_available_exchanges(isin).await?;
|
||||
|
||||
// Only add if not already present
|
||||
if !exchanges.iter().any(|e| e.ticker == ticker_info.ticker) {
|
||||
let new_entry = AvailableExchange::new(
|
||||
ticker_info.ticker.clone(),
|
||||
ticker_info.exchange_mic.clone(),
|
||||
ticker_info.currency.clone(),
|
||||
);
|
||||
exchanges.push(new_entry);
|
||||
save_available_exchanges(isin, exchanges).await?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Infer currency from ticker suffix
|
||||
fn infer_currency_from_ticker(ticker: &str) -> String {
|
||||
if ticker.ends_with(".L") { return "GBP".to_string(); }
|
||||
if ticker.ends_with(".PA") { return "EUR".to_string(); }
|
||||
if ticker.ends_with(".DE") { return "EUR".to_string(); }
|
||||
if ticker.ends_with(".AS") { return "EUR".to_string(); }
|
||||
if ticker.ends_with(".MI") { return "EUR".to_string(); }
|
||||
if ticker.ends_with(".SW") { return "CHF".to_string(); }
|
||||
if ticker.ends_with(".T") { return "JPY".to_string(); }
|
||||
if ticker.ends_with(".HK") { return "HKD".to_string(); }
|
||||
if ticker.ends_with(".SS") { return "CNY".to_string(); }
|
||||
if ticker.ends_with(".SZ") { return "CNY".to_string(); }
|
||||
if ticker.ends_with(".TO") { return "CAD".to_string(); }
|
||||
if ticker.ends_with(".AX") { return "AUD".to_string(); }
|
||||
if ticker.ends_with(".SA") { return "BRL".to_string(); }
|
||||
if ticker.ends_with(".MC") { return "EUR".to_string(); }
|
||||
if ticker.ends_with(".BO") || ticker.ends_with(".NS") { return "INR".to_string(); }
|
||||
|
||||
"USD".to_string() // Default
|
||||
}
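// Hedged sketch (not part of the diff): a minimal unit test for the suffix table above,
// exercising a few representative Yahoo-style ticker suffixes and the USD default.
#[cfg(test)]
mod currency_tests {
    use super::*;

    #[test]
    fn infers_currency_from_suffix() {
        assert_eq!(infer_currency_from_ticker("VOD.L"), "GBP");
        assert_eq!(infer_currency_from_ticker("7203.T"), "JPY");
        assert_eq!(infer_currency_from_ticker("SAP.DE"), "EUR");
        assert_eq!(infer_currency_from_ticker("AAPL"), "USD"); // no suffix → default
    }
}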
|
||||
104 src/corporate/types.rs Normal file
@@ -0,0 +1,104 @@
|
||||
// src/corporate/types.rs
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
|
||||
pub struct CompanyEvent {
|
||||
pub ticker: String,
|
||||
pub date: String, // YYYY-MM-DD
|
||||
pub time: String, // "AMC", "BMO", "TAS", or ""
|
||||
pub period: String, // "Q1 2025", "FY 2024"
|
||||
pub eps_forecast: Option<f64>,
|
||||
pub eps_actual: Option<f64>,
|
||||
pub revenue_forecast: Option<f64>,
|
||||
pub revenue_actual: Option<f64>,
|
||||
pub surprise_pct: Option<f64>, // (actual - forecast) / |forecast|
|
||||
pub source: String, // "Yahoo"
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct CompanyPrice {
|
||||
pub ticker: String,
|
||||
pub date: String, // YYYY-MM-DD
|
||||
pub time: String, // HH:MM:SS for intraday, "" for daily
|
||||
pub open: f64,
|
||||
pub high: f64,
|
||||
pub low: f64,
|
||||
pub close: f64,
|
||||
pub adj_close: f64,
|
||||
pub volume: u64,
|
||||
pub currency: String,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct CompanyEventChange {
|
||||
pub ticker: String,
|
||||
pub date: String,
|
||||
pub field_changed: String, // "time", "eps_forecast", "eps_actual", "new_event"
|
||||
pub old_value: String,
|
||||
pub new_value: String,
|
||||
pub detected_at: String,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct TickerInfo {
|
||||
pub ticker: String,
|
||||
pub exchange_mic: String,
|
||||
pub currency: String,
|
||||
#[serde(default)]
pub isin: String, // ISIN belonging to this legal entity (primary + ADR + GDR)
#[serde(default)]
pub primary: bool, // marks the primary (most liquid) listing; matches the "primary" flag in companies.json and its use in update.rs
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct CompanyMetadata {
|
||||
pub lei: String,
|
||||
pub figi: Option<String>,
|
||||
pub name: String,
|
||||
pub primary_isin: String, // The most liquid / preferred one (used for folder fallback)
#[serde(default)]
pub isins: Vec<String>, // all ISINs known for this legal entity (referenced as company.isins in update.rs)
|
||||
pub tickers: Vec<TickerInfo>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct PrimaryInfo {
|
||||
pub isin: String,
|
||||
pub name: String,
|
||||
pub exchange_mic: String,
|
||||
pub currency: String,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct AvailableExchange {
|
||||
pub exchange_mic: String,
|
||||
pub ticker: String,
|
||||
pub has_daily: bool,
|
||||
pub has_5min: bool,
|
||||
pub last_successful_fetch: Option<String>, // YYYY-MM-DD
|
||||
#[serde(default)]
|
||||
pub currency: String,
|
||||
#[serde(default)]
|
||||
pub discovered_at: Option<String>, // When this exchange was first discovered
|
||||
#[serde(default)]
|
||||
pub fetch_count: u32, // How many times successfully fetched
|
||||
}
|
||||
|
||||
impl AvailableExchange {
|
||||
pub fn new(ticker: String, exchange_mic: String, currency: String) -> Self {
|
||||
Self {
|
||||
exchange_mic,
|
||||
ticker,
|
||||
has_daily: false,
|
||||
has_5min: false,
|
||||
last_successful_fetch: None,
|
||||
currency,
|
||||
discovered_at: Some(chrono::Local::now().format("%Y-%m-%d").to_string()),
|
||||
fetch_count: 0,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn record_success(&mut self, has_daily: bool, has_5min: bool) {
|
||||
let today = chrono::Local::now().format("%Y-%m-%d").to_string();
|
||||
|
||||
self.has_daily |= has_daily;
|
||||
self.has_5min |= has_5min;
|
||||
self.last_successful_fetch = Some(today);
|
||||
self.fetch_count += 1;
|
||||
}
|
||||
}
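// Hedged sketch (not part of the diff): how a discovery followed by a successful fetch
// updates the bookkeeping fields on AvailableExchange.
fn _example_available_exchange() {
    let mut ex = AvailableExchange::new("SAP.DE".into(), "XFRA".into(), "EUR".into());
    assert!(!ex.has_daily && ex.fetch_count == 0);

    ex.record_success(true, false); // daily bars fetched, no 5-minute data yet
    assert!(ex.has_daily && !ex.has_5min);
    assert_eq!(ex.fetch_count, 1);
    assert!(ex.last_successful_fetch.is_some()); // set to today's date
}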
|
||||
233 src/corporate/update.rs Normal file
@@ -0,0 +1,233 @@
|
||||
// src/corporate/update.rs
|
||||
use super::{scraper::*, storage::*, helpers::*, types::*, aggregation::*, openfigi::*};
|
||||
use crate::config::Config;
|
||||
|
||||
use chrono::Local;
|
||||
use std::collections::HashMap;
|
||||
|
||||
pub async fn run_full_update(client: &fantoccini::Client, config: &Config) -> anyhow::Result<()> {
|
||||
println!("Starting LEI-based corporate update");
|
||||
|
||||
// 1. Download fresh GLEIF ISIN↔LEI mapping on every run
|
||||
let lei_to_isins: HashMap<String, Vec<String>> = match load_isin_lei_csv().await {
|
||||
Ok(map) => map,
|
||||
Err(e) => {
|
||||
println!("Warning: Failed to load ISIN↔LEI mapping: {}", e);
|
||||
HashMap::new()
|
||||
}
|
||||
};
|
||||
|
||||
let figi_to_lei: HashMap<String, String> = match build_figi_to_lei_map(&lei_to_isins).await {
|
||||
Ok(map) => map,
|
||||
Err(e) => {
|
||||
println!("Warning: Failed to build FIGI→LEI map: {}", e);
|
||||
HashMap::new()
|
||||
}
|
||||
};
|
||||
|
||||
let today = chrono::Local::now().format("%Y-%m-%d").to_string();
|
||||
let mut existing_events = load_existing_events().await?;
|
||||
|
||||
let mut companies: Vec<CompanyMetadata> = match load_or_build_companies_figi(&lei_to_isins, &figi_to_lei).await {
|
||||
Ok(comps) => comps,
|
||||
Err(e) => {
|
||||
println!("Error loading/building company metadata: {}", e);
|
||||
return Err(e);
|
||||
}
|
||||
}; // Vec<CompanyMetadata> with lei, isins, tickers
|
||||
|
||||
for mut company in companies {
|
||||
println!("\nProcessing company: {} (LEI: {})", company.name, company.lei);
|
||||
|
||||
// === Enrich with ALL ISINs known to GLEIF (includes ADRs, GDRs, etc.) ===
|
||||
if let Some(all_isins) = lei_to_isins.get(&company.lei) {
|
||||
let mut seen = company.isins.iter().cloned().collect::<std::collections::HashSet<_>>();
|
||||
for isin in all_isins {
|
||||
if !seen.contains(isin) {
|
||||
company.isins.push(isin.clone());
|
||||
seen.insert(isin.clone());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Ensure company directory exists (now uses LEI)
|
||||
//let figi_dir = format!("data/companies_by_figi/{}/", company.primary_figi);
|
||||
ensure_company_dirs(&company.lei).await?;
|
||||
save_company_metadata(&company).await?;
|
||||
|
||||
// === STEP 1: Discover additional exchanges using each known ISIN ===
|
||||
let mut all_tickers = company.tickers.clone();
|
||||
|
||||
if let Some(primary_ticker) = company.tickers.iter().find(|t| t.primary) {
|
||||
println!(" Discovering additional exchanges across {} ISIN(s)...", company.isins.len());
|
||||
|
||||
for isin in &company.isins {
|
||||
println!(" → Checking ISIN: {}", isin);
|
||||
match discover_available_exchanges(isin, &primary_ticker.ticker).await {
|
||||
Ok(discovered) => {
|
||||
if discovered.is_empty() {
|
||||
println!(" – No new exchanges found for {}", isin);
|
||||
} else {
|
||||
for disc in discovered {
|
||||
if !all_tickers.iter().any(|t| t.ticker == disc.ticker && t.exchange_mic == disc.exchange_mic) {
|
||||
println!(" New equity listing → {} ({}) via ISIN {}",
|
||||
disc.ticker, disc.exchange_mic, isin);
|
||||
all_tickers.push(disc);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(e) => println!(" Discovery failed for {}: {}", isin, e),
|
||||
}
|
||||
tokio::time::sleep(tokio::time::Duration::from_millis(300)).await;
|
||||
}
|
||||
}
|
||||
|
||||
// Save updated metadata if we found new listings
|
||||
if all_tickers.len() > company.tickers.len() {
|
||||
company.tickers = all_tickers.clone();
|
||||
save_company_metadata(&company).await?;
|
||||
println!(" Updated metadata: {} total tickers", all_tickers.len());
|
||||
}
|
||||
|
||||
// === STEP 2: Fetch data from ALL available tickers ===
|
||||
for ticker_info in &all_tickers {
|
||||
let ticker = &ticker_info.ticker;
|
||||
println!(" → Fetching: {} ({})", ticker, ticker_info.exchange_mic);
|
||||
|
||||
let mut daily_success = false;
|
||||
let mut intraday_success = false;
|
||||
|
||||
// Earnings: only fetch from primary ticker to avoid duplicates
|
||||
if ticker_info.primary {
|
||||
if let Ok(new_events) = fetch_earnings_history(client, ticker).await {
|
||||
let result = process_batch(&new_events, &mut existing_events, &today);
|
||||
save_changes(&result.changes).await?;
|
||||
println!(" Earnings events: {}", new_events.len());
|
||||
}
|
||||
}
|
||||
|
||||
// Daily prices
|
||||
if let Ok(prices) = fetch_daily_price_history(ticker, &config.corporate_start_date, &today).await {
|
||||
if !prices.is_empty() {
|
||||
save_prices_by_source(&company.lei, ticker, "daily", prices).await?;
|
||||
daily_success = true;
|
||||
}
|
||||
}
|
||||
|
||||
// 5-minute intraday (last 60 days)
|
||||
let sixty_days_ago = (chrono::Local::now() - chrono::Duration::days(60))
|
||||
.format("%Y-%m-%d")
|
||||
.to_string();
|
||||
|
||||
if let Ok(prices) = fetch_price_history_5min(ticker, &sixty_days_ago, &today).await {
|
||||
if !prices.is_empty() {
|
||||
save_prices_by_source(&company.lei, ticker, "5min", prices).await?;
|
||||
intraday_success = true;
|
||||
}
|
||||
}
|
||||
|
||||
// Update available_exchanges.json (now under LEI folder)
|
||||
update_available_exchange(
|
||||
&company.lei,
|
||||
ticker,
|
||||
&ticker_info.exchange_mic,
|
||||
daily_success,
|
||||
intraday_success,
|
||||
).await?;
|
||||
|
||||
tokio::time::sleep(tokio::time::Duration::from_millis(800)).await;
|
||||
}
|
||||
|
||||
// === STEP 3: Aggregate all sources into unified USD prices ===
|
||||
println!(" Aggregating multi-source price data (FX-adjusted)...");
|
||||
if let Err(e) = aggregate_best_price_data(&company.lei).await {
|
||||
println!(" Aggregation failed: {}", e);
|
||||
} else {
|
||||
println!(" Aggregation complete");
|
||||
}
|
||||
}
|
||||
|
||||
// Final save of optimized earnings events
|
||||
save_optimized_events(existing_events).await?;
|
||||
println!("\nCorporate update complete (LEI-based)");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn enrich_companies_with_leis(
|
||||
companies: &mut Vec<CompanyMetadata>,
|
||||
lei_to_isins: &HashMap<String, Vec<String>>,
|
||||
) {
|
||||
for company in companies.iter_mut() {
|
||||
if company.lei.is_empty() {
|
||||
// Try to find LEI by any known ISIN
|
||||
for isin in &company.isins {
|
||||
for (lei, isins) in lei_to_isins {
|
||||
if isins.contains(isin) {
|
||||
company.lei = lei.clone();
|
||||
println!("Found real LEI {} for {}", lei, company.name);
|
||||
break;
|
||||
}
|
||||
}
|
||||
if !company.lei.is_empty() { break; }
|
||||
}
|
||||
}
|
||||
|
||||
// Fallback: generate fake LEI if still missing
|
||||
if company.lei.is_empty() {
|
||||
company.lei = format!("FAKE{:019}", rand::random::<u64>());
|
||||
println!("No real LEI found → using fake for {}", company.name);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct ProcessResult {
|
||||
pub changes: Vec<CompanyEventChange>,
|
||||
}
|
||||
|
||||
pub fn process_batch(
|
||||
new_events: &[CompanyEvent],
|
||||
existing: &mut HashMap<String, CompanyEvent>,
|
||||
today: &str,
|
||||
) -> ProcessResult {
|
||||
let mut changes = Vec::new();
|
||||
|
||||
for new in new_events {
|
||||
let key = event_key(new);
|
||||
|
||||
if let Some(old) = existing.get(&key) {
|
||||
changes.extend(detect_changes(old, new, today));
|
||||
existing.insert(key, new.clone());
|
||||
continue;
|
||||
}
|
||||
|
||||
// Check for time change on same date
|
||||
let date_key = format!("{}|{}", new.ticker, new.date);
|
||||
let mut found_old = None;
|
||||
for (k, e) in existing.iter() {
|
||||
if format!("{}|{}", e.ticker, e.date) == date_key && k != &key {
|
||||
found_old = Some((k.clone(), e.clone()));
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if let Some((old_key, old_event)) = found_old {
|
||||
if new.date.as_str() > today {
|
||||
changes.push(CompanyEventChange {
|
||||
ticker: new.ticker.clone(),
|
||||
date: new.date.clone(),
|
||||
field_changed: "time".to_string(),
|
||||
old_value: old_event.time.clone(),
|
||||
new_value: new.time.clone(),
|
||||
detected_at: Local::now().format("%Y-%m-%d %H:%M:%S").to_string(),
|
||||
});
|
||||
}
|
||||
existing.remove(&old_key);
|
||||
}
|
||||
|
||||
existing.insert(key, new.clone());
|
||||
}
|
||||
|
||||
ProcessResult { changes }
|
||||
}
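// Hedged sketch (not part of the diff): the intended behaviour of process_batch for a
// rescheduled announcement. It assumes event_key() (from helpers.rs) includes the time
// field, so the same ticker/date with a new time arrives under a new key.
fn _example_process_batch(existing: &mut std::collections::HashMap<String, CompanyEvent>) {
    let rescheduled = CompanyEvent {
        ticker: "MSFT".into(),
        date: "2099-01-28".into(),   // future date → change is recorded
        time: "AMC".into(),          // previously stored as e.g. "BMO"
        period: "Q2 2099".into(),
        eps_forecast: Some(3.10),
        eps_actual: None,
        revenue_forecast: None,
        revenue_actual: None,
        surprise_pct: None,
        source: "Yahoo".into(),
    };
    let result = process_batch(&[rescheduled], existing, "2025-01-01");
    // If an entry for MSFT on that date already existed under a different time, it is
    // replaced and a single "time" change (old vs. new value) is reported.
    println!("{} change(s) detected", result.changes.len());
}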
|
||||
58 src/data/companies.json Normal file
@@ -0,0 +1,58 @@
|
||||
[
|
||||
{
|
||||
"lei": "8I5D5ASD7N5Z5P2K9M3J",
|
||||
"isins": ["US46625H1005"],
|
||||
"primary_isin": "US46625H1005",
|
||||
"name": "JPMorgan Chase & Co.",
|
||||
"tickers": [
|
||||
{ "ticker": "JPM", "exchange_mic": "XNYS", "currency": "USD", "primary": true },
|
||||
{ "ticker": "JPM-PC", "exchange_mic": "XNYS", "currency": "USD", "primary": false }
|
||||
]
|
||||
},
|
||||
{
|
||||
"lei": "5493001KJTIIGC8Y1R12",
|
||||
"isins": ["US5949181045"],
|
||||
"primary_isin": "US5949181045",
|
||||
"name": "Microsoft Corporation",
|
||||
"tickers": [
|
||||
{ "ticker": "MSFT", "exchange_mic": "XNAS", "currency": "USD", "primary": true }
|
||||
]
|
||||
},
|
||||
{
|
||||
"lei": "529900T8BM49AURSDO55",
|
||||
"isins": ["CNE000001P37"],
|
||||
"primary_isin": "CNE000001P37",
|
||||
"name": "Industrial and Commercial Bank of China",
|
||||
"tickers": [
|
||||
{ "ticker": "601398.SS", "exchange_mic": "XSHG", "currency": "CNY", "primary": true },
|
||||
{ "ticker": "1398.HK", "exchange_mic": "XHKG", "currency": "HKD", "primary": false }
|
||||
]
|
||||
},
|
||||
{
|
||||
"lei": "519900X5W8K6C1FZ3B57",
|
||||
"isins": ["JP3702200000"],
|
||||
"primary_isin": "JP3702200000",
|
||||
"name": "Toyota Motor Corporation",
|
||||
"tickers": [
|
||||
{ "ticker": "7203.T", "exchange_mic": "XJPX", "currency": "JPY", "primary": true },
|
||||
{ "ticker": "TM", "exchange_mic": "XNYS", "currency": "USD", "primary": false }
|
||||
]
|
||||
},
|
||||
{
|
||||
"lei": "529900T8BM49AURSDO56",
|
||||
"isins": ["HK0000069689"],
|
||||
"primary_isin": "HK0000069689",
|
||||
"name": "Tencent Holdings Limited",
|
||||
"tickers": [
|
||||
{ "ticker": "0700.HK", "exchange_mic": "XHKG", "currency": "HKD", "primary": true },
|
||||
{ "ticker": "TCEHY", "exchange_mic": "OTCM", "currency": "USD", "primary": false }
|
||||
]
|
||||
},
|
||||
{
|
||||
"lei": "8I5D5Q1L7N5Z5P2K9M3J",
|
||||
"isins": ["US90953F1049"],
|
||||
"primary_isin": "US90953F1049",
|
||||
"name": "Test Bonds Filter",
|
||||
"tickers": [{ "ticker": "JPM", "exchange_mic": "XNYS", "currency": "USD", "primary": true }]
|
||||
}
|
||||
]
|
||||
9 src/data/continents.json Normal file
@@ -0,0 +1,9 @@
|
||||
[
|
||||
"afrika",
|
||||
"asien",
|
||||
"europa",
|
||||
"nordamerika",
|
||||
"suedamerika",
|
||||
"antarktis",
|
||||
"ozeanien"
|
||||
]
|
||||
54 src/data/countries.json Normal file
@@ -0,0 +1,54 @@
|
||||
[
|
||||
"aegypten",
|
||||
"frankreich",
|
||||
"litauen",
|
||||
"schweiz",
|
||||
"argentinien",
|
||||
"griechenland",
|
||||
"mexiko",
|
||||
"singapur",
|
||||
"australien",
|
||||
"hongkong",
|
||||
"neuseeland",
|
||||
"slowakei",
|
||||
"bahrain",
|
||||
"indien",
|
||||
"niederlande",
|
||||
"spanien",
|
||||
"belgien",
|
||||
"indonesien",
|
||||
"norwegen",
|
||||
"suedafrika",
|
||||
"brasilien",
|
||||
"irland",
|
||||
"oesterreich",
|
||||
"suedkorea",
|
||||
"chile",
|
||||
"island",
|
||||
"peru",
|
||||
"taiwan",
|
||||
"china",
|
||||
"italien",
|
||||
"philippinen",
|
||||
"tschechien",
|
||||
"daenemark",
|
||||
"japan",
|
||||
"polen",
|
||||
"tuerkei",
|
||||
"deutschland",
|
||||
"kanada",
|
||||
"portugal",
|
||||
"ungarn",
|
||||
"estland",
|
||||
"katar",
|
||||
"rumaenien",
|
||||
"usa",
|
||||
"eurozone",
|
||||
"kolumbien",
|
||||
"russland",
|
||||
"vereinigte-arabische-emirate",
|
||||
"finnland",
|
||||
"lettland",
|
||||
"schweden",
|
||||
"vereinigtes-koenigreich"
|
||||
]
|
||||
260 src/data/exchanges.json Normal file
@@ -0,0 +1,260 @@
|
||||
{
|
||||
"exchanges": [
|
||||
{
|
||||
"mic": "XNYS",
|
||||
"name": "New York Stock Exchange",
|
||||
"country": "United States",
|
||||
"city": "New York City",
|
||||
"market_cap_trillion_usd": 30.92,
|
||||
"timezone": "America/New_York",
|
||||
"tz_offset": "-05:00",
|
||||
"dst": "Mar–Nov",
|
||||
"open_local": "09:30",
|
||||
"close_local": "16:00",
|
||||
"lunch_break": false,
|
||||
"open_utc": "14:30",
|
||||
"close_utc": "21:00",
|
||||
"currency": "USD"
|
||||
},
|
||||
{
|
||||
"mic": "XNAS",
|
||||
"name": "Nasdaq",
|
||||
"country": "United States",
|
||||
"city": "New York City",
|
||||
"market_cap_trillion_usd": 31.96,
|
||||
"timezone": "America/New_York",
|
||||
"tz_offset": "-05:00",
|
||||
"dst": "Mar–Nov",
|
||||
"open_local": "09:30",
|
||||
"close_local": "16:00",
|
||||
"lunch_break": false,
|
||||
"open_utc": "14:30",
|
||||
"close_utc": "21:00",
|
||||
"currency": "USD"
|
||||
},
|
||||
{
|
||||
"mic": "XSHG",
|
||||
"name": "Shanghai Stock Exchange",
|
||||
"country": "China",
|
||||
"city": "Shanghai",
|
||||
"market_cap_trillion_usd": 7.96,
|
||||
"timezone": "Asia/Shanghai",
|
||||
"tz_offset": "+08:00",
|
||||
"dst": null,
|
||||
"open_local": "09:30",
|
||||
"close_local": "15:00",
|
||||
"lunch_break": "11:30–13:00",
|
||||
"open_utc": "01:30",
|
||||
"close_utc": "07:00",
|
||||
"currency": "CNY"
|
||||
},
|
||||
{
|
||||
"mic": "XJPX",
|
||||
"name": "Japan Exchange Group (Tokyo Stock Exchange)",
|
||||
"country": "Japan",
|
||||
"city": "Tokyo",
|
||||
"market_cap_trillion_usd": 7.06,
|
||||
"timezone": "Asia/Tokyo",
|
||||
"tz_offset": "+09:00",
|
||||
"dst": null,
|
||||
"open_local": "09:00",
|
||||
"close_local": "15:00",
|
||||
"lunch_break": "11:30–12:30",
|
||||
"open_utc": "00:00",
|
||||
"close_utc": "06:00",
|
||||
"currency": "JPY"
|
||||
},
|
||||
{
|
||||
"mic": "XHKG",
|
||||
"name": "Hong Kong Stock Exchange",
|
||||
"country": "Hong Kong",
|
||||
"city": "Hong Kong",
|
||||
"market_cap_trillion_usd": 6.41,
|
||||
"timezone": "Asia/Hong_Kong",
|
||||
"tz_offset": "+08:00",
|
||||
"dst": null,
|
||||
"open_local": "09:30",
|
||||
"close_local": "16:00",
|
||||
"lunch_break": "12:00–13:00",
|
||||
"open_utc": "01:30",
|
||||
"close_utc": "08:00",
|
||||
"currency": "HKD"
|
||||
},
|
||||
{
|
||||
"mic": "XAMS",
|
||||
"name": "Euronext Amsterdam",
|
||||
"country": "Netherlands",
|
||||
"city": "Amsterdam",
|
||||
"market_cap_trillion_usd": 5.61,
|
||||
"timezone": "Europe/Amsterdam",
|
||||
"tz_offset": "+01:00",
|
||||
"dst": "Mar–Oct",
|
||||
"open_local": "09:00",
|
||||
"close_local": "17:30",
|
||||
"lunch_break": false,
|
||||
"open_utc": "08:00",
|
||||
"close_utc": "16:30",
|
||||
"currency": "EUR"
|
||||
},
|
||||
{
|
||||
"mic": "XBSE",
|
||||
"name": "Bombay Stock Exchange",
|
||||
"country": "India",
|
||||
"city": "Mumbai",
|
||||
"market_cap_trillion_usd": 5.25,
|
||||
"timezone": "Asia/Kolkata",
|
||||
"tz_offset": "+05:30",
|
||||
"dst": null,
|
||||
"open_local": "09:15",
|
||||
"close_local": "15:30",
|
||||
"lunch_break": false,
|
||||
"open_utc": "03:45",
|
||||
"close_utc": "10:00",
|
||||
"currency": "INR"
|
||||
},
|
||||
{
|
||||
"mic": "XNSE",
|
||||
"name": "National Stock Exchange of India",
|
||||
"country": "India",
|
||||
"city": "Mumbai",
|
||||
"market_cap_trillion_usd": 5.32,
|
||||
"timezone": "Asia/Kolkata",
|
||||
"tz_offset": "+05:30",
|
||||
"dst": null,
|
||||
"open_local": "09:15",
|
||||
"close_local": "15:d30",
|
||||
"lunch_break": false,
|
||||
"open_utc": "03:45",
|
||||
"close_utc": "10:00",
|
||||
"currency": "INR"
|
||||
},
|
||||
{
|
||||
"mic": "XSHE",
|
||||
"name": "Shenzhen Stock Exchange",
|
||||
"country": "China",
|
||||
"city": "Shenzhen",
|
||||
"market_cap_trillion_usd": 5.11,
|
||||
"timezone": "Asia/Shanghai",
|
||||
"tz_offset": "+08:00",
|
||||
"dst": null,
|
||||
"open_local": "09:30",
|
||||
"close_local": "15:00",
|
||||
"lunch_break": "11:30–13:00",
|
||||
"open_utc": "01:30",
|
||||
"close_utc": "07:00",
|
||||
"currency": "CNY"
|
||||
},
|
||||
{
|
||||
"mic": "XTSE",
|
||||
"name": "Toronto Stock Exchange",
|
||||
"country": "Canada",
|
||||
"city": "Toronto",
|
||||
"market_cap_trillion_usd": 4.00,
|
||||
"timezone": "America/Toronto",
|
||||
"tz_offset": "-05:00",
|
||||
"dst": "Mar–Nov",
|
||||
"open_local": "09:30",
|
||||
"close_local": "16:00",
|
||||
"lunch_break": false,
|
||||
"open_utc": "14:30",
|
||||
"close_utc": "21:00",
|
||||
"currency": "CAD"
|
||||
},
|
||||
{
|
||||
"mic": "XLON",
|
||||
"name": "London Stock Exchange",
|
||||
"country": "United Kingdom",
|
||||
"city": "London",
|
||||
"market_cap_trillion_usd": 3.14,
|
||||
"timezone": "Europe/London",
|
||||
"tz_offset": "+00:00",
|
||||
"dst": "Mar–Oct",
|
||||
"open_local": "08:00",
|
||||
"close_local": "16:30",
|
||||
"lunch_break": false,
|
||||
"open_utc": "08:00",
|
||||
"close_utc": "16:30",
|
||||
"currency": "GBP"
|
||||
},
|
||||
{
|
||||
"mic": "XTAI",
|
||||
"name": "Taiwan Stock Exchange",
|
||||
"country": "Taiwan",
|
||||
"city": "Taipei",
|
||||
"market_cap_trillion_usd": 2.87,
|
||||
"timezone": "Asia/Taipei",
|
||||
"tz_offset": "+08:00",
|
||||
"dst": null,
|
||||
"open_local": "09:00",
|
||||
"close_local": "13:30",
|
||||
"lunch_break": false,
|
||||
"open_utc": "01:00",
|
||||
"close_utc": "05:30",
|
||||
"currency": "TWD"
|
||||
},
|
||||
{
|
||||
"mic": "XSAU",
|
||||
"name": "Saudi Exchange (Tadawul)",
|
||||
"country": "Saudi Arabia",
|
||||
"city": "Riyadh",
|
||||
"market_cap_trillion_usd": 2.73,
|
||||
"timezone": "Asia/Riyadh",
|
||||
"tz_offset": "+03:00",
|
||||
"dst": null,
|
||||
"open_local": "10:00",
|
||||
"close_local": "15:00",
|
||||
"lunch_break": false,
|
||||
"open_utc": "07:00",
|
||||
"close_utc": "12:00",
|
||||
"currency": "SAR"
|
||||
},
|
||||
{
|
||||
"mic": "XFRA",
|
||||
"name": "Deutsche Börse (Xetra)",
|
||||
"country": "Germany",
|
||||
"city": "Frankfurt",
|
||||
"market_cap_trillion_usd": 2.04,
|
||||
"timezone": "Europe/Berlin",
|
||||
"tz_offset": "+01:00",
|
||||
"dst": "Mar–Oct",
|
||||
"open_local": "09:00",
|
||||
"close_local": "17:30",
|
||||
"lunch_break": false,
|
||||
"open_utc": "08:00",
|
||||
"close_utc": "16:30",
|
||||
"currency": "EUR"
|
||||
},
|
||||
{
|
||||
"mic": "XSWX",
|
||||
"name": "SIX Swiss Exchange",
|
||||
"country": "Switzerland",
|
||||
"city": "Zürich",
|
||||
"market_cap_trillion_usd": 1.97,
|
||||
"timezone": "Europe/Zurich",
|
||||
"tz_offset": "+01:00",
|
||||
"dst": "Mar–Oct",
|
||||
"open_local": "09:00",
|
||||
"close_local": "17:30",
|
||||
"lunch_break": false,
|
||||
"open_utc": "08:00",
|
||||
"close_utc": "16:30",
|
||||
"currency": "CHF"
|
||||
},
|
||||
{
|
||||
"mic": "XASX",
|
||||
"name": "Australian Securities Exchange",
|
||||
"country": "Australia",
|
||||
"city": "Sydney",
|
||||
"market_cap_trillion_usd": 1.89,
|
||||
"timezone": "Australia/Sydney",
|
||||
"tz_offset": "+10:00",
|
||||
"dst": "Oct–Apr",
|
||||
"open_local": "10:00",
|
||||
"close_local": "16:00",
|
||||
"lunch_break": false,
|
||||
"open_utc": "00:00",
|
||||
"close_utc": "06:00",
|
||||
"currency": "AUD"
|
||||
}
|
||||
]
|
||||
}
|
||||
6 src/data/index.txt Normal file
@@ -0,0 +1,6 @@
|
||||
data/*
|
||||
|
||||
companies.json
|
||||
continents.json
|
||||
countries.json
|
||||
exchanges.json
|
||||
60 src/economic/extraction_script.js Normal file
@@ -0,0 +1,60 @@
|
||||
// src/economic/extraction_script.js
|
||||
const events = [];
|
||||
let currentDate = '';
|
||||
|
||||
const rows = document.querySelectorAll('#TeletraderForm table tbody tr');
|
||||
|
||||
for (let i = 0; i < rows.length; i++) {
|
||||
const row = rows[i];
|
||||
const cells = row.querySelectorAll('td');
|
||||
|
||||
if (cells.length === 1 && cells[0].colSpan === 9) {
|
||||
const dateText = cells[0].textContent.trim();
|
||||
const monthMap = {
|
||||
'Januar': '01', 'Februar': '02', 'März': '03', 'April': '04',
|
||||
'Mai': '05', 'Juni': '06', 'Juli': '07', 'August': '08',
|
||||
'September': '09', 'Oktober': '10', 'November': '11', 'Dezember': '12'
|
||||
};
|
||||
const match = dateText.match(/(\d{1,2})\.\s+([a-zA-ZäöüßÄÖÜ]+)\s+(\d{4})/);
|
||||
if (match) {
|
||||
const day = match[1].padStart(2, '0');
|
||||
const month = monthMap[match[2]] || '01';
|
||||
const year = match[3];
|
||||
currentDate = `${year}-${month}-${day}`;
|
||||
} else {
|
||||
currentDate = '';
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
if (cells.length >= 8) {
|
||||
const time = cells[0]?.textContent?.trim() || '';
|
||||
const country = cells[2]?.textContent?.trim() || '';
|
||||
const eventName = cells[4]?.textContent?.trim() || '';
|
||||
if (!time || !country || !eventName) continue;
|
||||
|
||||
const yellowStars = cells[3]?.querySelectorAll('.icon--star.font-color-yellow').length || 0;
|
||||
if (yellowStars !== 3) continue;
|
||||
|
||||
let description = '';
|
||||
if (i + 1 < rows.length) {
|
||||
const next = rows[i + 1];
|
||||
const descP = next.querySelector('p');
|
||||
if (descP) description = descP.textContent?.trim() || '';
|
||||
}
|
||||
|
||||
events.push({
|
||||
country,
|
||||
date: currentDate,
|
||||
time,
|
||||
event: eventName,
|
||||
actual: cells[7]?.textContent?.trim() || '',
|
||||
forecast: cells[6]?.textContent?.trim() || '',
|
||||
previous: cells[5]?.textContent?.trim() || '',
|
||||
importance: 'High',
|
||||
description
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
return events;
|
||||
61 src/economic/helpers.rs Normal file
@@ -0,0 +1,61 @@
|
||||
// src/economic/helpers.rs
|
||||
use super::types::*;
|
||||
use chrono::{Local, NaiveDate};
|
||||
use std::collections::{HashMap, HashSet};
|
||||
|
||||
pub fn event_key(e: &EconomicEvent) -> String {
|
||||
format!("{}|{}|{}", e.date, e.time, e.event)
|
||||
}
|
||||
|
||||
pub fn identity_key(e: &EconomicEvent) -> String {
|
||||
format!("{}|{}|{}", e.country, e.event, e.date.split('-').take(2).collect::<Vec<_>>().join("-"))
|
||||
}
|
||||
|
||||
pub fn build_identity_lookup(events: &HashMap<String, EconomicEvent>) -> HashMap<String, (String, EconomicEvent)> {
|
||||
let mut map = HashMap::new();
|
||||
for (k, e) in events {
|
||||
map.insert(identity_key(e), (k.clone(), e.clone()));
|
||||
}
|
||||
map
|
||||
}
|
||||
|
||||
pub fn build_date_event_lookup(
|
||||
events: &HashMap<String, EconomicEvent>,
|
||||
) -> HashMap<String, Vec<(String, EconomicEvent)>> {
|
||||
let mut map: HashMap<String, Vec<(String, EconomicEvent)>> = HashMap::new();
|
||||
|
||||
for (k, e) in events {
|
||||
let key = format!("{}|{}|{}", e.country, e.event, e.date);
|
||||
map.entry(key).or_default().push((k.clone(), e.clone()));
|
||||
}
|
||||
map
|
||||
}
|
||||
|
||||
pub fn detect_changes(old: &EconomicEvent, new: &EconomicEvent, today: &str) -> Vec<EventChange> {
|
||||
let mut changes = Vec::new();
|
||||
let ts = Local::now().format("%Y-%m-%d %H:%M:%S").to_string();
|
||||
|
||||
if new.date.as_str() <= today { return changes; }
|
||||
|
||||
let fields = [
|
||||
("actual", &old.actual, &new.actual),
|
||||
("forecast", &old.forecast, &new.forecast),
|
||||
("previous", &old.previous, &new.previous),
|
||||
("description", &old.description, &new.description),
|
||||
];
|
||||
|
||||
for (field, old_val, new_val) in fields {
|
||||
if old_val != new_val {
|
||||
changes.push(EventChange {
|
||||
date: new.date.clone(),
|
||||
event: new.event.clone(),
|
||||
country: new.country.clone(),
|
||||
field_changed: field.to_string(),
|
||||
old_value: old_val.clone(),
|
||||
new_value: new_val.clone(),
|
||||
detected_at: ts.clone(),
|
||||
});
|
||||
}
|
||||
}
|
||||
changes
|
||||
}
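// Hedged sketch (not part of the diff): detect_changes only reports differences for
// events that are still in the future relative to `today`.
#[cfg(test)]
mod change_tests {
    use super::*;

    fn sample(date: &str, forecast: &str) -> EconomicEvent {
        EconomicEvent {
            country: "usa".into(),
            date: date.into(),
            time: "14:30".into(),
            event: "Nonfarm Payrolls".into(),
            actual: String::new(),
            forecast: forecast.into(),
            previous: "150K".into(),
            importance: "High".into(),
            description: String::new(),
        }
    }

    #[test]
    fn only_future_events_produce_changes() {
        let old = sample("2099-06-05", "180K");
        let new = sample("2099-06-05", "200K");
        assert_eq!(detect_changes(&old, &new, "2025-01-01").len(), 1); // forecast changed

        let past_old = sample("2020-06-05", "180K");
        let past_new = sample("2020-06-05", "200K");
        assert!(detect_changes(&past_old, &past_new, "2025-01-01").is_empty());
    }
}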
|
||||
11 src/economic/mod.rs Normal file
@@ -0,0 +1,11 @@
|
||||
// src/economic/mod.rs
|
||||
pub mod types;
|
||||
pub mod scraper;
|
||||
pub mod storage;
|
||||
pub mod update;
|
||||
pub mod helpers;
|
||||
|
||||
pub use types::*;
|
||||
pub use scraper::*;
|
||||
pub use update::run_full_update;
|
||||
pub use helpers::*;
|
||||
84 src/economic/scraper.rs Normal file
@@ -0,0 +1,84 @@
|
||||
// src/economic/scraper.rs
|
||||
use super::types::{EconomicEvent, ScrapeResult};
|
||||
use fantoccini::Client;
|
||||
use tokio::time::{sleep, Duration};
|
||||
use chrono::{Local, NaiveDate};
|
||||
|
||||
const EXTRACTION_JS: &str = include_str!("extraction_script.js");
|
||||
|
||||
pub async fn goto_and_prepare(client: &Client) -> anyhow::Result<()> {
|
||||
client.goto("https://www.finanzen.net/termine/wirtschaftsdaten/").await?;
|
||||
dismiss_overlays(client).await?;
|
||||
|
||||
if let Ok(tab) = client.find(fantoccini::Locator::Css(r#"div[data-sg-tab-item="teletrader-dates-three-stars"]"#)).await {
|
||||
tab.click().await?;
|
||||
println!("High importance tab selected");
|
||||
sleep(Duration::from_secs(2)).await;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn dismiss_overlays(client: &Client) -> anyhow::Result<()> {
|
||||
for _ in 0..10 {
|
||||
let removed: bool = client
|
||||
.execute(
|
||||
r#"(() => {
|
||||
const iframe = document.querySelector('iframe[title="Contentpass First Layer"]');
|
||||
if (iframe && iframe.parentNode) {
|
||||
iframe.parentNode.removeChild(iframe);
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
})()"#,
|
||||
vec![],
|
||||
)
|
||||
.await?
|
||||
.as_bool()
|
||||
.unwrap_or(false);
|
||||
if removed { break; }
|
||||
sleep(Duration::from_millis(500)).await;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn set_date_range(client: &Client, start: &str, end: &str) -> anyhow::Result<()> {
|
||||
let script = format!(
|
||||
r#"
|
||||
(() => {{
|
||||
const from = document.querySelector('#dtTeletraderFromDate');
|
||||
const to = document.querySelector('#dtTeletraderEndDate');
|
||||
if (from) {{ from.value = '{}'; from.dispatchEvent(new Event('change', {{bubbles: true}})); }}
|
||||
if (to) {{ to.value = '{}'; to.dispatchEvent(new Event('change', {{bubbles: true}})); }}
|
||||
return true;
|
||||
}})()
|
||||
"#,
|
||||
start, end
|
||||
);
|
||||
client.execute(&script, vec![]).await?;
|
||||
sleep(Duration::from_millis(1200)).await;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn extract_events(client: &Client) -> anyhow::Result<Vec<EconomicEvent>> {
|
||||
let result = client.execute(EXTRACTION_JS, vec![]).await?;
|
||||
let array = result.as_array().ok_or_else(|| anyhow::anyhow!("Expected array"))?;
|
||||
|
||||
let mut events = Vec::with_capacity(array.len());
|
||||
for val in array {
|
||||
if let Some(obj) = val.as_object() {
|
||||
events.push(EconomicEvent {
|
||||
country: obj["country"].as_str().unwrap_or("").to_string(),
|
||||
date: obj["date"].as_str().unwrap_or("").to_string(),
|
||||
time: obj["time"].as_str().unwrap_or("").to_string(),
|
||||
event: obj["event"].as_str().unwrap_or("").to_string(),
|
||||
actual: obj["actual"].as_str().unwrap_or("").to_string(),
|
||||
forecast: obj["forecast"].as_str().unwrap_or("").to_string(),
|
||||
previous: obj["previous"].as_str().unwrap_or("").to_string(),
|
||||
importance: "High".to_string(),
|
||||
description: obj["description"].as_str().unwrap_or("").to_string(),
|
||||
});
|
||||
}
|
||||
}
|
||||
println!("Extracted {} high-impact events", events.len());
|
||||
Ok(events)
|
||||
}
|
||||
121 src/economic/storage.rs Normal file
@@ -0,0 +1,121 @@
|
||||
// src/economic/storage.rs
|
||||
use super::types::*;
|
||||
use super::helpers::*;
|
||||
use tokio::fs;
|
||||
use chrono::{Local, NaiveDate, Datelike};
|
||||
use std::collections::HashMap;
|
||||
use std::path::Path;
|
||||
|
||||
pub async fn scan_existing_chunks() -> anyhow::Result<Vec<ChunkInfo>> {
|
||||
let dir = std::path::Path::new("economic_events");
|
||||
let mut chunks = Vec::new();
|
||||
|
||||
if dir.exists() {
|
||||
let mut entries = fs::read_dir(dir).await?;
|
||||
while let Some(entry) = entries.next_entry().await? {
|
||||
let path = entry.path();
|
||||
if path.extension().map(|e| e == "json").unwrap_or(false) {
|
||||
if let Some(name) = path.file_name().and_then(|n| n.to_str()) {
|
||||
if name.starts_with("chunk_") {
|
||||
if let Some(content) = fs::read_to_string(&path).await.ok() {
|
||||
if let Ok(events) = serde_json::from_str::<Vec<EconomicEvent>>(&content) {
|
||||
let start = name[6..16].to_string();
|
||||
let end = name[17..27].to_string();
|
||||
chunks.push(ChunkInfo { start_date: start, end_date: end, path, event_count: events.len() });
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
chunks.sort_by_key(|c| c.start_date.clone());
|
||||
Ok(chunks)
|
||||
}
|
||||
|
||||
pub async fn load_existing_events(chunks: &[ChunkInfo]) -> anyhow::Result<HashMap<String, EconomicEvent>> {
|
||||
let mut map = HashMap::new();
|
||||
for chunk in chunks {
|
||||
let content = fs::read_to_string(&chunk.path).await?;
|
||||
let events: Vec<EconomicEvent> = serde_json::from_str(&content)?;
|
||||
for e in events {
|
||||
map.insert(event_key(&e), e);
|
||||
}
|
||||
}
|
||||
Ok(map)
|
||||
}
|
||||
|
||||
pub async fn save_optimized_chunks(events: HashMap<String, EconomicEvent>) -> anyhow::Result<()> {
|
||||
let dir = std::path::Path::new("economic_events");
|
||||
fs::create_dir_all(dir).await?;
|
||||
|
||||
// Delete all old chunk files to prevent duplicates and overlaps
|
||||
println!("Removing old chunks...");
|
||||
|
||||
let mut entries = fs::read_dir(dir).await?;
|
||||
while let Some(entry) = entries.next_entry().await? {
|
||||
let path = entry.path();
|
||||
if let Some(name) = path.file_name().and_then(|n| n.to_str()) {
|
||||
if name.starts_with("chunk_") && path.extension().map(|e| e == "json").unwrap_or(false) {
|
||||
fs::remove_file(&path).await?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let mut sorted: Vec<_> = events.into_values().collect();
|
||||
sorted.sort_by_key(|e| e.date.clone());
|
||||
|
||||
let mut chunk: Vec<EconomicEvent> = Vec::new();
|
||||
const MAX_EVENTS_PER_CHUNK: usize = (30000 / 2) / 11; // = 1363 events per chunk
|
||||
|
||||
for e in sorted {
|
||||
if !chunk.is_empty() && chunk.len() >= MAX_EVENTS_PER_CHUNK {
|
||||
save_chunk(&chunk, dir).await?;
|
||||
chunk.clear();
|
||||
}
|
||||
chunk.push(e);
|
||||
}
|
||||
if !chunk.is_empty() {
|
||||
save_chunk(&chunk, dir).await?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn save_chunk(events: &[EconomicEvent], dir: &std::path::Path) -> anyhow::Result<()> {
|
||||
let start = events.iter().map(|e| &e.date).min().unwrap().clone();
|
||||
let end = events.iter().map(|e| &e.date).max().unwrap().clone();
|
||||
let path = dir.join(format!("chunk_{}_{}.json", start, end));
|
||||
fs::write(&path, serde_json::to_string_pretty(events)?).await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn save_changes(changes: &[EventChange]) -> anyhow::Result<()> {
|
||||
if changes.is_empty() { return Ok(()); }
|
||||
let dir = std::path::Path::new("economic_event_changes");
|
||||
fs::create_dir_all(dir).await?;
|
||||
|
||||
let mut by_month: HashMap<String, Vec<EventChange>> = HashMap::new();
|
||||
for c in changes {
|
||||
if let Ok(d) = NaiveDate::parse_from_str(&c.date, "%Y-%m-%d") {
|
||||
let key = format!("{:02}_{}", d.month(), d.year());
|
||||
by_month.entry(key).or_default().push(c.clone());
|
||||
}
|
||||
}
|
||||
|
||||
for (month, list) in by_month {
|
||||
let path = dir.join(format!("event_changes_{}.json", month));
|
||||
let mut all = if path.exists() {
|
||||
let s = fs::read_to_string(&path).await?;
|
||||
serde_json::from_str(&s).unwrap_or_default()
|
||||
} else { vec![] };
|
||||
all.extend(list);
|
||||
fs::write(&path, serde_json::to_string_pretty(&all)?).await?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn target_end_date() -> String {
|
||||
let now = Local::now().naive_local().date();
|
||||
let future = now + chrono::Duration::days(90);
|
||||
future.format("%Y-%m-%d").to_string()
|
||||
}
|
||||
40 src/economic/types.rs Normal file
@@ -0,0 +1,40 @@
|
||||
// src/economic/types.rs
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq, Hash)]
|
||||
pub struct EconomicEvent {
|
||||
pub country: String,
|
||||
pub date: String, // YYYY-MM-DD
|
||||
pub time: String,
|
||||
pub event: String,
|
||||
pub actual: String,
|
||||
pub forecast: String,
|
||||
pub previous: String,
|
||||
pub importance: String,
|
||||
pub description: String,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize, Clone)]
|
||||
pub struct EventChange {
|
||||
pub date: String,
|
||||
pub event: String,
|
||||
pub country: String,
|
||||
pub field_changed: String,
|
||||
pub old_value: String,
|
||||
pub new_value: String,
|
||||
pub detected_at: String,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct ChunkInfo {
|
||||
pub start_date: String,
|
||||
pub end_date: String,
|
||||
pub path: std::path::PathBuf,
|
||||
pub event_count: usize,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct ScrapeResult {
|
||||
pub changes: Vec<EventChange>,
|
||||
pub removed_keys: std::collections::HashSet<String>,
|
||||
}
|
||||
102 src/economic/update.rs Normal file
@@ -0,0 +1,102 @@
|
||||
// src/economic/update.rs
|
||||
use super::{scraper::*, storage::*, helpers::*, types::*};
|
||||
use crate::config::Config;
|
||||
use chrono::{Local, NaiveDate};
|
||||
|
||||
pub async fn run_full_update(client: &fantoccini::Client, config: &Config) -> anyhow::Result<()> {
|
||||
let today_str = chrono::Local::now().date_naive().format("%Y-%m-%d").to_string();
|
||||
let end_date = config.target_end_date();
|
||||
|
||||
let chunks = scan_existing_chunks().await?;
|
||||
let mut events = load_existing_events(&chunks).await?;
|
||||
println!("Loaded {} events from {} chunks", events.len(), chunks.len());
|
||||
|
||||
let start_date = if events.is_empty() {
|
||||
config.economic_start_date.clone()
|
||||
} else if events.values().any(|e| e.date >= today_str) {
|
||||
today_str.clone()
|
||||
} else {
|
||||
events.values()
|
||||
.filter_map(|e| chrono::NaiveDate::parse_from_str(&e.date, "%Y-%m-%d").ok())
|
||||
.max()
|
||||
.and_then(|d| d.succ_opt())
|
||||
.map(|d| d.format("%Y-%m-%d").to_string())
|
||||
.unwrap_or(today_str.clone())
|
||||
};
|
||||
|
||||
println!("Scraping economic events: {} → {}", start_date, end_date);
|
||||
|
||||
let mut current = start_date;
|
||||
let mut total_changes = 0;
|
||||
|
||||
while current <= end_date {
|
||||
set_date_range(client, &current, &end_date).await?;
|
||||
tokio::time::sleep(tokio::time::Duration::from_secs(3)).await;
|
||||
|
||||
let new_events = extract_events(client).await?;
|
||||
if new_events.is_empty() { break; }
|
||||
|
||||
let result = process_batch(&new_events, &mut events, &today_str);
|
||||
total_changes += result.changes.len();
|
||||
save_changes(&result.changes).await?;
|
||||
|
||||
let next = new_events.iter()
|
||||
.filter_map(|e| chrono::NaiveDate::parse_from_str(&e.date, "%Y-%m-%d").ok())
|
||||
.max()
|
||||
.and_then(|d| d.succ_opt())
|
||||
.map(|d| d.format("%Y-%m-%d").to_string())
|
||||
.unwrap_or(end_date.clone());
|
||||
|
||||
if next > end_date { break; }
|
||||
current = next;
|
||||
}
|
||||
|
||||
save_optimized_chunks(events).await?;
|
||||
println!("Economic update complete — {} changes detected", total_changes);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn process_batch(
|
||||
new_events: &[EconomicEvent],
|
||||
existing: &mut std::collections::HashMap<String, EconomicEvent>,
|
||||
today: &str,
|
||||
) -> ScrapeResult {
|
||||
let mut changes = Vec::new();
|
||||
let mut removed = std::collections::HashSet::new();
|
||||
|
||||
let identity_map = build_identity_lookup(existing);
|
||||
let date_map = build_date_event_lookup(existing);
|
||||
|
||||
for new in new_events {
|
||||
let key = event_key(new);
|
||||
|
||||
if let Some(old) = existing.get(&key) {
|
||||
changes.extend(detect_changes(old, new, today));
|
||||
existing.insert(key, new.clone());
|
||||
continue;
|
||||
}
|
||||
|
||||
let date_key = format!("{}|{}|{}", new.country, new.event, new.date);
|
||||
if let Some(occurrences) = date_map.get(&date_key) {
|
||||
if let Some((old_key, old_event)) = occurrences.iter().find(|(k, _)| *k != key) {
|
||||
if new.date.as_str() > today {
|
||||
changes.push(EventChange {
|
||||
date: new.date.clone(),
|
||||
event: new.event.clone(),
|
||||
country: new.country.clone(),
|
||||
field_changed: "time".to_string(),
|
||||
old_value: old_event.time.clone(),
|
||||
new_value: new.time.clone(),
|
||||
detected_at: Local::now().format("%Y-%m-%d %H:%M:%S").to_string(),
|
||||
});
|
||||
}
|
||||
removed.insert(old_key.clone());
|
||||
existing.remove(old_key);
|
||||
}
|
||||
}
|
||||
|
||||
existing.insert(key, new.clone());
|
||||
}
|
||||
|
||||
ScrapeResult { changes, removed_keys: removed }
|
||||
}
|
||||
71 src/main.rs Normal file
@@ -0,0 +1,71 @@
|
||||
// src/main.rs
|
||||
mod economic;
|
||||
mod corporate;
|
||||
mod config;
|
||||
mod util;
|
||||
|
||||
use fantoccini::{ClientBuilder};
|
||||
use serde_json::{Map, Value};
|
||||
use tokio::signal;
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> anyhow::Result<()> {
|
||||
// === Ensure data directories exist ===
|
||||
util::ensure_data_dirs().await?;
|
||||
|
||||
// === Load configuration ===
|
||||
let config = config::Config::default();
|
||||
|
||||
// === Start ChromeDriver ===
|
||||
let mut child = std::process::Command::new("chromedriver-win64/chromedriver.exe")
|
||||
.args(["--port=9515"]) // Level 3 = minimal logs
|
||||
.spawn()?;
|
||||
|
||||
// Build capabilities to hide infobar + enable full rendering
|
||||
let port = 9515;
|
||||
let caps_value = serde_json::json!({
|
||||
"goog:chromeOptions": {
|
||||
"args": [
|
||||
//"--headless",
|
||||
"--disable-gpu",
|
||||
"--disable-notifications",
|
||||
"--disable-popup-blocking",
|
||||
"--disable-blink-features=AutomationControlled"
|
||||
],
|
||||
"excludeSwitches": ["enable-automation"]
|
||||
}
|
||||
});
|
||||
|
||||
let caps_map: Map<String, Value> = caps_value.as_object()
|
||||
.expect("Capabilities should be a JSON object")
|
||||
.clone();
|
||||
|
||||
let mut client = ClientBuilder::native()
|
||||
.capabilities(caps_map)
|
||||
.connect(&format!("http://localhost:{}", port))
|
||||
.await?;
|
||||
|
||||
// Graceful shutdown
|
||||
let client_clone = client.clone();
|
||||
tokio::spawn(async move {
|
||||
signal::ctrl_c().await.unwrap();
|
||||
client_clone.close().await.ok();
|
||||
std::process::exit(0);
|
||||
});
|
||||
|
||||
// === Economic Calendar Update ===
|
||||
println!("Updating Economic Calendar (High Impact Only)");
|
||||
economic::goto_and_prepare(&client).await?;
|
||||
economic::run_full_update(&client, &config).await?;
|
||||
|
||||
// === Corporate Earnings Update ===
|
||||
println!("\nUpdating Corporate Earnings");
|
||||
corporate::run_full_update(&client, &config).await?;
|
||||
|
||||
// === Cleanup ===
|
||||
client.close().await?;
|
||||
child.kill()?;
|
||||
|
||||
println!("\nAll data updated successfully!");
|
||||
Ok(())
|
||||
}
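// Hedged sketch (not part of the diff): the chromedriver path above is Windows-specific;
// a per-OS fallback could look like this (binary locations are assumptions).
fn _chromedriver_command() -> std::process::Command {
    let driver = if cfg!(windows) {
        "chromedriver-win64/chromedriver.exe"
    } else {
        "chromedriver" // assume it is on PATH on Linux/macOS
    };
    let mut cmd = std::process::Command::new(driver);
    cmd.args(["--port=9515"]); // must match the ClientBuilder connect URL
    cmd
}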
|
||||
22 src/util.rs Normal file
@@ -0,0 +1,22 @@
|
||||
// src/util.rs (or put it directly in main.rs if you prefer)
|
||||
use tokio::fs;
|
||||
use std::path::Path;
|
||||
|
||||
/// Create the required data folders if they do not exist yet.
|
||||
pub async fn ensure_data_dirs() -> anyhow::Result<()> {
|
||||
let dirs = [
|
||||
"economic_events",
|
||||
"economic_event_changes",
|
||||
"corporate_events",
|
||||
"corporate_prices",
|
||||
"data",
|
||||
];
|
||||
for dir in dirs {
|
||||
let path = Path::new(dir);
|
||||
if !path.exists() {
|
||||
tokio::fs::create_dir_all(path).await?;
|
||||
println!("Created directory: {dir}");
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||