diff --git a/.gitignore b/.gitignore index 31b6b3c..d630def 100644 --- a/.gitignore +++ b/.gitignore @@ -29,4 +29,5 @@ target/ /economic_event_changes* /corporate_events* /corporate_prices* -/corporate_event_changes* \ No newline at end of file +/corporate_event_changes* +/data* \ No newline at end of file diff --git a/Cargo.lock b/Cargo.lock index 2bfdba3..9778843 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -8,6 +8,17 @@ version = "2.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa" +[[package]] +name = "aes" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b169f7a6d4742236a0a00c541b845991d0ac43e546831af1249753ab4c3aa3a0" +dependencies = [ + "cfg-if", + "cipher", + "cpufeatures", +] + [[package]] name = "ahash" version = "0.7.8" @@ -71,6 +82,15 @@ version = "1.0.100" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a23eb6b1614318a8071c9b2521f36b424b2c83db5eb3a0fead4a6c0809af6e61" +[[package]] +name = "arbitrary" +version = "1.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c3d036a3c4ab069c7b410a2ce876bd74808d2d0888a82667669f8e783a898bf1" +dependencies = [ + "derive_arbitrary", +] + [[package]] name = "arrayvec" version = "0.7.6" @@ -228,6 +248,15 @@ version = "1.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b35204fbdc0b3f4446b89fc1ac2cf84a8a68971995d0bf2e925ec7cd960f9cb3" +[[package]] +name = "bzip2" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f3a53fac24f34a81bc9954b5d6cfce0c21e18ec6959f44f56e8e90e4bb7c346c" +dependencies = [ + "libbz2-rs-sys", +] + [[package]] name = "cc" version = "1.2.46" @@ -235,6 +264,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b97463e1064cb1b1c1384ad0a0b9c8abd0988e2a91f52606c80ef14aadb63e36" dependencies = [ "find-msvc-tools", + "jobserver", + "libc", "shlex", ] @@ -275,6 +306,16 @@ dependencies = [ "serde", ] +[[package]] +name = "cipher" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "773f3b9af64447d2ce9850330c473515014aa235e6a783b02db81ff39e4a3dad" +dependencies = [ + "crypto-common", + "inout", +] + [[package]] name = "compression-codecs" version = "0.4.32" @@ -293,6 +334,12 @@ version = "0.4.30" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3a9b614a5787ef0c8802a55766480563cb3a93b435898c422ed2a359cf811582" +[[package]] +name = "constant_time_eq" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c74b8349d32d297c9134b8c88677813a227df8f779daa29bfc29c183fe3dca6" + [[package]] name = "cookie" version = "0.16.2" @@ -378,6 +425,21 @@ dependencies = [ "libc", ] +[[package]] +name = "crc" +version = "3.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9710d3b3739c2e349eb44fe848ad0b7c8cb1e42bd87ee49371df2f7acaf3e675" +dependencies = [ + "crc-catalog", +] + +[[package]] +name = "crc-catalog" +version = "2.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19d374276b40fb8bbdee95aef7c7fa6b5316ec764510eb64b8dd0e2ed0d7e7f5" + [[package]] name = "crc32fast" version = "1.5.0" @@ -445,12 +507,39 @@ dependencies = [ "syn 2.0.110", ] +[[package]] +name = "csv" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52cd9d68cf7efc6ddfaaee42e7288d3a99d613d4b50f76ce9827ae0c6e14f938" +dependencies = [ + "csv-core", + "itoa", + "ryu", + "serde_core", +] + +[[package]] +name = "csv-core" +version = "0.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "704a3c26996a80471189265814dbc2c257598b96b8a7feae2d31ace646bb9782" +dependencies = [ + "memchr", +] + [[package]] name = "data-encoding" version = "2.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2a2330da5de22e8a3cb63252ce2abb30116bf5265e89c0e01bc17015ce30a476" +[[package]] +name = "deflate64" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26bf8fc351c5ed29b5c2f0cbbac1b209b74f60ecd62e675a998df72c49af5204" + [[package]] name = "deranged" version = "0.5.5" @@ -460,6 +549,17 @@ dependencies = [ "powerfmt", ] +[[package]] +name = "derive_arbitrary" +version = "1.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e567bd82dcff979e4b03460c307b3cdc9e96fde3d73bed1496d2bc75d9dd62a" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.110", +] + [[package]] name = "derive_more" version = "0.99.20" @@ -479,6 +579,7 @@ checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" dependencies = [ "block-buffer", "crypto-common", + "subtle", ] [[package]] @@ -559,7 +660,9 @@ version = "0.1.0" dependencies = [ "anyhow", "chrono", + "csv", "fantoccini", + "flate2", "futures", "rayon", "reqwest", @@ -570,6 +673,7 @@ dependencies = [ "tracing", "tracing-subscriber", "yfinance-rs", + "zip", ] [[package]] @@ -621,6 +725,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bfe33edd8e85a12a67454e37f8c75e730830d83e313556ab9ebf9ee7fbeb3bfb" dependencies = [ "crc32fast", + "libz-rs-sys", "miniz_oxide", ] @@ -854,6 +959,15 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" +[[package]] +name = "hmac" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c49c37c09c17a53d937dfbb742eb3a961d65a994e6bcdcf37e7399d0cc8ab5e" +dependencies = [ + "digest", +] + [[package]] name = "html5ever" version = "0.27.0" @@ -1205,6 +1319,15 @@ dependencies = [ "hashbrown 0.16.1", ] +[[package]] +name = "inout" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "879f10e63c20629ecabbb64a8010319738c66a5cd0c29b02d63d272b03751d01" +dependencies = [ + "generic-array", +] + [[package]] name = "ipnet" version = "2.11.0" @@ -1262,6 +1385,16 @@ version = "1.0.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" +[[package]] +name = "jobserver" +version = "0.1.34" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9afb3de4395d6b3e67a780b6de64b51c978ecf11cb9a462c66be7d4ca9039d33" +dependencies = [ + "getrandom 0.3.4", + "libc", +] + [[package]] name = "js-sys" version = "0.3.82" @@ -1278,12 +1411,27 @@ version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" +[[package]] +name = "libbz2-rs-sys" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c4a545a15244c7d945065b5d392b2d2d7f21526fba56ce51467b06ed445e8f7" + [[package]] name = "libc" version = "0.2.177" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2874a2af47a2325c2001a6e6fad9b16a53b802102b528163885171cf92b15976" +[[package]] +name = "libz-rs-sys" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "840db8cf39d9ec4dd794376f38acc40d0fc65eec2a8f484f7fd375b84602becd" +dependencies = [ + "zlib-rs", +] + [[package]] name = "linux-raw-sys" version = "0.11.0" @@ -1323,6 +1471,16 @@ version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "112b39cec0b298b6c1999fee3e31427f74f676e4cb9879ed1a121b43661a4154" +[[package]] +name = "lzma-rust2" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c60a23ffb90d527e23192f1246b14746e2f7f071cb84476dd879071696c18a4a" +dependencies = [ + "crc", + "sha2", +] + [[package]] name = "mac" version = "0.1.1" @@ -1624,6 +1782,16 @@ dependencies = [ "windows-link", ] +[[package]] +name = "pbkdf2" +version = "0.12.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8ed6a7761f76e3b9f92dfb0a60a6a6477c61024b775147ff0973a02653abaf2" +dependencies = [ + "digest", + "hmac", +] + [[package]] name = "percent-encoding" version = "2.3.2" @@ -1781,6 +1949,12 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391" +[[package]] +name = "ppmd-rust" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d558c559f0450f16f2a27a1f017ef38468c1090c9ce63c8e51366232d53717b4" + [[package]] name = "ppv-lite86" version = "0.2.21" @@ -2126,6 +2300,7 @@ dependencies = [ "cookie 0.18.1", "cookie_store", "encoding_rs", + "futures-channel", "futures-core", "futures-util", "h2", @@ -2520,6 +2695,17 @@ dependencies = [ "digest", ] +[[package]] +name = "sha2" +version = "0.10.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest", +] + [[package]] name = "sharded-slab" version = "0.1.7" @@ -3645,6 +3831,20 @@ name = "zeroize" version = "1.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b97154e67e32c85465826e8bcc1c59429aaaf107c1e4a9e53c8d8ccd5eff88d0" +dependencies = [ + "zeroize_derive", +] + +[[package]] +name = "zeroize_derive" +version = "1.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ce36e65b0d2999d2aafac989fb249189a141aee1f53c612c1f37d72631959f69" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.110", +] [[package]] name = "zerotrie" @@ -3678,3 +3878,76 @@ dependencies = [ "quote", "syn 2.0.110", ] + +[[package]] +name = "zip" +version = "6.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eb2a05c7c36fde6c09b08576c9f7fb4cda705990f73b58fe011abf7dfb24168b" +dependencies = [ + "aes", + "arbitrary", + "bzip2", + "constant_time_eq", + "crc32fast", + "deflate64", + "flate2", + "getrandom 0.3.4", + "hmac", + "indexmap", + "lzma-rust2", + "memchr", + "pbkdf2", + "ppmd-rust", + "sha1", + "time", + "zeroize", + "zopfli", + "zstd", +] + +[[package]] +name = "zlib-rs" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2f06ae92f42f5e5c42443fd094f245eb656abf56dd7cce9b8b263236565e00f2" + +[[package]] +name = "zopfli" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f05cd8797d63865425ff89b5c4a48804f35ba0ce8d125800027ad6017d2b5249" +dependencies = [ + "bumpalo", + "crc32fast", + "log", + "simd-adler32", +] + +[[package]] +name = "zstd" +version = "0.13.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e91ee311a569c327171651566e07972200e76fcfe2242a4fa446149a3881c08a" +dependencies = [ + "zstd-safe", +] + +[[package]] +name = "zstd-safe" +version = "7.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f49c4d5f0abb602a93fb8736af2a4f4dd9512e36f7f570d66e65ff867ed3b9d" +dependencies = [ + "zstd-sys", +] + +[[package]] +name = "zstd-sys" +version = "2.0.16+zstd.1.5.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91e19ebc2adc8f83e43039e79776e3fda8ca919132d68a1fed6a5faca2683748" +dependencies = [ + "cc", + "pkg-config", +] diff --git a/Cargo.toml b/Cargo.toml index c59a710..7946aae 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,7 +17,7 @@ categories = ["finance", "data-structures", "asynchronous"] tokio = { version = "1.38", features = ["full"] } # Web scraping & HTTP -reqwest = { version = "0.12", features = ["json", "gzip", "brotli", "deflate"] } +reqwest = { version = "0.12", features = ["json", "gzip", "brotli", "deflate", "blocking"] } scraper = "0.19" # HTML parsing for Yahoo earnings pages fantoccini = { version = "0.20", features = ["rustls-tls"] } # Headless Chrome for finanzen.net yfinance-rs = "0.7.2" @@ -25,6 +25,9 @@ yfinance-rs = "0.7.2" # Serialization serde = { version = "1.0", features = ["derive"] } serde_json = "1.0" +csv = "1.3" +zip = "6.0.0" +flate2 = "1.1.5" # Date & time chrono = { version = "0.4", features = ["serde"] } diff --git a/src/corporate/aggregation.rs b/src/corporate/aggregation.rs index 67393c0..0caf6dc 100644 --- a/src/corporate/aggregation.rs +++ b/src/corporate/aggregation.rs @@ -16,8 +16,8 @@ struct DayData { } /// Aggregate price data from multiple exchanges, converting all to USD -pub async fn aggregate_best_price_data(isin: &str) -> anyhow::Result<()> { - let company_dir = get_company_dir(isin); +pub async fn aggregate_best_price_data(lei: &str) -> anyhow::Result<()> { + let company_dir = get_company_dir(lei); for timeframe in ["daily", "5min"].iter() { let source_dir = company_dir.join(timeframe); @@ -136,7 +136,7 @@ pub async fn aggregate_best_price_data(isin: &str) -> anyhow::Result<()> { .unwrap_or_else(|| "unknown".to_string()); aggregated.push(CompanyPrice { - ticker: format!("{}@agg", isin), // Mark as aggregated + ticker: format!("{lei}@agg"), // Mark as aggregated date, time, open: data.open, @@ -159,7 +159,7 @@ pub async fn aggregate_best_price_data(isin: &str) -> anyhow::Result<()> { // Save aggregation metadata let meta = AggregationMetadata { - isin: isin.to_string(), + lei: lei.to_string(), // ← CHANGE THIS timeframe: timeframe.to_string(), sources: sources_used.into_iter().collect(), total_bars: aggregated.len(), @@ -169,7 +169,7 @@ pub async fn aggregate_best_price_data(isin: &str) -> anyhow::Result<()> { ), aggregated_at: chrono::Local::now().format("%Y-%m-%d %H:%M:%S").to_string(), }; - + let meta_path = agg_dir.join("metadata.json"); fs::write(&meta_path, serde_json::to_string_pretty(&meta)?).await?; @@ -185,7 +185,7 @@ pub async fn aggregate_best_price_data(isin: &str) -> anyhow::Result<()> { #[derive(Debug, serde::Serialize, serde::Deserialize)] struct AggregationMetadata { - isin: String, + lei: String, timeframe: String, sources: Vec, total_bars: usize, diff --git a/src/corporate/helpers.rs b/src/corporate/helpers.rs index fdde549..066d437 100644 --- a/src/corporate/helpers.rs +++ b/src/corporate/helpers.rs @@ -57,4 +57,14 @@ pub fn price_key(p: &CompanyPrice) -> String { } else { format!("{}|{}|{}", p.ticker, p.date, p.time) } +} + +pub fn parse_float(s: &str) -> Option { + s.replace("--", "").replace(",", "").parse::().ok() +} + +pub fn parse_yahoo_date(s: &str) -> anyhow::Result { + NaiveDate::parse_from_str(s, "%B %d, %Y") + .or_else(|_| NaiveDate::parse_from_str(s, "%b %d, %Y")) + .map_err(|_| anyhow::anyhow!("Bad date: {s}")) } \ No newline at end of file diff --git a/src/corporate/scraper.rs b/src/corporate/scraper.rs index 686aef7..b2a64b5 100644 --- a/src/corporate/scraper.rs +++ b/src/corporate/scraper.rs @@ -1,11 +1,16 @@ // src/corporate/scraper.rs -use super::types::{CompanyEvent, CompanyPrice, TickerInfo}; +use super::{types::{CompanyEvent, CompanyPrice, TickerInfo}, helpers::*}; +use csv::ReaderBuilder; use fantoccini::{Client, Locator}; use scraper::{Html, Selector}; use chrono::{DateTime, Duration, NaiveDate, Timelike, Utc}; -use tokio::time::{sleep, Duration as TokioDuration}; +use tokio::{time::{Duration as TokioDuration, sleep}}; use reqwest::Client as HttpClient; use serde_json::Value; +use zip::ZipArchive; +use std::fs::File; +use std::{collections::HashMap}; +use std::io::{Read, BufReader}; const USER_AGENT: &str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"; @@ -136,8 +141,8 @@ async fn check_ticker_exists(ticker: &str) -> anyhow::Result { Ok(TickerInfo { ticker: ticker.to_string(), exchange_mic, - currency, - primary: false, // Will be set separately + currency: currency.to_string(), + primary: false, }) } @@ -418,12 +423,170 @@ pub async fn fetch_price_history_5min( Ok(prices) } -fn parse_float(s: &str) -> Option { - s.replace("--", "").replace(",", "").parse::().ok() +/// Fetch the URL of the latest ISIN↔LEI mapping CSV from GLEIF +/// Overengineered; we could just use the static URL, but this shows how to scrape if needed +pub async fn _fetch_latest_gleif_isin_lei_mapping_url(client: &Client) -> anyhow::Result { + let url = format!("https://www.gleif.org/de/lei-data/lei-mapping/download-isin-to-lei-relationship-files"); + client.goto(&url).await?; + + let html = client.source().await?; + let _document = Html::parse_document(&html); + let _row_sel = Selector::parse("table tbody tr").unwrap(); + let isin_lei = "".to_string(); + + Ok(isin_lei) } -fn parse_yahoo_date(s: &str) -> anyhow::Result { - NaiveDate::parse_from_str(s, "%B %d, %Y") - .or_else(|_| NaiveDate::parse_from_str(s, "%b %d, %Y")) - .map_err(|_| anyhow::anyhow!("Bad date: {s}")) +pub async fn download_isin_lei_csv() -> anyhow::Result> { + let url = "https://mapping.gleif.org/api/v2/isin-lei/9315e3e3-305a-4e71-b062-46714740fa8d/download"; + let zip_path = "data/isin_lei.zip"; + let csv_path = "data/isin_lei.csv"; + + if let Err(e) = std::fs::create_dir_all("data") { + println!("Failed to create data directory: {e}"); + return Ok(None); + } + + // Download ZIP + let bytes = match reqwest::Client::builder() + .user_agent(USER_AGENT) + .timeout(std::time::Duration::from_secs(30)) + .build() + .and_then(|c| Ok(c)) + { + Ok(client) => match client.get(url).send().await { + Ok(resp) if resp.status().is_success() => match resp.bytes().await { + Ok(b) => b, + Err(e) => { + println!("Failed to read ZIP bytes: {e}"); + return Ok(None); + } + }, + Ok(resp) => { + println!("Server returned HTTP {}", resp.status()); + return Ok(None); + } + Err(e) => { + println!("Failed to download ISIN/LEI ZIP: {e}"); + return Ok(None); + } + }, + Err(e) => { + println!("Failed to create HTTP client: {e}"); + return Ok(None); + } + }; + + if let Err(e) = tokio::fs::write(zip_path, &bytes).await { + println!("Failed to write ZIP file: {e}"); + return Ok(None); + } + + // Extract CSV + let archive = match std::fs::File::open(zip_path) + .map(ZipArchive::new) + { + Ok(Ok(a)) => a, + Ok(Err(e)) => { + println!("Invalid ZIP: {e}"); + return Ok(None); + } + Err(e) => { + println!("Cannot open ZIP file: {e}"); + return Ok(None); + } + }; + + let mut archive = archive; + + let idx = match (0..archive.len()).find(|&i| { + archive.by_index(i) + .map(|f| f.name().ends_with(".csv")) + .unwrap_or(false) + }) { + Some(i) => i, + None => { + println!("ZIP did not contain a CSV file"); + return Ok(None); + } + }; + + let mut csv_file = match archive.by_index(idx) { + Ok(f) => f, + Err(e) => { + println!("Failed to read CSV entry: {e}"); + return Ok(None); + } + }; + + let mut csv_bytes = Vec::new(); + if let Err(e) = csv_file.read_to_end(&mut csv_bytes) { + println!("Failed to extract CSV: {e}"); + return Ok(None); + } + + if let Err(e) = tokio::fs::write(csv_path, &csv_bytes).await { + println!("Failed to save CSV file: {e}"); + return Ok(None); + } + + Ok(Some(csv_path.to_string())) +} + + +pub fn load_isin_lei_csv() -> anyhow::Result>> { + let rt = tokio::runtime::Runtime::new(); + + let Some(path) = + (match rt { + Ok(rt) => match rt.block_on(download_isin_lei_csv()) { + Ok(Some(p)) => Some(p), + Ok(None) => { + println!("ISIN/LEI download failed; continuing with empty map"); + None + } + Err(e) => { + println!("Runtime download error: {e}"); + None + } + }, + Err(e) => { + println!("Failed to create Tokio runtime: {e}"); + None + } + } + ) else { + return Ok(HashMap::new()); + }; + + let file = match File::open(&path) { + Ok(f) => f, + Err(e) => { + println!("Cannot open CSV '{}': {e}", path); + return Ok(HashMap::new()); + } + }; + + let mut rdr = ReaderBuilder::new().from_reader(BufReader::new(file)); + let mut map: HashMap> = HashMap::new(); + + for row in rdr.records() { + let rec = match row { + Ok(r) => r, + Err(e) => { + println!("CSV parse error: {e}"); + continue; + } + }; + + if rec.len() < 2 { + continue; + } + + let lei = rec[0].to_string(); + let isin = rec[1].to_string(); + map.entry(lei).or_default().push(isin); + } + + Ok(map) } \ No newline at end of file diff --git a/src/corporate/storage.rs b/src/corporate/storage.rs index 5aa8aac..03df2d6 100644 --- a/src/corporate/storage.rs +++ b/src/corporate/storage.rs @@ -111,8 +111,8 @@ pub async fn load_companies() -> Result, anyhow::Error> { Ok(companies) } -pub fn get_company_dir(isin: &str) -> PathBuf { - PathBuf::from("corporate_prices").join(isin) +pub fn get_company_dir(lei: &str) -> PathBuf { + PathBuf::from("corporate_prices").join(lei) } pub async fn ensure_company_dirs(isin: &str) -> anyhow::Result<()> { @@ -131,13 +131,19 @@ pub async fn ensure_company_dirs(isin: &str) -> anyhow::Result<()> { } pub async fn save_company_metadata(company: &CompanyMetadata) -> anyhow::Result<()> { - let dir = get_company_dir(&company.isin); + let dir = get_company_dir(&company.lei); fs::create_dir_all(&dir).await?; let path = dir.join("metadata.json"); - fs::write(path, serde_json::to_string_pretty(company)?).await?; + fs::write(&path, serde_json::to_string_pretty(company)?).await?; Ok(()) } +pub async fn load_company_metadata(lei: &str) -> anyhow::Result { + let path = get_company_dir(lei).join("metadata.json"); + let content = fs::read_to_string(path).await?; + Ok(serde_json::from_str(&content)?) +} + pub async fn save_available_exchanges(isin: &str, exchanges: Vec) -> anyhow::Result<()> { let dir = get_company_dir(isin); fs::create_dir_all(&dir).await?; @@ -146,8 +152,8 @@ pub async fn save_available_exchanges(isin: &str, exchanges: Vec anyhow::Result> { - let path = get_company_dir(isin).join("available_exchanges.json"); +pub async fn load_available_exchanges(lei: &str) -> anyhow::Result> { + let path = get_company_dir(lei).join("available_exchanges.json"); if path.exists() { let content = fs::read_to_string(&path).await?; Ok(serde_json::from_str(&content)?) @@ -157,19 +163,17 @@ pub async fn load_available_exchanges(isin: &str) -> anyhow::Result, ) -> anyhow::Result<()> { let source_safe = source_ticker.replace(".", "_").replace("/", "_"); - let dir = get_company_dir(isin).join(timeframe).join(&source_safe); + let dir = get_company_dir(lei).join(timeframe).join(&source_safe); fs::create_dir_all(&dir).await?; let path = dir.join("prices.json"); - let mut prices = prices; prices.sort_by_key(|p| (p.date.clone(), p.time.clone())); - fs::write(&path, serde_json::to_string_pretty(&prices)?).await?; Ok(()) } diff --git a/src/corporate/types.rs b/src/corporate/types.rs index 9d3c091..0a4fe6b 100644 --- a/src/corporate/types.rs +++ b/src/corporate/types.rs @@ -49,8 +49,10 @@ pub struct TickerInfo { #[derive(Debug, Clone, Serialize, Deserialize)] pub struct CompanyMetadata { - pub isin: String, + pub lei: String, // e.g. "5493000J2N45DDNE4Y28" pub name: String, + pub isins: Vec, // All ISINs belonging to this legal entity (primary + ADR + GDR) + pub primary_isin: String, // The most liquid / preferred one (used for folder fallback) pub tickers: Vec, } diff --git a/src/corporate/update.rs b/src/corporate/update.rs index df68de3..b6e34bb 100644 --- a/src/corporate/update.rs +++ b/src/corporate/update.rs @@ -6,122 +6,137 @@ use chrono::Local; use std::collections::HashMap; pub async fn run_full_update(client: &fantoccini::Client, config: &Config) -> anyhow::Result<()> { - println!("Starting company-centric corporate update (ISIN-based)"); + println!("Starting LEI-based corporate update"); + + // 1. Download fresh GLEIF ISIN↔LEI mapping on every run + let lei_to_isins: HashMap> = match load_isin_lei_csv() { + Ok(map) => map, + Err(e) => { + println!("Warning: Failed to load ISIN↔LEI mapping: {}", e); + HashMap::new() + } + }; + //let _isin_to_lei = load_isin_to_lei()?; // optional, useful for migration scripts - let companies = load_companies().await?; let today = chrono::Local::now().format("%Y-%m-%d").to_string(); - let mut existing_events = load_existing_events().await?; - for company in companies { - println!("\nProcessing company: {} ({})", company.name, company.isin); + let companies = load_companies().await?; // Vec with lei, isins, tickers - ensure_company_dirs(&company.isin).await?; - save_company_metadata(&company).await?; + for mut company in companies { + println!("\nProcessing company: {} (LEI: {})", company.name, company.lei); - // === STEP 1: Discover all available exchanges === - let mut all_tickers = company.tickers.clone(); - - // Try to discover additional exchanges using the primary ticker - if let Some(primary_ticker) = company.tickers.iter().find(|t| t.primary) { - println!(" πŸ” Discovering additional exchanges..."); - match discover_available_exchanges(&company.isin, &primary_ticker.ticker).await { - Ok(discovered) => { - // Merge discovered tickers with existing ones - for disc in discovered { - if !all_tickers.iter().any(|t| t.ticker == disc.ticker) { - println!(" βœ“ Found new exchange: {} ({})", disc.ticker, disc.exchange_mic); - all_tickers.push(disc); - } - } + // === Enrich with ALL ISINs known to GLEIF (includes ADRs, GDRs, etc.) === + if let Some(all_isins) = lei_to_isins.get(&company.lei) { + let mut seen = company.isins.iter().cloned().collect::>(); + for isin in all_isins { + if !seen.contains(isin) { + company.isins.push(isin.clone()); + seen.insert(isin.clone()); } - Err(e) => println!(" ⚠ Discovery failed: {}", e), } } - // Update metadata with newly discovered tickers - if all_tickers.len() > company.tickers.len() { - let updated_company = CompanyMetadata { - isin: company.isin.clone(), - name: company.name.clone(), - tickers: all_tickers.clone(), - }; - save_company_metadata(&updated_company).await?; - println!(" πŸ“ Updated metadata with {} total tickers", all_tickers.len()); + // Ensure company directory exists (now uses LEI) + ensure_company_dirs(&company.lei).await?; + save_company_metadata(&company).await?; + + // === STEP 1: Discover additional exchanges using each known ISIN === + let mut all_tickers = company.tickers.clone(); + + if let Some(primary_ticker) = company.tickers.iter().find(|t| t.primary) { + println!(" Discovering additional exchanges across {} ISIN(s)...", company.isins.len()); + + for isin in &company.isins { + println!(" β†’ Checking ISIN: {}", isin); + match discover_available_exchanges(isin, &primary_ticker.ticker).await { + Ok(discovered) => { + if discovered.is_empty() { + println!(" – No new exchanges found for {}", isin); + } else { + for disc in discovered { + if !all_tickers.iter().any(|t| t.ticker == disc.ticker && t.exchange_mic == disc.exchange_mic) { + println!(" Found new listing: {} ({}) [ISIN: {}]", disc.ticker, disc.exchange_mic, isin); + all_tickers.push(disc); + } + } + } + } + Err(e) => println!(" Discovery failed for {}: {}", isin, e), + } + tokio::time::sleep(tokio::time::Duration::from_millis(300)).await; + } } - // === STEP 2: Fetch data from all available exchanges === + // Save updated metadata if we found new listings + if all_tickers.len() > company.tickers.len() { + company.tickers = all_tickers.clone(); + save_company_metadata(&company).await?; + println!(" Updated metadata: {} total tickers", all_tickers.len()); + } + + // === STEP 2: Fetch data from ALL available tickers === for ticker_info in &all_tickers { let ticker = &ticker_info.ticker; - println!(" β†’ Trying ticker: {} ({})", ticker, ticker_info.exchange_mic); + println!(" β†’ Fetching: {} ({})", ticker, ticker_info.exchange_mic); let mut daily_success = false; let mut intraday_success = false; - // Earnings (only from primary ticker to avoid duplicates) + // Earnings: only fetch from primary ticker to avoid duplicates if ticker_info.primary { if let Ok(new_events) = fetch_earnings_history(client, ticker).await { let result = process_batch(&new_events, &mut existing_events, &today); save_changes(&result.changes).await?; - println!(" βœ“ {} earnings events", new_events.len()); + println!(" Earnings events: {}", new_events.len()); } } // Daily prices - match fetch_daily_price_history(ticker, &config.corporate_start_date, &today).await { - Ok(prices) => { - if !prices.is_empty() { - save_prices_by_source(&company.isin, ticker, "daily", prices.clone()).await?; - daily_success = true; - println!(" βœ“ Saved {} daily bars ({} currency)", - prices.len(), - prices.first().map(|p| p.currency.as_str()).unwrap_or("?") - ); - } + if let Ok(prices) = fetch_daily_price_history(ticker, &config.corporate_start_date, &today).await { + if !prices.is_empty() { + save_prices_by_source(&company.lei, ticker, "daily", prices).await?; + daily_success = true; } - Err(e) => println!(" βœ— Daily fetch failed: {}", e), } - // 5-minute prices (last 60 days) + // 5-minute intraday (last 60 days) let sixty_days_ago = (chrono::Local::now() - chrono::Duration::days(60)) .format("%Y-%m-%d") .to_string(); - match fetch_price_history_5min(ticker, &sixty_days_ago, &today).await { - Ok(prices) => { - if !prices.is_empty() { - save_prices_by_source(&company.isin, ticker, "5min", prices.clone()).await?; - intraday_success = true; - println!(" βœ“ Saved {} 5min bars", prices.len()); - } + if let Ok(prices) = fetch_price_history_5min(ticker, &sixty_days_ago, &today).await { + if !prices.is_empty() { + save_prices_by_source(&company.lei, ticker, "5min", prices).await?; + intraday_success = true; } - Err(e) => println!(" βœ— 5min fetch failed: {}", e), } - // Record success in available_exchanges.json - if daily_success || intraday_success { - update_available_exchange( - &company.isin, - ticker, - &ticker_info.exchange_mic, - daily_success, - intraday_success, - ).await?; - } + // Update available_exchanges.json (now under LEI folder) + update_available_exchange( + &company.lei, + ticker, + &ticker_info.exchange_mic, + daily_success, + intraday_success, + ).await?; tokio::time::sleep(tokio::time::Duration::from_millis(800)).await; } - // === STEP 3: Aggregate prices from all sources === - println!(" πŸ“Š Aggregating multi-exchange data with FX conversion..."); - match aggregate_best_price_data(&company.isin).await { - Ok(_) => println!(" βœ“ Aggregation complete"), - Err(e) => println!(" ⚠ Aggregation warning: {}", e), + // === STEP 3: Aggregate all sources into unified USD prices === + println!(" Aggregating multi-source price data (FX-adjusted)..."); + if let Err(e) = aggregate_best_price_data(&company.lei).await { + println!(" Aggregation failed: {}", e); + } else { + println!(" Aggregation complete"); } } + // Final save of optimized earnings events save_optimized_events(existing_events).await?; - println!("\nβœ… Corporate update complete (ISIN-based)"); + println!("\nCorporate update complete (LEI-based)"); + Ok(()) } diff --git a/src/util.rs b/src/util.rs index 27e9832..6da8750 100644 --- a/src/util.rs +++ b/src/util.rs @@ -9,15 +9,14 @@ pub async fn ensure_data_dirs() -> anyhow::Result<()> { "economic_event_changes", "corporate_events", "corporate_prices", + "data", ]; - for dir in dirs { let path = Path::new(dir); if !path.exists() { - fs::create_dir_all(path).await?; + tokio::fs::create_dir_all(path).await?; println!("Created directory: {dir}"); } - // else β†’ silently continue } Ok(()) } \ No newline at end of file