// src/corporate/scraper.rs
use super::types::*;
//use crate::corporate::openfigi::OpenFigiClient;
use crate::{scraper::webdriver::*, util::directories::DataPaths, util::logger};
use fantoccini::Client;
use scraper::{Html, Selector};
use chrono::{DateTime, Duration, NaiveDate, Utc};
use tokio::time::{sleep, Duration as TokioDuration};
use reqwest::Client as HttpClient;
use serde_json::Value;
use zip::ZipArchive;
use std::collections::HashMap;
use std::io::Read;

const USER_AGENT: &str =
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36";

/// Parse a price that Yahoo may return either as a string ("$1,234.50")
/// or as a bare JSON number; defaults to 0.0.
fn parse_price(v: Option<&Value>) -> f64 {
    v.and_then(|x| x.as_str())
        .and_then(|s| s.replace('$', "").replace(',', "").parse::<f64>().ok())
        .or_else(|| v.and_then(|x| x.as_f64()))
        .unwrap_or(0.0)
}

/// Parse a volume that may arrive as a comma-grouped string or a JSON
/// integer; defaults to 0.
fn parse_volume(v: Option<&Value>) -> u64 {
    v.and_then(|x| x.as_str())
        .and_then(|s| s.replace(',', "").parse::<u64>().ok())
        .or_else(|| v.and_then(|x| x.as_u64()))
        .unwrap_or(0)
}

/// Fetch daily OHLCV bars for `ticker` between `start_str` and `end_str`
/// (inclusive, "%Y-%m-%d") from Yahoo Finance, in chunks of at most 730 days.
pub async fn fetch_daily_price_history(
    ticker: &str,
    start_str: &str,
    end_str: &str,
) -> anyhow::Result<Vec<CompanyPrice>> {
    let start = NaiveDate::parse_from_str(start_str, "%Y-%m-%d")?;
    // Make the end date exclusive by adding one day.
    let end = NaiveDate::parse_from_str(end_str, "%Y-%m-%d")? + Duration::days(1);

    let http = HttpClient::new();
    let mut all_prices = Vec::new();
    let mut current = start;
    while current < end {
        let chunk_end = current + Duration::days(730);
        let actual_end = chunk_end.min(end);
        let period1 = current.and_hms_opt(0, 0, 0).unwrap().and_utc().timestamp();
        let period2 = actual_end.and_hms_opt(0, 0, 0).unwrap().and_utc().timestamp();

        println!(
            "  Fetching {ticker} {} → {}",
            current,
            actual_end - Duration::days(1)
        );
        let url = format!(
            "https://query1.finance.yahoo.com/v8/finance/chart/{ticker}?period1={period1}&period2={period2}&interval=1d&includeAdjustedClose=true"
        );
        let json: Value = http
            .get(&url)
            .header("User-Agent", USER_AGENT)
            .send()
            .await?
            .json()
            .await?;

        let result = &json["chart"]["result"][0];
        let timestamps = result["timestamp"]
            .as_array()
            .ok_or_else(|| anyhow::anyhow!("No timestamps"))?;
        let quote = &result["indicators"]["quote"][0];
        let meta = &result["meta"];
        let currency = meta["currency"].as_str().unwrap_or("USD").to_string();

        let opens = quote["open"].as_array();
        let highs = quote["high"].as_array();
        let lows = quote["low"].as_array();
        let closes = quote["close"].as_array();
        // Fall back to raw closes when Yahoo omits the adjusted series.
        let adj_closes = result["indicators"]["adjclose"][0]["adjclose"]
            .as_array()
            .or(closes);
        let volumes = quote["volume"].as_array();

        for (i, ts_val) in timestamps.iter().enumerate() {
            let ts = ts_val.as_i64().unwrap_or(0);
            let dt: DateTime<Utc> = DateTime::from_timestamp(ts, 0).unwrap_or_default();
            let date_str = dt.format("%Y-%m-%d").to_string();
            // Skip bars outside the requested (inclusive) date range.
            if date_str.as_str() < start_str || date_str.as_str() > end_str {
                continue;
            }
            let open = parse_price(opens.and_then(|a| a.get(i)));
            let high = parse_price(highs.and_then(|a| a.get(i)));
            let low = parse_price(lows.and_then(|a| a.get(i)));
            let close = parse_price(closes.and_then(|a| a.get(i)));
            let adj_close = parse_price(adj_closes.and_then(|a| a.get(i)));
            let volume = parse_volume(volumes.and_then(|a| a.get(i)));

            all_prices.push(CompanyPrice {
                ticker: ticker.to_string(),
                date: date_str,
                time: "".to_string(),
                open,
                high,
                low,
                close,
                adj_close,
                volume,
                currency: currency.clone(),
            });
        }

        sleep(TokioDuration::from_millis(200)).await;
        current = actual_end;
    }

    all_prices.sort_by_key(|p| (p.date.clone(), p.time.clone()));
    all_prices.dedup_by(|a, b| a.date == b.date && a.time == b.time);
    println!("  Got {} daily bars for {ticker}", all_prices.len());
    Ok(all_prices)
}
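// A sanity-check sketch for the parse helpers above; the sample inputs are
// illustrative, not captured Yahoo payloads.
#[cfg(test)]
mod parse_tests {
    use super::{parse_price, parse_volume};
    use serde_json::json;

    #[test]
    fn parses_string_and_numeric_forms() {
        // String forms: currency symbols and thousands separators stripped.
        assert_eq!(parse_price(Some(&json!("$1,234.50"))), 1234.5);
        assert_eq!(parse_volume(Some(&json!("1,000"))), 1_000);
        // Bare JSON numbers pass through the as_f64/as_u64 fallback.
        assert_eq!(parse_price(Some(&json!(42.5))), 42.5);
        assert_eq!(parse_volume(Some(&json!(250u64))), 250);
        // Missing values default to zero.
        assert_eq!(parse_price(None), 0.0);
        assert_eq!(parse_volume(None), 0);
    }
}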
/// Fetch 5-minute bars for `ticker` from Yahoo Finance. The date arguments
/// are ignored: the function always requests a fixed trailing 60-day window.
pub async fn fetch_price_history_5min(
    ticker: &str,
    _start: &str,
    _end: &str,
) -> anyhow::Result<Vec<CompanyPrice>> {
    let now = Utc::now().timestamp();
    let period1 = now - 5_184_000; // 60 days in seconds
    let period2 = now;
    let url = format!(
        "https://query1.finance.yahoo.com/v8/finance/chart/{ticker}?period1={period1}&period2={period2}&interval=5m&includeAdjustedClose=true"
    );
    let json: Value = HttpClient::new()
        .get(&url)
        .header("User-Agent", USER_AGENT)
        .send()
        .await?
        .json()
        .await?;

    let result = &json["chart"]["result"][0];
    let timestamps = result["timestamp"]
        .as_array()
        .ok_or_else(|| anyhow::anyhow!("No timestamps"))?;
    let quote = &result["indicators"]["quote"][0];
    let meta = &result["meta"];
    let currency = meta["currency"].as_str().unwrap_or("USD").to_string();

    let mut prices = Vec::new();
    for (i, ts_val) in timestamps.iter().enumerate() {
        let ts = ts_val.as_i64().unwrap_or(0);
        let dt: DateTime<Utc> = DateTime::from_timestamp(ts, 0).unwrap_or_default();
        let date_str = dt.format("%Y-%m-%d").to_string();
        let time_str = dt.format("%H:%M:%S").to_string();
        let open = parse_price(quote["open"].as_array().and_then(|a| a.get(i)));
        let high = parse_price(quote["high"].as_array().and_then(|a| a.get(i)));
        let low = parse_price(quote["low"].as_array().and_then(|a| a.get(i)));
        let close = parse_price(quote["close"].as_array().and_then(|a| a.get(i)));
        let volume = parse_volume(quote["volume"].as_array().and_then(|a| a.get(i)));

        prices.push(CompanyPrice {
            ticker: ticker.to_string(),
            date: date_str,
            time: time_str,
            open,
            high,
            low,
            close,
            adj_close: close, // no adjusted close at intraday resolution
            volume,
            currency: currency.clone(),
        });
    }

    prices.sort_by_key(|p| (p.date.clone(), p.time.clone()));
    Ok(prices)
}
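// Usage sketch for the two fetchers above. Assumptions: called from an
// async (tokio) context, and "AAPL" is just an illustrative ticker. Not
// wired into the binary; kept for reference.
async fn _example_fetch_prices() -> anyhow::Result<()> {
    // Daily bars for one calendar year, inclusive of both endpoints.
    let daily = fetch_daily_price_history("AAPL", "2023-01-01", "2023-12-31").await?;
    println!("daily bars: {}", daily.len());
    // 5-minute bars; the date arguments are ignored and the most recent
    // ~60-day window is fetched instead.
    let intraday = fetch_price_history_5min("AAPL", "", "").await?;
    println!("intraday bars: {}", intraday.len());
    Ok(())
}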
/// Fetch the URL of the latest ISIN↔LEI mapping CSV from GLEIF.
/// Overengineered; we could just use the static URL, but this shows how to
/// scrape the download page if needed. Currently a stub that returns "".
pub async fn _fetch_latest_gleif_isin_lei_mapping_url(client: &Client) -> anyhow::Result<String> {
    let url = "https://www.gleif.org/de/lei-data/lei-mapping/download-isin-to-lei-relationship-files";
    client.goto(url).await?;
    let html = client.source().await?;
    let _document = Html::parse_document(&html);
    let _row_sel = Selector::parse("table tbody tr").unwrap();
    let isin_lei = "".to_string();
    Ok(isin_lei)
}
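// Sketch of the extraction step the stub above leaves out. The selector is
// an assumption about the GLEIF page layout (a table of files whose rows
// contain <a> links), not verified against the live page.
fn _extract_first_mapping_href(document: &Html) -> Option<String> {
    let link_sel = Selector::parse("table tbody tr a").ok()?;
    document
        .select(&link_sel)
        .filter_map(|a| a.value().attr("href"))
        .find(|href| href.ends_with(".csv") || href.ends_with(".zip"))
        .map(|href| href.to_string())
}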
/// Download the GLEIF ISIN↔LEI mapping ZIP, cache it under a dated
/// subdirectory of cache/gleif/, extract the CSV, and return its path.
/// Returns Ok(None) on any failure (errors are logged, not propagated).
pub async fn download_isin_lei_csv() -> anyhow::Result<Option<String>> {
    let url = "https://mapping.gleif.org/api/v2/isin-lei/9315e3e3-305a-4e71-b062-46714740fa8d/download";

    let paths = DataPaths::new(".")?;
    let gleif_cache_dir = paths.cache_gleif_dir();
    if let Err(e) = std::fs::create_dir_all(&gleif_cache_dir) {
        let msg = format!("Failed to create cache/gleif directory: {}", e);
        logger::log_error(&msg).await;
        return Ok(None);
    }

    logger::log_info("Downloading ISIN/LEI mapping from GLEIF...").await;

    let client = match reqwest::Client::builder()
        .user_agent(USER_AGENT)
        .timeout(std::time::Duration::from_secs(30))
        .build()
    {
        Ok(c) => c,
        Err(e) => {
            logger::log_error(&format!("Failed to create HTTP client: {}", e)).await;
            return Ok(None);
        }
    };

    let resp = match client.get(url).send().await {
        Ok(r) if r.status().is_success() => r,
        Ok(resp) => {
            logger::log_error(&format!("Server returned HTTP {}", resp.status())).await;
            return Ok(None);
        }
        Err(e) => {
            logger::log_error(&format!("Failed to download: {}", e)).await;
            return Ok(None);
        }
    };

    // Derive the local filename from the Content-Disposition header.
    let filename = resp
        .headers()
        .get("content-disposition")
        .and_then(|h| h.to_str().ok())
        .and_then(|s| s.split("filename=").nth(1).map(|f| f.trim_matches('"').to_string()))
        .unwrap_or_else(|| "isin_lei.zip".to_string());
    let parsed_filename = parse_gleif_filename(&filename);
    logger::log_info(&format!("Downloaded: {} -> {}", filename, parsed_filename)).await;

    // Extract the date (already reordered to DDMMYYYY) from the filename.
    let mut date_str = String::new();
    if let Some(start_idx) = parsed_filename.find("isin-lei-") {
        let rest = &parsed_filename[start_idx + 9..];
        if rest.len() >= 8 {
            date_str = rest[0..8].to_string();
        }
    }

    let date_dir = if !date_str.is_empty() {
        let p = gleif_cache_dir.join(&date_str);
        if let Err(e) = std::fs::create_dir_all(&p) {
            logger::log_warn(&format!("Failed to create date directory: {}", e)).await;
            None
        } else {
            Some(p)
        }
    } else {
        None
    };
    let target_dir = date_dir.clone().unwrap_or_else(|| gleif_cache_dir.to_path_buf());

    // Check for an existing cleaned CSV from a previous run.
    if let Some(ref ddir) = date_dir {
        if let Ok(entries) = std::fs::read_dir(ddir) {
            for entry in entries.flatten() {
                if let Some(name) = entry.file_name().to_str() {
                    if name.to_lowercase().ends_with("_clean.csv") {
                        let path = ddir.join(name);
                        logger::log_info(&format!("Found existing clean CSV: {}", path.display())).await;
                        return Ok(Some(path.to_string_lossy().to_string()));
                    }
                }
            }
        }
    }

    let csv_candidate = target_dir.join(parsed_filename.replace(".zip", ".csv"));
    if csv_candidate.exists() {
        logger::log_info(&format!("Found existing CSV: {}", csv_candidate.display())).await;
        return Ok(Some(csv_candidate.to_string_lossy().to_string()));
    }

    let bytes = match resp.bytes().await {
        Ok(b) => b,
        Err(e) => {
            logger::log_error(&format!("Failed to read bytes: {}", e)).await;
            return Ok(None);
        }
    };

    let zip_path = target_dir.join(&parsed_filename);
    let csv_path = target_dir.join(parsed_filename.replace(".zip", ".csv"));
    if let Err(e) = tokio::fs::write(&zip_path, &bytes).await {
        logger::log_error(&format!("Failed to write ZIP: {}", e)).await;
        return Ok(None);
    }

    // Extract the CSV from the ZIP archive.
    let mut archive = match std::fs::File::open(&zip_path).map(ZipArchive::new) {
        Ok(Ok(a)) => a,
        Ok(Err(e)) => {
            logger::log_error(&format!("Invalid ZIP: {}", e)).await;
            return Ok(None);
        }
        Err(e) => {
            logger::log_error(&format!("Cannot open ZIP: {}", e)).await;
            return Ok(None);
        }
    };

    let idx = match (0..archive.len()).find(|&i| {
        archive
            .by_index(i)
            .map(|f| f.name().ends_with(".csv"))
            .unwrap_or(false)
    }) {
        Some(i) => i,
        None => {
            logger::log_error("ZIP contains no CSV").await;
            return Ok(None);
        }
    };

    let mut csv_file = match archive.by_index(idx) {
        Ok(f) => f,
        Err(e) => {
            logger::log_error(&format!("Failed to read CSV: {}", e)).await;
            return Ok(None);
        }
    };
    let mut csv_bytes = Vec::new();
    if let Err(e) = csv_file.read_to_end(&mut csv_bytes) {
        logger::log_error(&format!("Failed to extract: {}", e)).await;
        return Ok(None);
    }
    if let Err(e) = tokio::fs::write(&csv_path, &csv_bytes).await {
        logger::log_error(&format!("Failed to save CSV: {}", e)).await;
        return Ok(None);
    }

    logger::log_info(&format!("✓ CSV extracted: {:?}", csv_path)).await;
    Ok(Some(csv_path.to_string_lossy().to_string()))
}

/// Normalize a GLEIF mapping filename: "isin-lei-YYYYMMDD.*" becomes
/// "isin-lei-DDMMYYYY.*"; anything else is returned unchanged.
fn parse_gleif_filename(filename: &str) -> String {
    if let Some(start_idx) = filename.find("isin-lei-") {
        let rest = &filename[start_idx + 9..]; // 9 = "isin-lei-".len()
        if rest.len() >= 8 && rest[0..8].chars().all(|c| c.is_ascii_digit()) {
            let date_part = &rest[0..8];
            let year = &date_part[0..4];
            let month = &date_part[4..6];
            let day = &date_part[6..8];
            let extension = if filename.ends_with(".zip") { ".zip" } else { ".csv" };
            return format!("isin-lei-{}{}{}{}", day, month, year, extension);
        }
    }
    filename.to_string()
}
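// Sketch of a unit test pinning down the deliberate DDMMYYYY reordering in
// `parse_gleif_filename`; the sample filename is an assumption about GLEIF's
// naming scheme, not a captured real file name.
#[cfg(test)]
mod gleif_filename_tests {
    use super::parse_gleif_filename;

    #[test]
    fn reorders_date_and_keeps_extension() {
        assert_eq!(
            parse_gleif_filename("isin-lei-20240102.zip"),
            "isin-lei-02012024.zip"
        );
        // Filenames without the expected pattern pass through unchanged.
        assert_eq!(parse_gleif_filename("other.csv"), "other.csv");
    }
}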
/// Download (if needed) and parse the ISIN↔LEI CSV into a LEI → ISINs map.
pub async fn load_isin_lei_csv() -> anyhow::Result<HashMap<String, Vec<String>>> {
    // 1. Download + extract the CSV (this is now async).
    let csv_path = match download_isin_lei_csv().await? {
        Some(p) => p,
        None => {
            println!("ISIN/LEI download failed; continuing with empty map");
            return Ok(HashMap::new());
        }
    };

    // 2. Open and parse the CSV synchronously (fast enough, ~8M lines is fine).
    let file = match std::fs::File::open(&csv_path) {
        Ok(f) => f,
        Err(e) => {
            println!("Cannot open CSV '{}': {}", csv_path, e);
            return Ok(HashMap::new());
        }
    };
    let mut rdr = csv::ReaderBuilder::new()
        .has_headers(false)
        .from_reader(std::io::BufReader::new(file));

    let mut map: HashMap<String, Vec<String>> = HashMap::new();
    for result in rdr.records() {
        let record = match result {
            Ok(r) => r,
            Err(e) => {
                println!("CSV parse error: {}", e);
                continue;
            }
        };
        if record.len() < 2 {
            continue;
        }
        // Column 0 is the LEI, column 1 the ISIN; one LEI maps to many ISINs.
        let lei = record[0].to_string();
        let isin = record[1].to_string();
        map.entry(lei).or_default().push(isin);
    }

    println!(
        "Loaded ISIN↔LEI map with {} LEIs and {} total ISINs",
        map.len(),
        map.values().map(|v| v.len()).sum::<usize>()
    );
    Ok(map)
}
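// Usage sketch for the loader above. Assumptions: an async (tokio) caller,
// and the LEI key below is purely illustrative, not a real lookup target.
async fn _example_lei_lookup() -> anyhow::Result<()> {
    let isin_map = load_isin_lei_csv().await?;
    // Look up all ISINs registered under a single (hypothetical) LEI.
    if let Some(isins) = isin_map.get("5493001KJTIIGC8Y1R12") {
        println!("ISINs for LEI: {:?}", isins);
    }
    Ok(())
}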