// src/corporate/scraper.rs

use super::types::*;
//use crate::corporate::openfigi::OpenFigiClient;
use crate::{scraper::webdriver::*, util::directories::DataPaths, util::logger};
use fantoccini::Client;
use scraper::{Html, Selector};
use chrono::{DateTime, Duration, NaiveDate, Utc};
use tokio::time::{sleep, Duration as TokioDuration};
use reqwest::Client as HttpClient;
use serde_json::{json, Value};
use zip::ZipArchive;
use std::collections::HashMap;
use std::io::Read;

const USER_AGENT: &str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36";

/// Parse a price that Yahoo may return either as a formatted string
/// (with "$" and thousands separators) or as a raw JSON number.
fn parse_price(v: Option<&Value>) -> f64 {
    v.and_then(|x| x.as_str())
        .and_then(|s| s.replace('$', "").replace(',', "").parse::<f64>().ok())
        .or_else(|| v.and_then(|x| x.as_f64()))
        .unwrap_or(0.0)
}

/// Parse a volume that may be a comma-separated string or a raw JSON integer.
fn parse_volume(v: Option<&Value>) -> u64 {
    v.and_then(|x| x.as_str())
        .and_then(|s| s.replace(',', "").parse::<u64>().ok())
        .or_else(|| v.and_then(|x| x.as_u64()))
        .unwrap_or(0)
}
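// Quick offline checks for the permissive parsers above: Yahoo sometimes
// returns fields as formatted strings ("$1,234.50") and sometimes as raw
// JSON numbers, and both paths should agree.
#[cfg(test)]
mod parse_tests {
    use super::*;
    use serde_json::json;

    #[test]
    fn parses_strings_and_numbers() {
        assert_eq!(parse_price(Some(&json!("$1,234.50"))), 1234.5);
        assert_eq!(parse_price(Some(&json!(42.5))), 42.5);
        assert_eq!(parse_price(None), 0.0);
        assert_eq!(parse_volume(Some(&json!("1,000"))), 1_000);
        assert_eq!(parse_volume(Some(&json!(250))), 250);
    }
}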
/// Fetch daily OHLCV bars for `ticker` between `start_str` and `end_str`
/// (inclusive, "%Y-%m-%d") from Yahoo Finance's chart API.
pub async fn fetch_daily_price_history(
    ticker: &str,
    start_str: &str,
    end_str: &str,
) -> anyhow::Result<Vec<CompanyPrice>> {
    let start = NaiveDate::parse_from_str(start_str, "%Y-%m-%d")?;
    let end = NaiveDate::parse_from_str(end_str, "%Y-%m-%d")? + Duration::days(1);

    let mut all_prices = Vec::new();
    let mut current = start;

    // Request in ~2-year chunks (730 days) to stay within Yahoo's per-request range.
    while current < end {
        let chunk_end = current + Duration::days(730);
        let actual_end = chunk_end.min(end);

        let period1 = current.and_hms_opt(0, 0, 0).unwrap().and_utc().timestamp();
        let period2 = actual_end.and_hms_opt(0, 0, 0).unwrap().and_utc().timestamp();

        println!("  Fetching {ticker} {} → {}", current, actual_end - Duration::days(1));

        let url = format!(
            "https://query1.finance.yahoo.com/v8/finance/chart/{ticker}?period1={period1}&period2={period2}&interval=1d&includeAdjustedClose=true"
        );

        let json: Value = HttpClient::new()
            .get(&url)
            .header("User-Agent", USER_AGENT)
            .send()
            .await?
            .json()
            .await?;

        let result = &json["chart"]["result"][0];
        let timestamps = result["timestamp"]
            .as_array()
            .ok_or_else(|| anyhow::anyhow!("No timestamps"))?;
        let quote = &result["indicators"]["quote"][0];
        let meta = &result["meta"];
        let currency = meta["currency"].as_str().unwrap_or("USD").to_string();

        let opens = quote["open"].as_array();
        let highs = quote["high"].as_array();
        let lows = quote["low"].as_array();
        let closes = quote["close"].as_array();
        // Fall back to raw closes when Yahoo omits the adjusted-close series.
        let adj_closes = result["indicators"]["adjclose"][0]["adjclose"]
            .as_array()
            .or(closes);
        let volumes = quote["volume"].as_array();

        for (i, ts_val) in timestamps.iter().enumerate() {
            let ts = ts_val.as_i64().unwrap_or(0);
            let dt: DateTime<Utc> = DateTime::from_timestamp(ts, 0).unwrap_or_default();
            let date_str = dt.format("%Y-%m-%d").to_string();

            // ISO dates compare correctly as strings; skip bars outside the range.
            if date_str.as_str() < start_str || date_str.as_str() > end_str {
                continue;
            }

            let open = parse_price(opens.and_then(|a| a.get(i)));
            let high = parse_price(highs.and_then(|a| a.get(i)));
            let low = parse_price(lows.and_then(|a| a.get(i)));
            let close = parse_price(closes.and_then(|a| a.get(i)));
            let adj_close = parse_price(adj_closes.and_then(|a| a.get(i)));
            let volume = parse_volume(volumes.and_then(|a| a.get(i)));

            all_prices.push(CompanyPrice {
                ticker: ticker.to_string(),
                date: date_str,
                time: "".to_string(),
                open,
                high,
                low,
                close,
                adj_close,
                volume,
                currency: currency.clone(),
            });
        }

        // Throttle between chunks to avoid rate-limiting.
        sleep(TokioDuration::from_millis(200)).await;
        current = actual_end;
    }

    all_prices.sort_by_key(|p| (p.date.clone(), p.time.clone()));
    all_prices.dedup_by(|a, b| a.date == b.date && a.time == b.time);

    println!("  Got {} daily bars for {ticker}", all_prices.len());
    Ok(all_prices)
}
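// Example usage for the daily fetcher: a minimal sketch that hits the live
// Yahoo endpoint, so it is #[ignore]d by default. Assumes tokio's test macro
// is available (the crate already depends on tokio) and that AAPL traded in
// the requested window.
#[cfg(test)]
mod daily_history_tests {
    use super::*;

    #[tokio::test]
    #[ignore]
    async fn fetches_sorted_daily_bars() {
        let prices = fetch_daily_price_history("AAPL", "2022-01-01", "2023-12-31")
            .await
            .expect("fetch failed");
        assert!(!prices.is_empty());
        // Output is sorted and deduplicated by (date, time).
        assert!(prices.windows(2).all(|w| w[0].date <= w[1].date));
    }
}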
/// Fetch 5-minute OHLCV bars for `ticker`. Yahoo only serves intraday data
/// for roughly the last 60 days, so `_start`/`_end` are ignored and the
/// window is always "now minus 60 days".
pub async fn fetch_price_history_5min(
    ticker: &str,
    _start: &str,
    _end: &str,
) -> anyhow::Result<Vec<CompanyPrice>> {
    let now = Utc::now().timestamp();
    let period1 = now - 5_184_000; // 60 days in seconds
    let period2 = now;

    let url = format!(
        "https://query1.finance.yahoo.com/v8/finance/chart/{ticker}?period1={period1}&period2={period2}&interval=5m&includeAdjustedClose=true"
    );

    let json: Value = HttpClient::new()
        .get(&url)
        .header("User-Agent", USER_AGENT)
        .send()
        .await?
        .json()
        .await?;

    let result = &json["chart"]["result"][0];
    let timestamps = result["timestamp"]
        .as_array()
        .ok_or_else(|| anyhow::anyhow!("No timestamps"))?;
    let quote = &result["indicators"]["quote"][0];
    let meta = &result["meta"];
    let currency = meta["currency"].as_str().unwrap_or("USD").to_string();

    let mut prices = Vec::new();

    for (i, ts_val) in timestamps.iter().enumerate() {
        let ts = ts_val.as_i64().unwrap_or(0);
        let dt: DateTime<Utc> = DateTime::from_timestamp(ts, 0).unwrap_or_default();
        let date_str = dt.format("%Y-%m-%d").to_string();
        let time_str = dt.format("%H:%M:%S").to_string();

        let open = parse_price(quote["open"].as_array().and_then(|a| a.get(i)));
        let high = parse_price(quote["high"].as_array().and_then(|a| a.get(i)));
        let low = parse_price(quote["low"].as_array().and_then(|a| a.get(i)));
        let close = parse_price(quote["close"].as_array().and_then(|a| a.get(i)));
        let volume = parse_volume(quote["volume"].as_array().and_then(|a| a.get(i)));

        prices.push(CompanyPrice {
            ticker: ticker.to_string(),
            date: date_str,
            time: time_str,
            open,
            high,
            low,
            close,
            // Intraday bars have no adjusted-close series; reuse the raw close.
            adj_close: close,
            volume,
            currency: currency.clone(),
        });
    }

    prices.sort_by_key(|p| (p.date.clone(), p.time.clone()));
    Ok(prices)
}
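// Intraday counterpart: pulls roughly the last 60 days of 5-minute bars.
// Also a live-API sketch (#[ignore]d); the start/end arguments are ignored
// by design, so empty strings are fine here.
#[cfg(test)]
mod intraday_history_tests {
    use super::*;

    #[tokio::test]
    #[ignore]
    async fn fetches_intraday_bars_with_times() {
        let bars = fetch_price_history_5min("MSFT", "", "").await.expect("fetch failed");
        // Unlike daily bars, every intraday row carries a time component.
        assert!(bars.iter().all(|b| !b.time.is_empty()));
    }
}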
/// Fetch the URL of the latest ISIN↔LEI mapping CSV from GLEIF.
/// Overengineered: we could just use the static URL, but this shows how the
/// page would be scraped if needed. The table walk itself is left unimplemented.
pub async fn _fetch_latest_gleif_isin_lei_mapping_url(client: &Client) -> anyhow::Result<String> {
    let url = "https://www.gleif.org/de/lei-data/lei-mapping/download-isin-to-lei-relationship-files";
    client.goto(url).await?;

    let html = client.source().await?;
    let _document = Html::parse_document(&html);
    let _row_sel = Selector::parse("table tbody tr").unwrap();
    let isin_lei = "".to_string();

    Ok(isin_lei)
}
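// Sketch of the table walk the stub above leaves out: pull the first CSV link
// out of the parsed document. The CSS selectors are assumptions about the
// GLEIF page layout, not verified against the live site; the helper is
// illustrative and unused.
#[allow(dead_code)]
fn _extract_first_csv_href(document: &Html) -> Option<String> {
    let row_sel = Selector::parse("table tbody tr").ok()?;
    let link_sel = Selector::parse("a[href$='.csv']").ok()?;
    document
        .select(&row_sel)
        .filter_map(|row| row.select(&link_sel).next())
        .filter_map(|a| a.value().attr("href").map(str::to_string))
        .next()
}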
/// Download the GLEIF ISIN↔LEI mapping ZIP, extract the CSV into the cache
/// directory, and return the CSV path. Returns `Ok(None)` on any recoverable
/// failure so callers can degrade gracefully.
pub async fn download_isin_lei_csv() -> anyhow::Result<Option<String>> {
    let url = "https://mapping.gleif.org/api/v2/isin-lei/9315e3e3-305a-4e71-b062-46714740fa8d/download";

    let paths = DataPaths::new(".")?;
    let gleif_cache_dir = paths.cache_gleif_dir();

    if let Err(e) = std::fs::create_dir_all(&gleif_cache_dir) {
        let msg = format!("Failed to create cache/gleif directory: {}", e);
        logger::log_error(&msg).await;
        return Ok(None);
    }

    logger::log_info("Downloading ISIN/LEI mapping from GLEIF...").await;

    let client = match reqwest::Client::builder()
        .user_agent(USER_AGENT)
        .timeout(std::time::Duration::from_secs(30))
        .build()
    {
        Ok(c) => c,
        Err(e) => {
            logger::log_error(&format!("Failed to create HTTP client: {}", e)).await;
            return Ok(None);
        }
    };

    let resp = match client.get(url).send().await {
        Ok(r) if r.status().is_success() => r,
        Ok(resp) => {
            logger::log_error(&format!("Server returned HTTP {}", resp.status())).await;
            return Ok(None);
        }
        Err(e) => {
            logger::log_error(&format!("Failed to download: {}", e)).await;
            return Ok(None);
        }
    };

    // The server names the file via Content-Disposition; fall back to a default.
    let filename = resp
        .headers()
        .get("content-disposition")
        .and_then(|h| h.to_str().ok())
        .and_then(|s| s.split("filename=").nth(1).map(|f| f.trim_matches('"').to_string()))
        .unwrap_or_else(|| "isin_lei.zip".to_string());

    let parsed_filename = parse_gleif_filename(&filename);
    logger::log_info(&format!("Downloaded: {} -> {}", filename, parsed_filename)).await;

    // Extract the (DDMMYYYY) date from the parsed filename for a dated cache dir.
    let mut date_str = String::new();
    if let Some(start_idx) = parsed_filename.find("isin-lei-") {
        let rest = &parsed_filename[start_idx + 9..];
        if rest.len() >= 8 {
            date_str = rest[0..8].to_string();
        }
    }

    let date_dir = if !date_str.is_empty() {
        let p = gleif_cache_dir.join(&date_str);
        if let Err(e) = std::fs::create_dir_all(&p) {
            logger::log_warn(&format!("Failed to create date directory: {}", e)).await;
            None
        } else {
            Some(p)
        }
    } else {
        None
    };

    let target_dir = date_dir.clone().unwrap_or_else(|| gleif_cache_dir.to_path_buf());

    // Reuse an already-extracted clean CSV for this date, if present.
    if let Some(ref ddir) = date_dir {
        if let Ok(entries) = std::fs::read_dir(ddir) {
            for entry in entries.flatten() {
                if let Some(name) = entry.file_name().to_str() {
                    if name.to_lowercase().ends_with("_clean.csv") {
                        let path = ddir.join(name);
                        logger::log_info(&format!("Found existing clean CSV: {}", path.display())).await;
                        return Ok(Some(path.to_string_lossy().to_string()));
                    }
                }
            }
        }
    }

    let csv_candidate = target_dir.join(parsed_filename.replace(".zip", ".csv"));
    if csv_candidate.exists() {
        logger::log_info(&format!("Found existing CSV: {}", csv_candidate.display())).await;
        return Ok(Some(csv_candidate.to_string_lossy().to_string()));
    }

    let bytes = match resp.bytes().await {
        Ok(b) => b,
        Err(e) => {
            logger::log_error(&format!("Failed to read bytes: {}", e)).await;
            return Ok(None);
        }
    };

    let zip_path = target_dir.join(&parsed_filename);
    let csv_path = target_dir.join(parsed_filename.replace(".zip", ".csv"));

    if let Err(e) = tokio::fs::write(&zip_path, &bytes).await {
        logger::log_error(&format!("Failed to write ZIP: {}", e)).await;
        return Ok(None);
    }

    // Extract the first CSV member from the ZIP.
    let mut archive = match std::fs::File::open(&zip_path).map(ZipArchive::new) {
        Ok(Ok(a)) => a,
        Ok(Err(e)) => {
            logger::log_error(&format!("Invalid ZIP: {}", e)).await;
            return Ok(None);
        }
        Err(e) => {
            logger::log_error(&format!("Cannot open ZIP: {}", e)).await;
            return Ok(None);
        }
    };

    let idx = match (0..archive.len()).find(|&i| {
        archive
            .by_index(i)
            .map(|f| f.name().ends_with(".csv"))
            .unwrap_or(false)
    }) {
        Some(i) => i,
        None => {
            logger::log_error("ZIP contains no CSV").await;
            return Ok(None);
        }
    };

    let mut csv_file = match archive.by_index(idx) {
        Ok(f) => f,
        Err(e) => {
            logger::log_error(&format!("Failed to read CSV: {}", e)).await;
            return Ok(None);
        }
    };

    let mut csv_bytes = Vec::new();
    if let Err(e) = csv_file.read_to_end(&mut csv_bytes) {
        logger::log_error(&format!("Failed to extract: {}", e)).await;
        return Ok(None);
    }

    if let Err(e) = tokio::fs::write(&csv_path, &csv_bytes).await {
        logger::log_error(&format!("Failed to save CSV: {}", e)).await;
        return Ok(None);
    }

    logger::log_info(&format!("✓ CSV extracted: {:?}", csv_path)).await;
    Ok(Some(csv_path.to_string_lossy().to_string()))
}
/// Normalize a GLEIF download filename. Filenames are expected to look like
/// "isin-lei-YYYYMMDD…"; the date is rewritten to DDMMYYYY (the layout used
/// for the dated cache directories) and the extension preserved. Anything
/// unrecognized passes through unchanged.
fn parse_gleif_filename(filename: &str) -> String {
    if let Some(start_idx) = filename.find("isin-lei-") {
        let rest = &filename[start_idx + 9..];

        // Byte-wise digit check avoids slicing on a non-ASCII char boundary.
        if rest.len() >= 8 && rest.as_bytes()[..8].iter().all(|b| b.is_ascii_digit()) {
            let year = &rest[0..4];
            let month = &rest[4..6];
            let day = &rest[6..8];
            let extension = if filename.ends_with(".zip") { ".zip" } else { ".csv" };
            return format!("isin-lei-{}{}{}{}", day, month, year, extension);
        }
    }

    filename.to_string()
}
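// Fast, offline check of the filename rewrite: parse_gleif_filename assumes
// an isin-lei-YYYYMMDD prefix and flips the date to DDMMYYYY.
#[cfg(test)]
mod gleif_filename_tests {
    use super::*;

    #[test]
    fn rewrites_date_to_ddmmyyyy() {
        assert_eq!(
            parse_gleif_filename("isin-lei-20240115.zip"),
            "isin-lei-15012024.zip"
        );
        // Unrecognized names pass through unchanged.
        assert_eq!(parse_gleif_filename("other.zip"), "other.zip");
    }
}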
/// Build a LEI -> ISINs lookup from the GLEIF mapping CSV, downloading it
/// first if necessary. Falls back to an empty map on failure.
pub async fn load_isin_lei_csv() -> anyhow::Result<HashMap<String, Vec<String>>> {
    // 1. Download + extract the CSV (async).
    let csv_path = match download_isin_lei_csv().await? {
        Some(p) => p,
        None => {
            println!("ISIN/LEI download failed; continuing with empty map");
            return Ok(HashMap::new());
        }
    };

    // 2. Open and parse the CSV synchronously (fast enough; ~8M lines is fine).
    let file = match std::fs::File::open(&csv_path) {
        Ok(f) => f,
        Err(e) => {
            println!("Cannot open CSV '{}': {}", csv_path, e);
            return Ok(HashMap::new());
        }
    };

    // Headers are disabled, so a "LEI,ISIN" header row, if present, is
    // ingested as one (mostly harmless) extra record.
    let mut rdr = csv::ReaderBuilder::new()
        .has_headers(false)
        .from_reader(std::io::BufReader::new(file));

    let mut map: HashMap<String, Vec<String>> = HashMap::new();

    for result in rdr.records() {
        let record = match result {
            Ok(r) => r,
            Err(e) => {
                println!("CSV parse error: {}", e);
                continue;
            }
        };

        if record.len() < 2 {
            continue;
        }

        let lei = record[0].to_string();
        let isin = record[1].to_string();
        map.entry(lei).or_default().push(isin);
    }

    println!(
        "Loaded ISIN↔LEI map with {} LEIs and {} total ISINs",
        map.len(),
        map.values().map(|v| v.len()).sum::<usize>()
    );

    Ok(map)
}
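// End-to-end sketch: build the LEI -> ISINs lookup. #[ignore]d because it
// downloads a large mapping file on first run (cached afterwards under
// cache/gleif).
#[cfg(test)]
mod isin_lei_map_tests {
    use super::*;

    #[tokio::test]
    #[ignore]
    async fn builds_lei_to_isin_map() {
        let map = load_isin_lei_csv().await.expect("load failed");
        // On download failure the function degrades to an empty map rather
        // than erroring, so only sample-print what we got.
        for (lei, isins) in map.iter().take(3) {
            println!("{lei}: {} ISIN(s)", isins.len());
        }
    }
}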