Files
WebScraper/src/corporate/scraper.rs

414 lines
14 KiB
Rust

// src/corporate/scraper.rs
use super::{types::*};
//use crate::corporate::openfigi::OpenFigiClient;
use crate::{scraper::webdriver::*, util::directories::DataPaths, util::logger};
use fantoccini::{Client};
use scraper::{Html, Selector};
use chrono::{DateTime, Duration, NaiveDate, Utc};
use tokio::{time::{Duration as TokioDuration, sleep}};
use reqwest::Client as HttpClient;
use serde_json::{json, Value};
use zip::ZipArchive;
use std::{collections::HashMap};
use std::io::{Read};
const USER_AGENT: &str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36";
/// Converts an optional JSON value into a price.
///
/// A JSON string is parsed as `f64` after every `$` and `,` has been removed.
/// When that yields nothing (not a string, or the text does not parse), the
/// value is read as a JSON number instead. Everything else, including `None`,
/// results in `0.0`.
fn parse_price(v: Option<&Value>) -> f64 {
    let value = match v {
        Some(value) => value,
        None => return 0.0,
    };

    let from_text = value.as_str().and_then(|text| {
        let cleaned = text.replace('$', "").replace(',', "");
        cleaned.parse::<f64>().ok()
    });

    match from_text {
        Some(price) => price,
        None => value.as_f64().unwrap_or(0.0),
    }
}
/// Converts an optional JSON value into a traded volume.
///
/// A JSON string is parsed as `u64` after every `,` has been removed. When
/// that yields nothing, the value is read as an unsigned JSON integer instead.
/// Everything else, including `None`, results in `0`.
fn parse_volume(v: Option<&Value>) -> u64 {
    let value = match v {
        Some(value) => value,
        None => return 0,
    };

    if let Some(text) = value.as_str() {
        if let Ok(volume) = text.replace(',', "").parse::<u64>() {
            return volume;
        }
    }

    value.as_u64().unwrap_or(0)
}
/// Fetches daily price bars for `ticker` from the Yahoo Finance chart endpoint.
///
/// The inclusive range `start_str..=end_str` (both `%Y-%m-%d`) is requested in
/// chunks of at most 730 days, with a 200 ms pause after each chunk. Bars whose
/// UTC date falls outside the requested range are dropped. The result is sorted
/// by `(date, time)` and contains at most one bar per `(date, time)` pair.
///
/// Missing or unparsable price/volume cells become `0.0` / `0`; when the
/// response has no adjusted-close series, the plain close series is used.
/// The currency falls back to `"USD"` when the response metadata has none.
///
/// # Errors
///
/// Returns an error when either date string fails to parse, when the HTTP
/// request or JSON decoding fails, or when a chunk's response carries no
/// `timestamp` array (`"No timestamps"`).
pub async fn fetch_daily_price_history(
    ticker: &str,
    start_str: &str,
    end_str: &str,
) -> anyhow::Result<Vec<CompanyPrice>> {
    let start = NaiveDate::parse_from_str(start_str, "%Y-%m-%d")?;
    let last_day = NaiveDate::parse_from_str(end_str, "%Y-%m-%d")?;
    // Exclusive upper bound, so that `end_str` itself is part of the request.
    let end = last_day + Duration::days(1);

    // One client for all chunks instead of building a fresh one per request.
    let http = HttpClient::new();

    let mut all_prices = Vec::new();
    let mut current = start;
    while current < end {
        let chunk_end = current + Duration::days(730);
        let actual_end = chunk_end.min(end);
        let period1 = current
            .and_hms_opt(0, 0, 0)
            .expect("midnight is always a valid time")
            .and_utc()
            .timestamp();
        let period2 = actual_end
            .and_hms_opt(0, 0, 0)
            .expect("midnight is always a valid time")
            .and_utc()
            .timestamp();

        // `actual_end` is exclusive; show the last day that is really covered.
        println!(
            " Fetching {ticker} {} → {}",
            current,
            actual_end - Duration::days(1)
        );

        let url = format!(
            "https://query1.finance.yahoo.com/v8/finance/chart/{ticker}?period1={period1}&period2={period2}&interval=1d&includeAdjustedClose=true"
        );
        let json: Value = http
            .get(&url)
            .header("User-Agent", USER_AGENT)
            .send()
            .await?
            .json()
            .await?;

        let result = &json["chart"]["result"][0];
        let timestamps = result["timestamp"]
            .as_array()
            .ok_or_else(|| anyhow::anyhow!("No timestamps"))?;
        let quote = &result["indicators"]["quote"][0];
        let meta = &result["meta"];
        let currency = meta["currency"].as_str().unwrap_or("USD").to_string();

        let opens = quote["open"].as_array();
        let highs = quote["high"].as_array();
        let lows = quote["low"].as_array();
        let closes = quote["close"].as_array();
        // Fall back to the unadjusted closes when no adjusted series is present.
        let adj_closes = result["indicators"]["adjclose"][0]["adjclose"]
            .as_array()
            .or(closes);
        let volumes = quote["volume"].as_array();

        for (i, ts_val) in timestamps.iter().enumerate() {
            let ts = ts_val.as_i64().unwrap_or(0);
            let dt: DateTime<Utc> = DateTime::from_timestamp(ts, 0).unwrap_or_default();

            // Compare calendar dates rather than formatted text: no per-bar
            // allocations, and correct even if the caller's strings are not
            // zero-padded.
            let day = dt.date_naive();
            if day < start || day > last_day {
                continue;
            }
            let date_str = dt.format("%Y-%m-%d").to_string();

            let open = parse_price(opens.and_then(|a| a.get(i)));
            let high = parse_price(highs.and_then(|a| a.get(i)));
            let low = parse_price(lows.and_then(|a| a.get(i)));
            let close = parse_price(closes.and_then(|a| a.get(i)));
            let adj_close = parse_price(adj_closes.and_then(|a| a.get(i)));
            let volume = parse_volume(volumes.and_then(|a| a.get(i)));

            all_prices.push(CompanyPrice {
                ticker: ticker.to_string(),
                date: date_str,
                time: "".to_string(),
                open,
                high,
                low,
                close,
                adj_close,
                volume,
                currency: currency.clone(),
            });
        }

        sleep(TokioDuration::from_millis(200)).await;
        current = actual_end;
    }

    // Stable sort by (date, time) without cloning the key strings per comparison.
    all_prices.sort_by(|a, b| a.date.cmp(&b.date).then_with(|| a.time.cmp(&b.time)));
    // Consecutive chunks can both return the bar at their shared boundary.
    all_prices.dedup_by(|a, b| a.date == b.date && a.time == b.time);

    println!(" Got {} daily bars for {ticker}", all_prices.len());
    Ok(all_prices)
}
/// Fetches 5-minute price bars for `ticker` from the Yahoo Finance chart endpoint.
///
/// The requested window is always the 60 days preceding the call; `_start` and
/// `_end` are accepted for signature compatibility but are not used. Each bar's
/// `date` / `time` are the UTC date and time of its timestamp, and `adj_close`
/// is set to the bar's `close`. The result is sorted by `(date, time)`.
///
/// Missing or unparsable price/volume cells become `0.0` / `0`, and the
/// currency falls back to `"USD"` when the response metadata has none.
///
/// # Errors
///
/// Returns an error when the HTTP request or JSON decoding fails, or when the
/// response carries no `timestamp` array (`"No timestamps"`).
pub async fn fetch_price_history_5min(
    ticker: &str,
    _start: &str,
    _end: &str,
) -> anyhow::Result<Vec<CompanyPrice>> {
    /// Length of the requested window: 60 days, in seconds.
    const LOOKBACK_SECS: i64 = 60 * 24 * 60 * 60;

    let now = Utc::now().timestamp();
    let period1 = now - LOOKBACK_SECS;
    let period2 = now;

    let url = format!(
        "https://query1.finance.yahoo.com/v8/finance/chart/{ticker}?period1={period1}&period2={period2}&interval=5m&includeAdjustedClose=true"
    );
    let json: Value = HttpClient::new()
        .get(&url)
        .header("User-Agent", USER_AGENT)
        .send()
        .await?
        .json()
        .await?;

    let result = &json["chart"]["result"][0];
    let timestamps = result["timestamp"]
        .as_array()
        .ok_or_else(|| anyhow::anyhow!("No timestamps"))?;
    let quote = &result["indicators"]["quote"][0];
    let meta = &result["meta"];
    let currency = meta["currency"].as_str().unwrap_or("USD").to_string();

    // Resolve the per-field series once rather than once per bar.
    let opens = quote["open"].as_array();
    let highs = quote["high"].as_array();
    let lows = quote["low"].as_array();
    let closes = quote["close"].as_array();
    let volumes = quote["volume"].as_array();

    let mut prices = Vec::with_capacity(timestamps.len());
    for (i, ts_val) in timestamps.iter().enumerate() {
        let ts = ts_val.as_i64().unwrap_or(0);
        let dt: DateTime<Utc> = DateTime::from_timestamp(ts, 0).unwrap_or_default();
        let date_str = dt.format("%Y-%m-%d").to_string();
        let time_str = dt.format("%H:%M:%S").to_string();

        let open = parse_price(opens.and_then(|a| a.get(i)));
        let high = parse_price(highs.and_then(|a| a.get(i)));
        let low = parse_price(lows.and_then(|a| a.get(i)));
        let close = parse_price(closes.and_then(|a| a.get(i)));
        let volume = parse_volume(volumes.and_then(|a| a.get(i)));

        prices.push(CompanyPrice {
            ticker: ticker.to_string(),
            date: date_str,
            time: time_str,
            open,
            high,
            low,
            close,
            adj_close: close,
            volume,
            currency: currency.clone(),
        });
    }

    // Stable sort by (date, time) without cloning the key strings per comparison.
    prices.sort_by(|a, b| a.date.cmp(&b.date).then_with(|| a.time.cmp(&b.time)));
    Ok(prices)
}
/// Placeholder for discovering the URL of the latest ISIN↔LEI mapping CSV on the GLEIF site.
///
/// Navigates `client` to the GLEIF download page, reads the page source and
/// parses it, but no link is extracted yet: on success the returned string is
/// always empty. A static download URL would do the job; this exists to show
/// how the page could be scraped if that ever becomes necessary.
///
/// # Errors
///
/// Returns an error when navigation or reading the page source fails.
pub async fn _fetch_latest_gleif_isin_lei_mapping_url(client: &Client) -> anyhow::Result<String> {
    let page_url =
        "https://www.gleif.org/de/lei-data/lei-mapping/download-isin-to-lei-relationship-files";
    client.goto(page_url).await?;

    let page_source = client.source().await?;
    // Parsed document and row selector are prepared but not used yet.
    let _document = Html::parse_document(&page_source);
    let _row_sel = Selector::parse("table tbody tr").unwrap();

    Ok(String::new())
}
pub async fn download_isin_lei_csv() -> anyhow::Result<Option<String>> {
let url = "https://mapping.gleif.org/api/v2/isin-lei/9315e3e3-305a-4e71-b062-46714740fa8d/download";
let paths = DataPaths::new(".")?;
let gleif_cache_dir = paths.cache_gleif_dir();
if let Err(e) = std::fs::create_dir_all(&gleif_cache_dir) {
let msg = format!("Failed to create cache/gleif directory: {}", e);
logger::log_error(&msg).await;
return Ok(None);
}
logger::log_info("Downloading ISIN/LEI mapping from GLEIF...").await;
let client = match reqwest::Client::builder()
.user_agent(USER_AGENT)
.timeout(std::time::Duration::from_secs(30))
.build()
{
Ok(c) => c,
Err(e) => {
logger::log_error(&format!("Failed to create HTTP client: {}", e)).await;
return Ok(None);
}
};
let resp = match client.get(url).send().await {
Ok(r) if r.status().is_success() => r,
Ok(resp) => {
logger::log_error(&format!("Server returned HTTP {}", resp.status())).await;
return Ok(None);
}
Err(e) => {
logger::log_error(&format!("Failed to download: {}", e)).await;
return Ok(None);
}
};
let filename = resp
.headers()
.get("content-disposition")
.and_then(|h| h.to_str().ok())
.and_then(|s| s.split("filename=").nth(1).map(|f| f.trim_matches('"').to_string()))
.unwrap_or_else(|| "isin_lei.zip".to_string());
let parsed_filename = parse_gleif_filename(&filename);
logger::log_info(&format!("Downloaded: {} -> {}", filename, parsed_filename)).await;
// Extract date from filename
let mut date_str = String::new();
if let Some(start_idx) = parsed_filename.find("isin-lei-") {
let rest = &parsed_filename[start_idx + 9..];
if rest.len() >= 8 {
date_str = rest[0..8].to_string();
}
}
let date_dir = if !date_str.is_empty() {
let p = gleif_cache_dir.join(&date_str);
if let Err(e) = std::fs::create_dir_all(&p) {
logger::log_warn(&format!("Failed to create date directory: {}", e)).await;
None
} else {
Some(p)
}
} else {
None
};
let target_dir = date_dir.clone().unwrap_or_else(|| gleif_cache_dir.to_path_buf());
// Check for existing clean CSV
if let Some(ref ddir) = date_dir {
if let Ok(entries) = std::fs::read_dir(ddir) {
for entry in entries.flatten() {
if let Some(name) = entry.file_name().to_str() {
if name.to_lowercase().ends_with("_clean.csv") {
let path = ddir.join(name);
logger::log_info(&format!("Found existing clean CSV: {}", path.display())).await;
return Ok(Some(path.to_string_lossy().to_string()));
}
}
}
}
}
let csv_candidate = target_dir.join(parsed_filename.replace(".zip", ".csv"));
if csv_candidate.exists() {
logger::log_info(&format!("Found existing CSV: {}", csv_candidate.display())).await;
return Ok(Some(csv_candidate.to_string_lossy().to_string()));
}
let bytes = match resp.bytes().await {
Ok(b) => b,
Err(e) => {
logger::log_error(&format!("Failed to read bytes: {}", e)).await;
return Ok(None);
}
};
let zip_path = target_dir.join(&parsed_filename);
let csv_path = target_dir.join(parsed_filename.replace(".zip", ".csv"));
if let Err(e) = tokio::fs::write(&zip_path, &bytes).await {
logger::log_error(&format!("Failed to write ZIP: {}", e)).await;
return Ok(None);
}
// Extract CSV from ZIP
let archive = match std::fs::File::open(&zip_path).map(ZipArchive::new) {
Ok(Ok(a)) => a,
Ok(Err(e)) => {
logger::log_error(&format!("Invalid ZIP: {}", e)).await;
return Ok(None);
}
Err(e) => {
logger::log_error(&format!("Cannot open ZIP: {}", e)).await;
return Ok(None);
}
};
let mut archive = archive;
let idx = match (0..archive.len()).find(|&i| {
archive.by_index(i)
.map(|f| f.name().ends_with(".csv"))
.unwrap_or(false)
}) {
Some(i) => i,
None => {
logger::log_error("ZIP contains no CSV").await;
return Ok(None);
}
};
let mut csv_file = match archive.by_index(idx) {
Ok(f) => f,
Err(e) => {
logger::log_error(&format!("Failed to read CSV: {}", e)).await;
return Ok(None);
}
};
let mut csv_bytes = Vec::new();
if let Err(e) = csv_file.read_to_end(&mut csv_bytes) {
logger::log_error(&format!("Failed to extract: {}", e)).await;
return Ok(None);
}
if let Err(e) = tokio::fs::write(&csv_path, &csv_bytes).await {
logger::log_error(&format!("Failed to save CSV: {}", e)).await;
return Ok(None);
}
logger::log_info(&format!("✓ CSV extracted: {:?}", csv_path)).await;
Ok(Some(csv_path.to_string_lossy().to_string()))
}
/// Normalises a GLEIF download file name to `isin-lei-DDMMYYYY.<ext>`.
///
/// The first occurrence of `isin-lei-` must be followed by eight ASCII digits,
/// read as `YYYYMMDD`. Anything before the marker and anything between the
/// date and the extension is dropped. The extension is `.zip` when the input
/// ends in `.zip`, and `.csv` otherwise.
///
/// Inputs that do not match are returned unchanged. This includes names where
/// the marker is followed by fewer than eight bytes or by non-ASCII text, which
/// are handled with checked slicing so they cannot cause a panic.
fn parse_gleif_filename(filename: &str) -> String {
    const MARKER: &str = "isin-lei-";
    const DATE_LEN: usize = 8;

    let date_part = filename
        .find(MARKER)
        .and_then(|start_idx| filename.get(start_idx + MARKER.len()..))
        // `get` returns `None` when byte 8 is not a char boundary.
        .and_then(|rest| rest.get(..DATE_LEN))
        // ASCII digits only, so the fixed byte offsets below are always valid.
        .filter(|date| date.bytes().all(|b| b.is_ascii_digit()));

    match date_part {
        Some(date) => {
            let (year, month_and_day) = date.split_at(4);
            let (month, day) = month_and_day.split_at(2);
            let extension = if filename.ends_with(".zip") { ".zip" } else { ".csv" };
            format!("isin-lei-{}{}{}{}", day, month, year, extension)
        }
        None => filename.to_string(),
    }
}
/// Builds a map from the first CSV column to all second-column values seen with it,
/// using the file provided by `download_isin_lei_csv`.
///
/// The first column is used as the LEI key and the second as an ISIN. Every
/// row is treated as data (the reader is configured without a header row);
/// rows with fewer than two fields are skipped, and rows that fail to parse are
/// reported on stdout and skipped.
///
/// When no CSV is available or it cannot be opened, a message is printed and
/// an empty map is returned rather than an error.
///
/// # Errors
///
/// Only an `Err` from `download_isin_lei_csv` is propagated.
pub async fn load_isin_lei_csv() -> anyhow::Result<HashMap<String, Vec<String>>> {
    // Step 1: obtain the path of the extracted CSV.
    let csv_path = if let Some(path) = download_isin_lei_csv().await? {
        path
    } else {
        println!("ISIN/LEI download failed; continuing with empty map");
        return Ok(HashMap::new());
    };

    // Step 2: parse it synchronously through a buffered reader.
    let source = match std::fs::File::open(&csv_path) {
        Err(e) => {
            println!("Cannot open CSV '{}': {}", csv_path, e);
            return Ok(HashMap::new());
        }
        Ok(handle) => std::io::BufReader::new(handle),
    };
    let mut reader = csv::ReaderBuilder::new()
        .has_headers(false)
        .from_reader(source);

    let mut isins_by_lei: HashMap<String, Vec<String>> = HashMap::new();
    for parsed in reader.records() {
        match parsed {
            Err(e) => println!("CSV parse error: {}", e),
            Ok(row) => {
                // Both columns must be present; shorter rows are ignored.
                if let (Some(lei), Some(isin)) = (row.get(0), row.get(1)) {
                    isins_by_lei
                        .entry(lei.to_string())
                        .or_default()
                        .push(isin.to_string());
                }
            }
        }
    }

    let lei_count = isins_by_lei.len();
    let isin_count: usize = isins_by_lei.values().map(Vec::len).sum();
    println!(
        "Loaded ISIN↔LEI map with {} LEIs and {} total ISINs",
        lei_count, isin_count
    );

    Ok(isins_by_lei)
}