// WebScraper/src/corporate/scraper.rs

// src/corporate/scraper.rs
use super::{types::*};
//use crate::corporate::openfigi::OpenFigiClient;
use crate::{scraper::webdriver::*, util::directories::DataPaths, util::logger};
use fantoccini::{Client};
use scraper::{Html, Selector};
use chrono::{DateTime, Duration, NaiveDate, Utc};
use tokio::{time::{Duration as TokioDuration, sleep}};
use reqwest::Client as HttpClient;
use serde_json::{json, Value};
use zip::ZipArchive;
use std::{collections::HashMap};
use std::io::{Read};

/// `User-Agent` value sent with the HTTP requests issued by this module.
const USER_AGENT: &str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36";

/// Reads a price out of an optional JSON value.
///
/// A JSON string is parsed after stripping `$` and `,`; if the value is not a
/// string, or the string does not parse, the value's own numeric content is
/// used instead. Anything else (including `None`) yields `0.0`.
fn parse_price(v: Option<&Value>) -> f64 {
    let value = match v {
        Some(value) => value,
        None => return 0.0,
    };

    // Textual form first: "$1,234.50" -> "1234.50".
    let from_text = value.as_str().and_then(|text| {
        let cleaned: String = text.chars().filter(|c| !matches!(c, '$' | ',')).collect();
        cleaned.parse::<f64>().ok()
    });

    match from_text {
        Some(price) => price,
        None => value.as_f64().unwrap_or(0.0),
    }
}

/// Reads a trade volume out of an optional JSON value.
///
/// A JSON string is parsed after stripping `,` separators; if the value is not
/// a string, or the string does not parse, the value's own unsigned integer
/// content is used instead. Anything else (including `None`) yields `0`.
fn parse_volume(v: Option<&Value>) -> u64 {
    let value = match v {
        Some(value) => value,
        None => return 0,
    };

    if let Some(text) = value.as_str() {
        let digits: String = text.chars().filter(|&c| c != ',').collect();
        if let Ok(volume) = digits.parse::<u64>() {
            return volume;
        }
    }

    value.as_u64().unwrap_or(0)
}

/// Fetches daily OHLCV bars for `ticker` from the Yahoo Finance chart endpoint.
///
/// `start_str` and `end_str` are `%Y-%m-%d` dates; both days are included in
/// the result. The range is requested in windows of at most 730 days, with a
/// 200 ms pause after every request. Bars are returned sorted by date with
/// duplicate dates removed, and `time` is left empty for daily bars.
///
/// Missing price or volume fields are filled with `0.0` / `0` by
/// `parse_price` / `parse_volume`; when no adjusted-close series is present
/// the plain close series is used instead. Entries whose timestamp is missing
/// or not representable are skipped.
///
/// # Errors
/// Returns an error when either date fails to parse, when a request or its
/// JSON decoding fails, or when a response carries no `timestamp` array.
pub async fn fetch_daily_price_history(
    ticker: &str,
    start_str: &str,
    end_str: &str,
) -> anyhow::Result<Vec<CompanyPrice>> {
    let start = NaiveDate::parse_from_str(start_str, "%Y-%m-%d")?;
    let last_day = NaiveDate::parse_from_str(end_str, "%Y-%m-%d")?;
    // One extra day so the requested window also covers `end_str` itself.
    let end = last_day + Duration::days(1);

    // Normalised bounds for the per-bar filter: built once, and independent of
    // how the caller happened to format the input strings.
    let first_key = start.format("%Y-%m-%d").to_string();
    let last_key = last_day.format("%Y-%m-%d").to_string();

    // A single client for all chunks so its connection pool is reused.
    let http = HttpClient::new();

    let mut all_prices = Vec::new();
    let mut current = start;

    while current < end {
        let actual_end = (current + Duration::days(730)).min(end);

        let period1 = current
            .and_hms_opt(0, 0, 0)
            .expect("midnight is a valid time of day")
            .and_utc()
            .timestamp();
        let period2 = actual_end
            .and_hms_opt(0, 0, 0)
            .expect("midnight is a valid time of day")
            .and_utc()
            .timestamp();

        println!("    Fetching {ticker} {} → {}", current, actual_end - Duration::days(1));

        let url = format!(
            "https://query1.finance.yahoo.com/v8/finance/chart/{ticker}?period1={period1}&period2={period2}&interval=1d&includeAdjustedClose=true"
        );

        let json: Value = http
            .get(&url)
            .header("User-Agent", USER_AGENT)
            .send()
            .await?
            .json()
            .await?;

        let result = &json["chart"]["result"][0];
        let timestamps = result["timestamp"]
            .as_array()
            .ok_or_else(|| anyhow::anyhow!("No timestamps"))?;
        let quote = &result["indicators"]["quote"][0];
        let currency = result["meta"]["currency"].as_str().unwrap_or("USD").to_string();

        let opens = quote["open"].as_array();
        let highs = quote["high"].as_array();
        let lows = quote["low"].as_array();
        let closes = quote["close"].as_array();
        // Fall back to the unadjusted closes when no adjusted series is present.
        let adj_closes = result["indicators"]["adjclose"][0]["adjclose"]
            .as_array()
            .or(closes);
        let volumes = quote["volume"].as_array();

        for (i, ts_val) in timestamps.iter().enumerate() {
            // Skip unusable timestamps instead of dating the bar 1970-01-01.
            let dt = match ts_val
                .as_i64()
                .and_then(|ts| DateTime::<Utc>::from_timestamp(ts, 0))
            {
                Some(dt) => dt,
                None => continue,
            };
            let date_str = dt.format("%Y-%m-%d").to_string();

            // ISO dates order lexicographically, so plain string comparison works.
            if date_str < first_key || date_str > last_key {
                continue;
            }

            let open = parse_price(opens.and_then(|a| a.get(i)));
            let high = parse_price(highs.and_then(|a| a.get(i)));
            let low = parse_price(lows.and_then(|a| a.get(i)));
            let close = parse_price(closes.and_then(|a| a.get(i)));
            let adj_close = parse_price(adj_closes.and_then(|a| a.get(i)));
            let volume = parse_volume(volumes.and_then(|a| a.get(i)));

            all_prices.push(CompanyPrice {
                ticker: ticker.to_string(),
                date: date_str,
                time: "".to_string(),
                open,
                high,
                low,
                close,
                adj_close,
                volume,
                currency: currency.clone(),
            });
        }

        sleep(TokioDuration::from_millis(200)).await;
        current = actual_end;
    }

    // Stable sort by (date, time) without cloning the key strings per comparison.
    all_prices.sort_by(|a, b| a.date.cmp(&b.date).then_with(|| a.time.cmp(&b.time)));
    all_prices.dedup_by(|a, b| a.date == b.date && a.time == b.time);

    println!("    Got {} daily bars for {ticker}", all_prices.len());
    Ok(all_prices)
}

/// Fetches 5-minute OHLCV bars for `ticker` from the Yahoo Finance chart endpoint.
///
/// The requested window is always the 60 days ending now; `_start` and `_end`
/// are accepted for signature compatibility but are not used. Bars are
/// returned sorted by date and time (both formatted in UTC). `adj_close` is
/// set to the bar's `close`, and missing price or volume fields are filled
/// with `0.0` / `0` by `parse_price` / `parse_volume`. Entries whose timestamp
/// is missing or not representable are skipped.
///
/// # Errors
/// Returns an error when the request or its JSON decoding fails, or when the
/// response carries no `timestamp` array.
pub async fn fetch_price_history_5min(
    ticker: &str,
    _start: &str,
    _end: &str,
) -> anyhow::Result<Vec<CompanyPrice>> {
    /// Length of the requested window: 60 days in seconds (5_184_000).
    const LOOKBACK_SECS: i64 = 60 * 24 * 60 * 60;

    let period2 = Utc::now().timestamp();
    let period1 = period2 - LOOKBACK_SECS;

    let url = format!(
        "https://query1.finance.yahoo.com/v8/finance/chart/{ticker}?period1={period1}&period2={period2}&interval=5m&includeAdjustedClose=true"
    );

    let json: Value = HttpClient::new()
        .get(&url)
        .header("User-Agent", USER_AGENT)
        .send()
        .await?
        .json()
        .await?;

    let result = &json["chart"]["result"][0];
    let timestamps = result["timestamp"]
        .as_array()
        .ok_or_else(|| anyhow::anyhow!("No timestamps"))?;
    let quote = &result["indicators"]["quote"][0];
    let currency = result["meta"]["currency"].as_str().unwrap_or("USD").to_string();

    // Resolve the series once instead of re-indexing the JSON for every bar.
    let opens = quote["open"].as_array();
    let highs = quote["high"].as_array();
    let lows = quote["low"].as_array();
    let closes = quote["close"].as_array();
    let volumes = quote["volume"].as_array();

    let mut prices = Vec::with_capacity(timestamps.len());

    for (i, ts_val) in timestamps.iter().enumerate() {
        // Skip unusable timestamps instead of emitting a bar dated 1970-01-01.
        let dt = match ts_val
            .as_i64()
            .and_then(|ts| DateTime::<Utc>::from_timestamp(ts, 0))
        {
            Some(dt) => dt,
            None => continue,
        };

        let close = parse_price(closes.and_then(|a| a.get(i)));

        prices.push(CompanyPrice {
            ticker: ticker.to_string(),
            date: dt.format("%Y-%m-%d").to_string(),
            time: dt.format("%H:%M:%S").to_string(),
            open: parse_price(opens.and_then(|a| a.get(i))),
            high: parse_price(highs.and_then(|a| a.get(i))),
            low: parse_price(lows.and_then(|a| a.get(i))),
            close,
            adj_close: close,
            volume: parse_volume(volumes.and_then(|a| a.get(i))),
            currency: currency.clone(),
        });
    }

    // Stable sort by (date, time) without cloning the key strings per comparison.
    prices.sort_by(|a, b| a.date.cmp(&b.date).then_with(|| a.time.cmp(&b.time)));
    Ok(prices)
}

/// Scaffold for discovering the URL of the latest ISIN↔LEI mapping file on the GLEIF site.
///
/// NOT IMPLEMENTED: the function navigates `client` to the GLEIF download page
/// and parses the returned HTML, but no link is extracted yet, so the result is
/// always an empty string. `download_isin_lei_csv` uses a fixed download URL
/// instead; this function is kept only as a starting point should scraping the
/// page become necessary.
///
/// # Errors
/// Returns an error when navigation or fetching the page source fails.
pub async fn _fetch_latest_gleif_isin_lei_mapping_url(client: &Client) -> anyhow::Result<String> {
    let url = "https://www.gleif.org/de/lei-data/lei-mapping/download-isin-to-lei-relationship-files";
    client.goto(url).await?;

    let html = client.source().await?;
    // Parsed document and row selector are prepared for the future extraction
    // step; nothing reads them yet.
    let _document = Html::parse_document(&html);
    let _row_sel = Selector::parse("table tbody tr").expect("static CSS selector is valid");

    Ok(String::new())
}

pub async fn download_isin_lei_csv() -> anyhow::Result<Option<String>> {
    let url = "https://mapping.gleif.org/api/v2/isin-lei/9315e3e3-305a-4e71-b062-46714740fa8d/download";

    let paths = DataPaths::new(".")?;
    let gleif_cache_dir = paths.cache_gleif_dir();

    if let Err(e) = std::fs::create_dir_all(&gleif_cache_dir) {
        let msg = format!("Failed to create cache/gleif directory: {}", e);
        logger::log_error(&msg).await;
        return Ok(None);
    }

    logger::log_info("Downloading ISIN/LEI mapping from GLEIF...").await;

    let client = match reqwest::Client::builder()
        .user_agent(USER_AGENT)
        .timeout(std::time::Duration::from_secs(30))
        .build()
    {
        Ok(c) => c,
        Err(e) => {
            logger::log_error(&format!("Failed to create HTTP client: {}", e)).await;
            return Ok(None);
        }
    };

    let resp = match client.get(url).send().await {
        Ok(r) if r.status().is_success() => r,
        Ok(resp) => {
            logger::log_error(&format!("Server returned HTTP {}", resp.status())).await;
            return Ok(None);
        }
        Err(e) => {
            logger::log_error(&format!("Failed to download: {}", e)).await;
            return Ok(None);
        }
    };

    let filename = resp
        .headers()
        .get("content-disposition")
        .and_then(|h| h.to_str().ok())
        .and_then(|s| s.split("filename=").nth(1).map(|f| f.trim_matches('"').to_string()))
        .unwrap_or_else(|| "isin_lei.zip".to_string());

    let parsed_filename = parse_gleif_filename(&filename);
    logger::log_info(&format!("Downloaded: {} -> {}", filename, parsed_filename)).await;

    // Extract date from filename
    let mut date_str = String::new();
    if let Some(start_idx) = parsed_filename.find("isin-lei-") {
        let rest = &parsed_filename[start_idx + 9..];
        if rest.len() >= 8 {
            date_str = rest[0..8].to_string();
        }
    }

    let date_dir = if !date_str.is_empty() {
        let p = gleif_cache_dir.join(&date_str);
        if let Err(e) = std::fs::create_dir_all(&p) {
            logger::log_warn(&format!("Failed to create date directory: {}", e)).await;
            None
        } else {
            Some(p)
        }
    } else {
        None
    };

    let target_dir = date_dir.clone().unwrap_or_else(|| gleif_cache_dir.to_path_buf());

    // Check for existing clean CSV
    if let Some(ref ddir) = date_dir {
        if let Ok(entries) = std::fs::read_dir(ddir) {
            for entry in entries.flatten() {
                if let Some(name) = entry.file_name().to_str() {
                    if name.to_lowercase().ends_with("_clean.csv") {
                        let path = ddir.join(name);
                        logger::log_info(&format!("Found existing clean CSV: {}", path.display())).await;
                        return Ok(Some(path.to_string_lossy().to_string()));
                    }
                }
            }
        }
    }

    let csv_candidate = target_dir.join(parsed_filename.replace(".zip", ".csv"));
    if csv_candidate.exists() {
        logger::log_info(&format!("Found existing CSV: {}", csv_candidate.display())).await;
        return Ok(Some(csv_candidate.to_string_lossy().to_string()));
    }

    let bytes = match resp.bytes().await {
        Ok(b) => b,
        Err(e) => {
            logger::log_error(&format!("Failed to read bytes: {}", e)).await;
            return Ok(None);
        }
    };

    let zip_path = target_dir.join(&parsed_filename);
    let csv_path = target_dir.join(parsed_filename.replace(".zip", ".csv"));

    if let Err(e) = tokio::fs::write(&zip_path, &bytes).await {
        logger::log_error(&format!("Failed to write ZIP: {}", e)).await;
        return Ok(None);
    }

    // Extract CSV from ZIP
    let archive = match std::fs::File::open(&zip_path).map(ZipArchive::new) {
        Ok(Ok(a)) => a,
        Ok(Err(e)) => {
            logger::log_error(&format!("Invalid ZIP: {}", e)).await;
            return Ok(None);
        }
        Err(e) => {
            logger::log_error(&format!("Cannot open ZIP: {}", e)).await;
            return Ok(None);
        }
    };

    let mut archive = archive;
    let idx = match (0..archive.len()).find(|&i| {
        archive.by_index(i)
            .map(|f| f.name().ends_with(".csv"))
            .unwrap_or(false)
    }) {
        Some(i) => i,
        None => {
            logger::log_error("ZIP contains no CSV").await;
            return Ok(None);
        }
    };

    let mut csv_file = match archive.by_index(idx) {
        Ok(f) => f,
        Err(e) => {
            logger::log_error(&format!("Failed to read CSV: {}", e)).await;
            return Ok(None);
        }
    };

    let mut csv_bytes = Vec::new();
    if let Err(e) = csv_file.read_to_end(&mut csv_bytes) {
        logger::log_error(&format!("Failed to extract: {}", e)).await;
        return Ok(None);
    }

    if let Err(e) = tokio::fs::write(&csv_path, &csv_bytes).await {
        logger::log_error(&format!("Failed to save CSV: {}", e)).await;
        return Ok(None);
    }

    logger::log_info(&format!("✓ CSV extracted: {:?}", csv_path)).await;
    Ok(Some(csv_path.to_string_lossy().to_string()))
}

/// Rewrites a GLEIF download name into the local `isin-lei-DDMMYYYY.<ext>` form.
///
/// When `filename` contains `isin-lei-` immediately followed by eight ASCII
/// digits (read as `YYYYMMDD`), the result is `isin-lei-` + `DDMMYYYY` + the
/// extension, where the extension is `.zip` if `filename` ends in `.zip` and
/// `.csv` otherwise. Anything before the prefix or after the date is dropped.
/// Every other input is returned unchanged.
fn parse_gleif_filename(filename: &str) -> String {
    const PREFIX: &str = "isin-lei-";
    const DATE_LEN: usize = 8;

    // Work on bytes: slicing the `str` at a fixed offset would panic when that
    // offset falls inside a multi-byte character.
    let date = filename
        .find(PREFIX)
        .map(|idx| &filename[idx + PREFIX.len()..])
        .and_then(|rest| rest.as_bytes().get(..DATE_LEN))
        .filter(|digits| digits.iter().all(u8::is_ascii_digit))
        .and_then(|digits| std::str::from_utf8(digits).ok());

    match date {
        Some(date) => {
            let (year, month_day) = date.split_at(4);
            let (month, day) = month_day.split_at(2);
            let extension = if filename.ends_with(".zip") { ".zip" } else { ".csv" };
            format!("isin-lei-{day}{month}{year}{extension}")
        }
        None => filename.to_string(),
    }
}

/// Loads the GLEIF ISIN/LEI mapping into a map from LEI to its ISINs.
///
/// The CSV is obtained through `download_isin_lei_csv` (which may reuse a
/// cached file). Column 0 of each record is taken as the LEI and column 1 as
/// the ISIN. Records with fewer than two fields or that fail to parse are
/// skipped, and a leading `LEI,ISIN` header row, if present, is not treated as
/// data.
///
/// # Returns
/// The populated map, or an empty map when the download failed or the CSV
/// could not be opened (both are reported on stdout).
///
/// # Errors
/// Propagates an error returned by `download_isin_lei_csv`.
pub async fn load_isin_lei_csv() -> anyhow::Result<HashMap<String, Vec<String>>> {
    // 1. Download + extract the CSV.
    let csv_path = match download_isin_lei_csv().await? {
        Some(p) => p,
        None => {
            println!("ISIN/LEI download failed; continuing with empty map");
            return Ok(HashMap::new());
        }
    };

    // 2. Open and parse the CSV synchronously.
    let file = match std::fs::File::open(&csv_path) {
        Ok(f) => f,
        Err(e) => {
            println!("Cannot open CSV '{}': {}", csv_path, e);
            return Ok(HashMap::new());
        }
    };

    // Headers are handled manually below, so a file without a header row
    // does not lose its first record.
    let mut rdr = csv::ReaderBuilder::new()
        .has_headers(false)
        .from_reader(std::io::BufReader::new(file));

    let mut map: HashMap<String, Vec<String>> = HashMap::new();

    for (row, result) in rdr.records().enumerate() {
        let record = match result {
            Ok(r) => r,
            Err(e) => {
                println!("CSV parse error: {}", e);
                continue;
            }
        };

        if record.len() < 2 {
            continue;
        }

        let (lei, isin) = (&record[0], &record[1]);

        // Skip the column-title row instead of storing "LEI" -> ["ISIN"].
        if row == 0
            && lei.trim_start_matches('\u{feff}').eq_ignore_ascii_case("LEI")
            && isin.eq_ignore_ascii_case("ISIN")
        {
            continue;
        }

        map.entry(lei.to_string()).or_default().push(isin.to_string());
    }

    println!("Loaded ISIN↔LEI map with {} LEIs and {} total ISINs",
        map.len(),
        map.values().map(|v| v.len()).sum::<usize>()
    );

    Ok(map)
}