added company mapping with Yahoo tickers

2025-12-14 16:48:02 +01:00
parent 00c9d45642
commit d744769138
12 changed files with 1507 additions and 2591 deletions
--- a/src/corporate/scraper.rs
+++ b/src/corporate/scraper.rs
@@ -1,318 +1,19 @@
 // src/corporate/scraper.rs
-use super::{types::*, helpers::*, openfigi::*};
+use super::{types::*};
 //use crate::corporate::openfigi::OpenFigiClient;
 use crate::{scraper::webdriver::*, util::directories::DataPaths, util::logger};
-use fantoccini::{Client, Locator};
+use fantoccini::{Client};
 use scraper::{Html, Selector};
 use chrono::{DateTime, Duration, NaiveDate, Utc};
 use tokio::{time::{Duration as TokioDuration, sleep}};
 use reqwest::Client as HttpClient;
 use serde_json::{json, Value};
 use zip::ZipArchive;
-use std::{collections::HashMap, sync::Arc};
+use std::{collections::HashMap};
 use std::io::{Read};
-use anyhow::{anyhow, Result};

 const USER_AGENT: &str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36";

-/// Check if a ticker exists on Yahoo Finance and return core metadata.
-///
-/// This function calls the public Yahoo Finance quoteSummary endpoint and extracts:
-/// - ISIN (when available)
-/// - Company name
-/// - Exchange MIC code
-/// - Trading currency
-///
-/// It strictly filters to only accept **equity** securities.
-///
-/// # Arguments
-/// * `ticker` - The ticker symbol to validate (e.g., "AAPL", "7203.T", "BMW.DE")
-///
-/// # Returns
-/// `Ok(PrimaryInfo)` on success, `Err` if ticker doesn't exist, is not equity, or data is malformed.
-///
-/// # Errors
-/// - Ticker not found
-/// - Not an equity (ETF, bond, etc.)
-/// - Missing critical fields
-/// - Network or JSON parsing errors
-/*pub async fn check_ticker_exists(ticker: &str) -> anyhow::Result<PrimaryInfo> {
-    let url = format!(
-        "https://query1.finance.yahoo.com/v10/finance/quoteSummary/{}?modules=price%2CassetProfile",
-        ticker
-    );
-
-    let resp = match HttpClient::new()
-        .get(&url)
-        .header("User-Agent", USER_AGENT)
-        .send()
-        .await
-    {
-        Ok(resp) => resp,
-        Err(err) => {
-            return Err(anyhow::anyhow!(
-                "Failed to reach Yahoo Finance for ticker {}: {}",
-                ticker,
-                err
-            ));
-        }
-    };
-
-    if !resp.status().is_success() {
-        return Err(anyhow::anyhow!("Yahoo returned HTTP {} for ticker {}", resp.status(), ticker));
-    }
-
-    let json: Value = match resp
-        .json()
-        .await {
-            Ok(resp) => resp,
-            Err(err) => {
-                return Err(anyhow::anyhow!(
-                    "Failed to parse JSON response from Yahoo Finance {}: {}",
-                    ticker,
-                    err
-                ));
-            }
-        };
-
-    let result_array = json["quoteSummary"]["result"]
-        .as_array()
-        .ok_or_else(|| anyhow::anyhow!("Missing 'quoteSummary.result' in response"))?;
-
-    if result_array.is_empty() || result_array[0].is_null() {
-        return Err(anyhow::anyhow!("No quote data returned for ticker {}", ticker));
-    }
-
-    let quote = &result_array[0]["price"];
-    let profile = &result_array[0]["assetProfile"];
-
-    // === 1. Must be EQUITY ===
-    let quote_type = quote["quoteType"]
-        .as_str()
-        .unwrap_or("")
-        .to_ascii_uppercase();
-
-    if quote_type != "EQUITY" {
-        println!("      → Skipping {} (quoteType: {})", ticker, quote_type);
-        return Err(anyhow::anyhow!("Not an equity security: {}", quote_type));
-    }
-
-    // === 2. Extract basic info ===
-    let long_name = quote["longName"]
-        .as_str()
-        .or_else(|| quote["shortName"].as_str())
-        .unwrap_or(ticker)
-        .trim()
-        .to_string();
-
-    let currency = quote["currency"]
-        .as_str()
-        .unwrap_or("USD")
-        .to_string();
-
-    let exchange_mic = quote["exchange"]
-        .as_str()
-        .unwrap_or("")
-        .to_string();
-
-    if exchange_mic.is_empty() {
-        return Err(anyhow::anyhow!("Missing exchange MIC for ticker {}", ticker));
-    }
-
-    // === 3. Extract ISIN (from assetProfile if available) ===
-    let isin = profile["isin"]
-        .as_str()
-        .and_then(|s| if s.len() == 12 && s.chars().all(|c| c.is_ascii_alphanumeric()) { Some(s) } else { None })
-        .unwrap_or("")
-        .to_ascii_uppercase();
-
-    // === 4. Final sanity check: reject obvious debt securities ===
-    let name_upper = long_name.to_ascii_uppercase();
-    if name_upper.contains(" BOND") ||
-       name_upper.contains(" NOTE") ||
-       name_upper.contains(" DEBENTURE") ||
-       name_upper.contains(" PREFERRED") && !name_upper.contains(" STOCK") {
-        return Err(anyhow::anyhow!("Security name suggests debt instrument: {}", long_name));
-    }
-
-    println!(
-        "      → Valid equity: {} | {} | {} | ISIN: {}",
-        ticker,
-        long_name,
-        exchange_mic,
-        if isin.is_empty() { "N/A" } else { &isin }
-    );
-
-    Ok(PrimaryInfo {
-        isin,
-        name: long_name,
-        exchange_mic,
-        currency,
-    })
-}*/
-
-/// Fetches earnings events for a ticker using a dedicated ScrapeTask.
-///
-/// This function creates and executes a ScrapeTask to navigate to the Yahoo Finance earnings calendar,
-/// reject cookies, and extract the events.
-///
-/// # Arguments
-/// * `ticker` - The stock ticker symbol.
-///
-/// # Returns
-/// A vector of CompanyEvent structs on success.
-///
-/// # Errors
-/// Returns an error if the task execution fails, e.g., chromedriver spawn or navigation issues.
-pub async fn fetch_earnings_with_pool(
-    ticker: &str,
-    pool: &Arc<ChromeDriverPool>,
-) -> anyhow::Result<Vec<CompanyEvent>> {
-    let ticker = ticker.to_string();
-    let url = format!("https://finance.yahoo.com/calendar/earnings?symbol={}", ticker);
-
-    let ticker_cloned = ticker.clone();
-
-    pool.execute(url, move |client| {
-        let ticker = ticker_cloned.clone();
-        Box::pin(async move {
-            reject_yahoo_cookies(&client).await?;
-            extract_earnings_events(&client, &ticker).await
-        })
-    }).await
-}
-
-/// Extracts earnings events from the currently loaded Yahoo Finance earnings calendar page.
-///
-/// This function assumes the client is already navigated to the correct URL (e.g., 
-/// https://finance.yahoo.com/calendar/earnings?symbol={ticker}) and cookies are handled.
-///
-/// It waits for the earnings table, extracts rows, parses cells into CompanyEvent structs,
-/// and handles date parsing, float parsing, and optional fields.
-///
-/// # Arguments
-/// * `client` - The fantoccini Client with the page loaded.
-/// * `ticker` - The stock ticker symbol for the events.
-///
-/// # Returns
-/// A vector of CompanyEvent on success.
-///
-/// # Errors
-/// Returns an error if:
-/// - Table or elements not found.
-/// - Date or float parsing fails.
-/// - WebDriver operations fail.
-///
-/// # Examples
-///
-/// ```no_run
-/// use fantoccini::Client;
-/// use crate::corporate::scraper::extract_earnings;
-///
-/// #[tokio::main]
-/// async fn main() -> Result<()> {
-///     // Assume client is set up and navigated
-///     let events = extract_earnings(&client, "AAPL").await?;
-///     Ok(())
-/// }
-/// ```
-pub async fn extract_earnings_events(client: &Client, ticker: &str) -> Result<Vec<CompanyEvent>> {
-    // Wait for the table to load
-    let table = client
-        .wait()
-        .for_element(Locator::Css(r#"table[data-test="cal-table"]"#))
-        .await
-        .map_err(|e| anyhow!("Failed to find earnings table: {}", e))?;
-
-    // Find all rows in tbody
-    let rows = table
-        .find_all(Locator::Css("tbody tr"))
-        .await
-        .map_err(|e| anyhow!("Failed to find table rows: {}", e))?;
-
-    let mut events = Vec::with_capacity(rows.len());
-
-    for row in rows {
-        let cells = row
-            .find_all(Locator::Css("td"))
-            .await
-            .map_err(|e| anyhow!("Failed to find cells in row: {}", e))?;
-
-        if cells.len() < 5 {
-            continue; // Skip incomplete rows
-        }
-
-        // Extract and parse date
-        let date_str = cells[0]
-            .text()
-            .await
-            .map_err(|e| anyhow!("Failed to get date text: {}", e))?;
-        let date = parse_yahoo_date(&date_str)
-            .map_err(|e| anyhow!("Failed to parse date '{}': {}", date_str, e))?
-            .format("%Y-%m-%d")
-            .to_string();
-
-        // Extract time, replace "Time Not Supplied" with empty
-        let time = cells[1]
-            .text()
-            .await
-            .map_err(|e| anyhow!("Failed to get time text: {}", e))?
-            .replace("Time Not Supplied", "");
-
-        // Extract period
-        let period = cells[2]
-            .text()
-            .await
-            .map_err(|e| anyhow!("Failed to get period text: {}", e))?;
-
-        // Parse EPS forecast
-        let eps_forecast_str = cells[3]
-            .text()
-            .await
-            .map_err(|e| anyhow!("Failed to get EPS forecast text: {}", e))?;
-        let eps_forecast = parse_float(&eps_forecast_str);
-
-        // Parse EPS actual
-        let eps_actual_str = cells[4]
-            .text()
-            .await
-            .map_err(|e| anyhow!("Failed to get EPS actual text: {}", e))?;
-        let eps_actual = parse_float(&eps_actual_str);
-
-        // Parse surprise % if available
-        let surprise_pct = if cells.len() > 5 {
-            let surprise_str = cells[5]
-                .text()
-                .await
-                .map_err(|e| anyhow!("Failed to get surprise text: {}", e))?;
-            parse_float(&surprise_str)
-        } else {
-            None
-        };
-
-        events.push(CompanyEvent {
-            ticker: ticker.to_string(),
-            date,
-            time,
-            period,
-            eps_forecast,
-            eps_actual,
-            revenue_forecast: None,
-            revenue_actual: None,
-            surprise_pct,
-            source: "Yahoo".to_string(),
-        });
-    }
-
-    if events.is_empty() {
-        eprintln!("Warning: No earnings events extracted for ticker {}", ticker);
-    } else {
-        println!("Extracted {} earnings events for {}", events.len(), ticker);
-    }
-
-    Ok(events)
-}
-
 fn parse_price(v: Option<&Value>) -> f64 {
    v.and_then(|x| x.as_str())
        .and_then(|s| s.replace('$', "").replace(',', "").parse::<f64>().ok())
@@ -490,20 +191,17 @@ pub async fn _fetch_latest_gleif_isin_lei_mapping_url(client: &Client) -> anyhow
 pub async fn download_isin_lei_csv() -> anyhow::Result<Option<String>> {
    let url = "https://mapping.gleif.org/api/v2/isin-lei/9315e3e3-305a-4e71-b062-46714740fa8d/download";

-    // Initialize DataPaths and create cache/gleif directory
    let paths = DataPaths::new(".")?;
    let gleif_cache_dir = paths.cache_gleif_dir();
    
    if let Err(e) = std::fs::create_dir_all(&gleif_cache_dir) {
        let msg = format!("Failed to create cache/gleif directory: {}", e);
        logger::log_error(&msg).await;
-        println!("{}", msg);
        return Ok(None);
    }

-    logger::log_info("Corporate Scraper: Downloading ISIN/LEI mapping from GLEIF...").await;
+    logger::log_info("Downloading ISIN/LEI mapping from GLEIF...").await;

-    // Download ZIP and get the filename from Content-Disposition header
    let client = match reqwest::Client::builder()
        .user_agent(USER_AGENT)
        .timeout(std::time::Duration::from_secs(30))
@@ -511,9 +209,7 @@ pub async fn download_isin_lei_csv() -> anyhow::Result<Option<String>> {
    {
        Ok(c) => c,
        Err(e) => {
-            let msg = format!("Failed to create HTTP client: {}", e);
-            logger::log_error(&msg).await;
-            println!("{}", msg);
+            logger::log_error(&format!("Failed to create HTTP client: {}", e)).await;
            return Ok(None);
        }
    };
@@ -521,20 +217,15 @@ pub async fn download_isin_lei_csv() -> anyhow::Result<Option<String>> {
    let resp = match client.get(url).send().await {
        Ok(r) if r.status().is_success() => r,
        Ok(resp) => {
-            let msg = format!("Server returned HTTP {}", resp.status());
-            logger::log_error(&msg).await;
-            println!("{}", msg);
+            logger::log_error(&format!("Server returned HTTP {}", resp.status())).await;
            return Ok(None);
        }
        Err(e) => {
-            let msg = format!("Failed to download ISIN/LEI ZIP: {}", e);
-            logger::log_error(&msg).await;
-            println!("{}", msg);
+            logger::log_error(&format!("Failed to download: {}", e)).await;
            return Ok(None);
        }
    };

-    // Extract filename from Content-Disposition header or use default
    let filename = resp
        .headers()
        .get("content-disposition")
@@ -542,11 +233,10 @@ pub async fn download_isin_lei_csv() -> anyhow::Result<Option<String>> {
        .and_then(|s| s.split("filename=").nth(1).map(|f| f.trim_matches('"').to_string()))
        .unwrap_or_else(|| "isin_lei.zip".to_string());

-    // Parse timestamp from filename and convert to DDMMYYYY format
    let parsed_filename = parse_gleif_filename(&filename);
-    logger::log_info(&format!("Corporate Scraper: Downloaded file: {} -> {}", filename, parsed_filename)).await;
+    logger::log_info(&format!("Downloaded: {} -> {}", filename, parsed_filename)).await;

-    // Determine date (DDMMYYYY) from parsed filename: "isin-lei-DDMMYYYY.csv"
+    // Extract date from filename
    let mut date_str = String::new();
    if let Some(start_idx) = parsed_filename.find("isin-lei-") {
        let rest = &parsed_filename[start_idx + 9..];
@@ -555,13 +245,10 @@ pub async fn download_isin_lei_csv() -> anyhow::Result<Option<String>> {
        }
    }

-    // If we parsed a date, use/create a date folder under cache/gleif and operate inside it; otherwise use cache root.
    let date_dir = if !date_str.is_empty() {
        let p = gleif_cache_dir.join(&date_str);
-        // Ensure the date folder exists (create if necessary)
        if let Err(e) = std::fs::create_dir_all(&p) {
-            let msg = format!("Failed to create date directory {:?}: {}", p, e);
-            logger::log_warn(&msg).await;
+            logger::log_warn(&format!("Failed to create date directory: {}", e)).await;
            None
        } else {
            Some(p)
@@ -570,17 +257,16 @@ pub async fn download_isin_lei_csv() -> anyhow::Result<Option<String>> {
        None
    };

-    // Choose the directory where we'll look for existing files and where we'll save the new ones
    let target_dir = date_dir.clone().unwrap_or_else(|| gleif_cache_dir.to_path_buf());

-    // If the date folder exists (or was created), prefer any *_clean.csv inside it and return that immediately
+    // Check for existing clean CSV
    if let Some(ref ddir) = date_dir {
        if let Ok(entries) = std::fs::read_dir(ddir) {
            for entry in entries.flatten() {
                if let Some(name) = entry.file_name().to_str() {
                    if name.to_lowercase().ends_with("_clean.csv") {
                        let path = ddir.join(name);
-                        logger::log_info(&format!("Found existing clean GLEIF CSV: {}", path.display())).await;
+                        logger::log_info(&format!("Found existing clean CSV: {}", path.display())).await;
                        return Ok(Some(path.to_string_lossy().to_string()));
                    }
                }
@@ -588,71 +274,42 @@ pub async fn download_isin_lei_csv() -> anyhow::Result<Option<String>> {
        }
    }

-    // If no clean file found in the date folder (or date folder doesn't exist), check whether the csv/zip already exist in the target dir
-    let csv_candidate_name = parsed_filename.replace(".zip", ".csv");
-    let csv_candidate = target_dir.join(&csv_candidate_name);
-    let zip_candidate = target_dir.join(&parsed_filename);
-
+    let csv_candidate = target_dir.join(parsed_filename.replace(".zip", ".csv"));
    if csv_candidate.exists() {
-        logger::log_info(&format!("Found existing GLEIF CSV: {}", csv_candidate.display())).await;
+        logger::log_info(&format!("Found existing CSV: {}", csv_candidate.display())).await;
        return Ok(Some(csv_candidate.to_string_lossy().to_string()));
    }
-    if zip_candidate.exists() {
-        // If zip exists but csv does not, extract later; for now prefer returning csv path (may be created by extraction step)
-        let inferred_csv = target_dir.join(csv_candidate_name);
-        if inferred_csv.exists() {
-            logger::log_info(&format!("Found existing extracted CSV next to ZIP: {}", inferred_csv.display())).await;
-            return Ok(Some(inferred_csv.to_string_lossy().to_string()));
-        }
-        // otherwise we'll overwrite/extract into target_dir below
-    }

    let bytes = match resp.bytes().await {
        Ok(b) => b,
        Err(e) => {
-            let msg = format!("Failed to read ZIP bytes: {}", e);
-            logger::log_error(&msg).await;
-            println!("{}", msg);
+            logger::log_error(&format!("Failed to read bytes: {}", e)).await;
            return Ok(None);
        }
    };
-    // Ensure target directory exists (create if it's the date folder and was absent earlier)
-    if let Some(ref ddir) = date_dir {
-        let _ = std::fs::create_dir_all(ddir);
-    }

    let zip_path = target_dir.join(&parsed_filename);
    let csv_path = target_dir.join(parsed_filename.replace(".zip", ".csv"));

    if let Err(e) = tokio::fs::write(&zip_path, &bytes).await {
-        let msg = format!("Failed to write ZIP file: {}", e);
-        logger::log_error(&msg).await;
-        println!("{}", msg);
+        logger::log_error(&format!("Failed to write ZIP: {}", e)).await;
        return Ok(None);
    }
-    logger::log_info(&format!("Corporate Scraper: Saved ZIP to {:?}", zip_path)).await;

-    // Extract CSV
-    let archive = match std::fs::File::open(&zip_path)
-        .map(ZipArchive::new)
-    {
+    // Extract CSV from ZIP
+    let archive = match std::fs::File::open(&zip_path).map(ZipArchive::new) {
        Ok(Ok(a)) => a,
        Ok(Err(e)) => {
-            let msg = format!("Invalid ZIP: {}", e);
-            logger::log_error(&msg).await;
-            println!("{}", msg);
+            logger::log_error(&format!("Invalid ZIP: {}", e)).await;
            return Ok(None);
        }
        Err(e) => {
-            let msg = format!("Cannot open ZIP file: {}", e);
-            logger::log_error(&msg).await;
-            println!("{}", msg);
+            logger::log_error(&format!("Cannot open ZIP: {}", e)).await;
            return Ok(None);
        }
    };

    let mut archive = archive;
-
    let idx = match (0..archive.len()).find(|&i| {
        archive.by_index(i)
            .map(|f| f.name().ends_with(".csv"))
@@ -660,9 +317,7 @@ pub async fn download_isin_lei_csv() -> anyhow::Result<Option<String>> {
    }) {
        Some(i) => i,
        None => {
-            let msg = "ZIP did not contain a CSV file";
-            logger::log_error(msg).await;
-            println!("{}", msg);
+            logger::log_error("ZIP contains no CSV").await;
            return Ok(None);
        }
    };
@@ -670,43 +325,32 @@ pub async fn download_isin_lei_csv() -> anyhow::Result<Option<String>> {
    let mut csv_file = match archive.by_index(idx) {
        Ok(f) => f,
        Err(e) => {
-            let msg = format!("Failed to read CSV entry: {}", e);
-            logger::log_error(&msg).await;
-            println!("{}", msg);
+            logger::log_error(&format!("Failed to read CSV: {}", e)).await;
            return Ok(None);
        }
    };

    let mut csv_bytes = Vec::new();
    if let Err(e) = csv_file.read_to_end(&mut csv_bytes) {
-        let msg = format!("Failed to extract CSV: {}", e);
-        logger::log_error(&msg).await;
+        logger::log_error(&format!("Failed to extract: {}", e)).await;
        return Ok(None);
    }

    if let Err(e) = tokio::fs::write(&csv_path, &csv_bytes).await {
-        let msg = format!("Failed to save CSV file: {}", e);
-        logger::log_error(&msg).await;
+        logger::log_error(&format!("Failed to save CSV: {}", e)).await;
        return Ok(None);
    }

-    let msg = format!("✓ ISIN/LEI CSV extracted: {:?}", csv_path);
-    logger::log_info(&msg).await;
-    
+    logger::log_info(&format!("✓ CSV extracted: {:?}", csv_path)).await;
    Ok(Some(csv_path.to_string_lossy().to_string()))
 }

-/// Parse GLEIF filename and convert timestamp to DDMMYYYY format
-/// Example: "isin-lei-20251124T080254.csv" -> "isin-lei-24112025.csv"
 fn parse_gleif_filename(filename: &str) -> String {
-    // Try to find pattern: isin-lei-YYYYMMDDTHHMMSS.zip/csv
    if let Some(start_idx) = filename.find("isin-lei-") {
-        let rest = &filename[start_idx + 9..]; // After "isin-lei-"
+        let rest = &filename[start_idx + 9..];
        
-        // Extract the 8 digits (YYYYMMDD)
        if rest.len() >= 8 && rest[0..8].chars().all(|c| c.is_numeric()) {
            let date_part = &rest[0..8];
-            // date_part is YYYYMMDD, convert to DDMMYYYY
            if date_part.len() == 8 {
                let year = &date_part[0..4];
                let month = &date_part[4..6];
@@ -717,11 +361,9 @@ fn parse_gleif_filename(filename: &str) -> String {
        }
    }
    
-    // Fallback: return original filename if parsing fails
    filename.to_string()
 }

-
 pub async fn load_isin_lei_csv() -> anyhow::Result<HashMap<String, Vec<String>>> {
    // 1. Download + extract the CSV (this is now async)
    let csv_path = match download_isin_lei_csv().await? {
@@ -769,30 +411,4 @@ pub async fn load_isin_lei_csv() -> anyhow::Result<HashMap<String, Vec<String>>>
    );

    Ok(map)
-}
-
-pub async fn reject_yahoo_cookies(client: &Client) -> anyhow::Result<()> {
-    for _ in 0..10 {
-        let clicked: bool = client
-            .execute(
-                r#"(() => {
-                    const btn = document.querySelector('#consent-page .reject-all');
-                    if (btn) {
-                        btn.click();
-                        return true;
-                    }
-                    return false;
-                })()"#,
-                vec![],
-            )
-            .await?
-            .as_bool()
-            .unwrap_or(false);
-
-        if clicked { break; }
-        sleep(TokioDuration::from_millis(500)).await;
-    }
-
-    println!("Rejected Yahoo cookies if button existed");
-    Ok(())
 }