add CompanyInfo mapping keyed by company name

2025-12-04 13:33:32 +01:00
parent 95fd9ca141
commit ef2393ab70
13 changed files with 965 additions and 696 deletions

View File

@@ -8,5 +8,4 @@ pub mod aggregation;
pub mod fx;
pub mod openfigi;
pub use types::*;
pub use update::run_full_update;

View File

@@ -287,84 +287,6 @@ pub async fn build_lei_to_figi_infos(lei_to_isins: &HashMap<String, Vec<String>>
Ok(lei_to_figis)
}
/// Loads or builds the LEI-to-FigiInfo map, filtering for equities via OpenFIGI.
///
/// Attempts to load from "data/lei_to_figi.jsonl" (JSON Lines format, one LEI entry per line).
/// For any missing LEIs (compared to `lei_to_isins`), fetches their FigiInfos and appends
/// to the .jsonl file incrementally. This allows resumption after interruptions: on restart,
/// already processed LEIs are skipped, and only missing ones are fetched.
///
/// If no API key is present, skips building and returns the loaded map (possibly partial).
///
/// # Arguments
///
/// * `lei_to_isins` - HashMap of LEI to Vec<ISIN> (used for building missing entries).
///
/// # Returns
///
/// The complete (or partial if interrupted) HashMap<LEI, Vec<FigiInfo>>.
///
/// # Errors
///
/// Returns an error if file I/O fails, JSON serialization/deserialization fails,
/// or if OpenFIGI queries fail during building.
pub async fn load_or_build_lei_to_figi_infos(lei_to_isins: &HashMap<String, Vec<String>>) -> anyhow::Result<HashMap<String, Vec<FigiInfo>>> {
let data_dir = Path::new("data");
tokio_fs::create_dir_all(data_dir).await.context("Failed to create data directory")?;
let path = data_dir.join("lei_to_figi.jsonl");
let mut lei_to_figis: HashMap<String, Vec<FigiInfo>> = load_lei_to_figi_jsonl(&path)?;
let client = OpenFigiClient::new()?;
if !client.has_key {
println!("No API key—using partial LEI→FIGI map with {} entries", lei_to_figis.len());
return Ok(lei_to_figis);
}
// Sort LEIs for deterministic processing order
let mut leis: Vec<_> = lei_to_isins.keys().cloned().collect();
leis.sort();
let mut processed = lei_to_figis.len();
let total = leis.len();
for lei in leis {
if lei_to_figis.contains_key(&lei) {
continue; // Skip already processed
}
let isins = match lei_to_isins.get(&lei) {
Some(i) => i,
None => continue,
};
let unique_isins: Vec<_> = isins.iter().cloned().collect::<HashSet<_>>().into_iter().collect();
let equity_figi_infos = client.map_isins_to_figi_infos(&unique_isins).await?;
let mut figis = equity_figi_infos;
if !figis.is_empty() {
figis.sort_by_key(|f| f.figi.clone());
figis.dedup_by_key(|f| f.figi.clone());
}
// Append to .jsonl
append_lei_to_figi_jsonl(&path, &lei, &figis)?;
// Insert into in-memory map (optional, but useful for return value)
lei_to_figis.insert(lei.clone(), figis);
processed += 1;
if processed % 100 == 0 {
println!("Processed {}/{} LEIs → {} total equity FIGIs", processed, total, lei_to_figis.values().map(|v| v.len()).sum::<usize>());
}
tokio::time::sleep(Duration::from_millis(100)).await;
}
println!("Completed LEI→FIGI map: {} mappings (equity-only)", lei_to_figis.len());
Ok(lei_to_figis)
}
/// Loads LEI-to-FigiInfo map from a JSON Lines file.
///
/// Each line is expected to be a JSON object: {"lei": "ABC", "figis": [FigiInfo...]}
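For illustration, a single cache line in that format (all values invented; the exact FigiInfo field names are an assumption based on how the fields are used elsewhere in this file):

```json
{"lei": "EXAMPLELEI0000000001", "figis": [{"figi": "BBG000000001", "isin": "US0000000000", "name": "Example Corp", "ticker": "EXM", "mic_code": "XNAS"}]}
```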
@@ -436,60 +358,396 @@ fn append_lei_to_figi_jsonl(path: &Path, lei: &str, figis: &[FigiInfo]) -> anyho
Ok(())
}
/// Loads or builds a HashMap of CompanyInfo objects indexed by company name.
///
/// This function:
/// 1. Attempts to load existing companies from cache
/// 2. If cache exists, updates/extends it with new data from figi_to_lei
/// 3. If no cache exists, creates a new HashMap from scratch
/// 4. Saves the result back to cache
///
/// For existing entries (matched by name):
/// - Merges securities lists (deduplicates by FIGI)
/// - Updates primary_isin if the existing one is empty or not in the securities list
///
/// For new entries:
/// - Adds them to the HashMap
///
/// Companies with no FigiInfo data are skipped.
/// The resulting HashMap is saved to `data/companies_by_name/companies.json`.
///
/// # Arguments
///
/// * `figi_to_lei` - HashMap mapping LEI to Vec<FigiInfo>.
///
/// # Returns
///
/// A HashMap mapping company name to CompanyInfo.
///
/// # Errors
/// Returns an error if file I/O fails or JSON serialization fails.
pub async fn load_or_build_companies_by_name(
figi_to_lei: &HashMap<String, Vec<FigiInfo>>
) -> anyhow::Result<HashMap<String, CompanyInfo>> {
// Try to load existing cache
let mut companies_by_name = match load_companies_by_name_internal().await? {
Some(existing) => {
println!("Loaded {} existing companies from cache", existing.len());
existing
},
None => {
println!("No existing cache found, creating new companies HashMap");
HashMap::new()
}
};
let initial_count = companies_by_name.len();
let mut added_count = 0;
let mut updated_count = 0;
println!("Processing {} LEI entries from FIGI data...", figi_to_lei.len());
for (lei, figi_infos) in figi_to_lei.iter() {
// Skip entries with no FigiInfo data
if figi_infos.is_empty() {
continue;
}
// Get company name from first FigiInfo entry
let name = figi_infos[0].name.clone();
if name.is_empty() {
continue;
}
// Check if company already exists
if let Some(existing_company) = companies_by_name.get_mut(&name) {
// Update existing entry
let merged_securities = merge_securities(&existing_company.securities, figi_infos);
let securities_added = merged_securities.len() - existing_company.securities.len();
if securities_added > 0 {
existing_company.securities = merged_securities;
// Update primary_isin if needed
if existing_company.primary_isin.is_empty() ||
!existing_company.securities.iter().any(|s| s.isin == existing_company.primary_isin) {
existing_company.primary_isin = existing_company.securities[0].isin.clone();
}
updated_count += 1;
}
} else {
// Add new entry
let primary_isin = figi_infos[0].isin.clone();
let securities = figi_infos.clone();
let company_info = CompanyInfo {
name: name.clone(),
primary_isin,
securities,
};
companies_by_name.insert(name, company_info);
added_count += 1;
}
}
println!(" Companies statistics:");
println!(" - Initial: {}", initial_count);
println!(" - Added: {}", added_count);
println!(" - Updated: {}", updated_count);
println!(" - Total: {}", companies_by_name.len());
// Save to JSON
save_companies_by_name(&companies_by_name).await?;
Ok(companies_by_name)
}
/// Merges two lists of FigiInfo, deduplicating by FIGI.
///
/// # Arguments
/// * `existing` - Existing securities list
/// * `new_securities` - New securities to merge
///
/// # Returns
/// Merged and deduplicated list of FigiInfo
fn merge_securities(existing: &[FigiInfo], new_securities: &[FigiInfo]) -> Vec<FigiInfo> {
let mut merged = existing.to_vec();
let existing_figis: HashSet<String> = existing.iter()
.map(|f| f.figi.clone())
.collect();
for new_sec in new_securities {
if !existing_figis.contains(&new_sec.figi) {
merged.push(new_sec.clone());
}
}
// Sort by FIGI for consistency
merged.sort_by(|a, b| a.figi.cmp(&b.figi));
merged
}
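To make the merge semantics concrete, here is a self-contained sketch with a stripped-down stand-in for FigiInfo (the hypothetical `Fi` struct below; the real type has more fields): existing entries are kept, unseen FIGIs are appended, and the result is re-sorted by FIGI.

```rust
use std::collections::HashSet;

// Hypothetical minimal stand-in for FigiInfo, for illustration only.
#[derive(Clone, Debug, PartialEq)]
struct Fi { figi: String }

// Same shape as merge_securities above: keep existing, append unseen, sort.
fn merge(existing: &[Fi], new: &[Fi]) -> Vec<Fi> {
    let mut merged = existing.to_vec();
    let seen: HashSet<String> = existing.iter().map(|f| f.figi.clone()).collect();
    for n in new {
        if !seen.contains(&n.figi) {
            merged.push(n.clone());
        }
    }
    merged.sort_by(|a, b| a.figi.cmp(&b.figi));
    merged
}

fn main() {
    let fi = |s: &str| Fi { figi: s.into() };
    let merged = merge(&[fi("BBG2"), fi("BBG1")], &[fi("BBG1"), fi("BBG3")]);
    // The duplicate BBG1 is dropped; the output is sorted: BBG1, BBG2, BBG3.
    assert_eq!(merged, vec![fi("BBG1"), fi("BBG2"), fi("BBG3")]);
}
```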
/// Internal function to load the companies HashMap from cache.
///
/// # Returns
/// Some(HashMap) if the cache file exists and is valid, None otherwise.
///
/// # Errors
/// Returns an error if file I/O fails or JSON parsing fails.
async fn load_companies_by_name_internal() -> anyhow::Result<Option<HashMap<String, CompanyInfo>>> {
let cache_file = Path::new("data/companies_by_name/companies.json");
if !cache_file.exists() {
return Ok(None);
}
let content = tokio_fs::read_to_string(cache_file).await
.context("Failed to read companies.json")?;
let companies: HashMap<String, CompanyInfo> = serde_json::from_str(&content)
.context("Failed to parse companies.json")?;
Ok(Some(companies))
}
/// Saves the companies HashMap to cache.
///
/// # Arguments
/// * `companies` - The companies HashMap to save
///
/// # Errors
/// Returns an error if file I/O fails or JSON serialization fails.
async fn save_companies_by_name(companies: &HashMap<String, CompanyInfo>) -> anyhow::Result<()> {
let cache_dir = Path::new("data/companies_by_name");
tokio_fs::create_dir_all(cache_dir).await
.context("Failed to create data/companies_by_name directory")?;
let cache_file = cache_dir.join("companies.json");
let json_str = serde_json::to_string_pretty(&companies)
.context("Failed to serialize companies to JSON")?;
tokio_fs::write(&cache_file, json_str).await
.context("Failed to write companies.json")?;
println!(" ✓ Saved {} companies to {}", companies.len(), cache_file.display());
Ok(())
}
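For orientation, the saved companies.json is an object keyed by company name (the HashMap is serialized directly with to_string_pretty); a sketch of its shape with invented values, where the nested FigiInfo field names are assumed from their usage above:

```json
{
  "Example Corp": {
    "name": "Example Corp",
    "primary_isin": "US0000000000",
    "securities": [
      {
        "figi": "BBG000000001",
        "isin": "US0000000000",
        "name": "Example Corp",
        "ticker": "EXM",
        "mic_code": "XNAS"
      }
    ]
  }
}
```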
/// Loads all OpenFIGI mapping value lists (marketSecDes, micCode, securityType).
///
/// This function fetches the available values for each mapping parameter from the OpenFIGI API
/// and caches them as JSON files in `data/openfigi/`. If the files already exist and are recent
/// (less than 30 days old), they are reused instead of re-fetching.
///
/// # Returns
/// Ok(()) on success.
///
/// # Errors
/// Returns an error if API requests fail, JSON parsing fails, or file I/O fails.
pub async fn load_figi_type_lists() -> anyhow::Result<()> {
println!("Loading OpenFIGI mapping value lists...");
let client = OpenFigiClient::new()?;
// Create cache directory
let cache_dir = Path::new("data/openfigi");
tokio_fs::create_dir_all(cache_dir).await
.context("Failed to create data/openfigi directory")?;
// Fetch each type list
get_figi_market_sec_des(&client, cache_dir).await?;
get_figi_mic_code(&client, cache_dir).await?;
get_figi_security_type(&client, cache_dir).await?;
println!("OpenFIGI mapping value lists loaded successfully");
Ok(())
}
println!("Built and cached {} LEI-keyed companies.", companies.len());
Ok(companies)
/// Fetches and caches the list of valid marketSecDes values.
///
/// # Arguments
/// * `client` - The OpenFIGI client instance.
/// * `cache_dir` - Directory to save the cached JSON file.
///
/// # Returns
/// Ok(()) on success.
///
/// # Errors
/// Returns an error if the API request fails or file I/O fails.
async fn get_figi_market_sec_des(client: &OpenFigiClient, cache_dir: &Path) -> anyhow::Result<()> {
let cache_file = cache_dir.join("marketSecDes.json");
// Check if cache exists and is recent (< 30 days old)
if should_use_cache(&cache_file).await? {
println!(" Using cached marketSecDes values");
return Ok(());
}
println!(" Fetching marketSecDes values from OpenFIGI API...");
let resp = client.client
.get("https://api.openfigi.com/v3/mapping/values/marketSecDes")
.send()
.await
.context("Failed to fetch marketSecDes values")?;
handle_rate_limit(&resp).await?;
let values: Value = resp.json().await
.context("Failed to parse marketSecDes response")?;
// Save to cache
let json_str = serde_json::to_string_pretty(&values)?;
tokio_fs::write(&cache_file, json_str).await
.context("Failed to write marketSecDes cache")?;
println!(" ✓ Cached marketSecDes values");
// Respect rate limits
sleep(Duration::from_millis(if client.has_key { 240 } else { 2400 })).await;
Ok(())
}
/// Fetches and caches the list of valid micCode values.
///
/// # Arguments
/// * `client` - The OpenFIGI client instance.
/// * `cache_dir` - Directory to save the cached JSON file.
///
/// # Returns
/// Ok(()) on success.
///
/// # Errors
/// Returns an error if the API request fails or file I/O fails.
async fn get_figi_mic_code(client: &OpenFigiClient, cache_dir: &Path) -> anyhow::Result<()> {
let cache_file = cache_dir.join("micCode.json");
if should_use_cache(&cache_file).await? {
println!(" Using cached micCode values");
return Ok(());
}
println!(" Fetching micCode values from OpenFIGI API...");
let resp = client.client
.get("https://api.openfigi.com/v3/mapping/values/micCode")
.send()
.await
.context("Failed to fetch micCode values")?;
handle_rate_limit(&resp).await?;
let values: Value = resp.json().await
.context("Failed to parse micCode response")?;
let json_str = serde_json::to_string_pretty(&values)?;
tokio_fs::write(&cache_file, json_str).await
.context("Failed to write micCode cache")?;
println!(" ✓ Cached micCode values");
sleep(Duration::from_millis(if client.has_key { 240 } else { 2400 })).await;
Ok(())
}
/// Fetches and caches the list of valid securityType values.
///
/// # Arguments
/// * `client` - The OpenFIGI client instance.
/// * `cache_dir` - Directory to save the cached JSON file.
///
/// # Returns
/// Ok(()) on success.
///
/// # Errors
/// Returns an error if the API request fails or file I/O fails.
async fn get_figi_security_type(client: &OpenFigiClient, cache_dir: &Path) -> anyhow::Result<()> {
let cache_file = cache_dir.join("securityType.json");
if should_use_cache(&cache_file).await? {
println!(" Using cached securityType values");
return Ok(());
}
println!(" Fetching securityType values from OpenFIGI API...");
let resp = client.client
.get("https://api.openfigi.com/v3/mapping/values/securityType")
.send()
.await
.context("Failed to fetch securityType values")?;
handle_rate_limit(&resp).await?;
let values: Value = resp.json().await
.context("Failed to parse securityType response")?;
let json_str = serde_json::to_string_pretty(&values)?;
tokio_fs::write(&cache_file, json_str).await
.context("Failed to write securityType cache")?;
println!(" ✓ Cached securityType values");
sleep(Duration::from_millis(if client.has_key { 240 } else { 2400 })).await;
Ok(())
}
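get_figi_market_sec_des, get_figi_mic_code, and get_figi_security_type differ only in the endpoint suffix and cache file name. A possible consolidation, sketched under the assumption that OpenFigiClient exposes `client` and `has_key` as used above (hypothetical helper, not part of this commit; `anyhow::Context` is assumed in scope, as the module already uses `.context`):

```rust
// Hypothetical refactor: one helper covering all three OpenFIGI value lists.
async fn get_figi_value_list(client: &OpenFigiClient, cache_dir: &Path, kind: &str) -> anyhow::Result<()> {
    let cache_file = cache_dir.join(format!("{}.json", kind));
    if should_use_cache(&cache_file).await? {
        println!("  Using cached {} values", kind);
        return Ok(());
    }
    println!("  Fetching {} values from OpenFIGI API...", kind);
    let resp = client.client
        .get(format!("https://api.openfigi.com/v3/mapping/values/{}", kind))
        .send()
        .await
        .with_context(|| format!("Failed to fetch {} values", kind))?;
    handle_rate_limit(&resp).await?;
    let values: Value = resp.json().await
        .with_context(|| format!("Failed to parse {} response", kind))?;
    tokio_fs::write(&cache_file, serde_json::to_string_pretty(&values)?).await
        .with_context(|| format!("Failed to write {} cache", kind))?;
    println!("  ✓ Cached {} values", kind);
    // Respect rate limits, as in the per-endpoint functions above.
    sleep(Duration::from_millis(if client.has_key { 240 } else { 2400 })).await;
    Ok(())
}
```

Each existing fetcher would then reduce to a one-liner, e.g. `get_figi_value_list(&client, cache_dir, "micCode").await`.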
/// Checks if a cache file exists and is less than 30 days old.
///
/// # Arguments
/// * `path` - Path to the cache file.
///
/// # Returns
/// True if the cache should be used, false if it needs refreshing.
async fn should_use_cache(path: &Path) -> anyhow::Result<bool> {
if !path.exists() {
return Ok(false);
}
let metadata = tokio_fs::metadata(path).await?;
let modified = metadata.modified()?;
let age = modified.elapsed().unwrap_or(std::time::Duration::from_secs(u64::MAX));
// Cache is valid for 30 days
Ok(age < std::time::Duration::from_secs(30 * 24 * 60 * 60))
}
/// Handles rate limit responses from the OpenFIGI API.
///
/// If a 429 status is received, this function sleeps for the duration given in the
/// `ratelimit-reset` header (minimum 10 seconds), then returns an error so the caller can retry.
///
/// # Arguments
/// * `resp` - The HTTP response to check.
///
/// # Returns
/// Ok(()) if the response is not rate limited.
///
/// # Errors
/// Returns an error after sleeping through a 429 reset period (so the caller can retry),
/// or if the response status indicates any other client or server error.
async fn handle_rate_limit(resp: &reqwest::Response) -> anyhow::Result<()> {
let status = resp.status();
if status == 429 {
let headers = resp.headers();
let reset_sec = headers
.get("ratelimit-reset")
.and_then(|v| v.to_str().ok())
.and_then(|s| s.parse::<u64>().ok())
.unwrap_or(10);
println!(" Rate limited—waiting {}s", reset_sec);
sleep(std::time::Duration::from_secs(reset_sec.max(10))).await;
return Err(anyhow!("Rate limited, please retry"));
} else if status.is_client_error() || status.is_server_error() {
return Err(anyhow!("OpenFIGI API error: {}", status));
}
Ok(())
}
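Note the contract here: on a 429, handle_rate_limit sleeps through the reset window and then still returns Err, so the caller is expected to retry. A minimal caller-side retry loop under that contract (hypothetical, not in this commit):

```rust
use anyhow::{anyhow, Context};

// Hypothetical retry wrapper: handle_rate_limit has already slept through
// the 429 reset window before returning Err, so we can re-issue immediately.
async fn get_json_with_retry(
    client: &reqwest::Client,
    url: &str,
    max_attempts: usize,
) -> anyhow::Result<serde_json::Value> {
    let mut last_err = anyhow!("no attempts made");
    for _ in 0..max_attempts {
        let resp = client.get(url).send().await.context("request failed")?;
        match handle_rate_limit(&resp).await {
            Ok(()) => return resp.json().await.context("failed to parse JSON body"),
            Err(e) => last_err = e, // rate limited or API error; try again
        }
    }
    Err(last_err)
}
```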

View File

@@ -1,18 +1,17 @@
// src/corporate/scraper.rs
use super::{types::*, helpers::*, openfigi::*};
//use crate::corporate::openfigi::OpenFigiClient;
use crate::{scraper::webdriver::*};
use fantoccini::{Client, Locator};
use scraper::{Html, Selector};
use chrono::{DateTime, Duration, NaiveDate, Utc};
use tokio::{time::{Duration as TokioDuration, sleep}};
use reqwest::Client as HttpClient;
use serde_json::{json, Value};
use zip::ZipArchive;
use std::fs::File;
use std::{collections::HashMap, sync::Arc};
use std::io::{Read};
use anyhow::{anyhow, Result};
const USER_AGENT: &str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36";
@@ -333,28 +332,6 @@ fn exchange_name_to_mic(name: &str) -> String {
}.to_string()
}
pub async fn dismiss_yahoo_consent(client: &Client) -> anyhow::Result<()> {
let script = r#"
(() => {
const agree = document.querySelector('button[name="agree"]');
if (agree) {
agree.click();
return true;
}
return false;
})()
"#;
for _ in 0..10 {
let done: bool = client.execute(script, vec![]).await?.as_bool().unwrap_or(false);
if done {
break;
}
sleep(TokioDuration::from_millis(500)).await;
}
Ok(())
}
/// Fetches earnings events for a ticker using the shared ChromeDriver pool.
///
/// This function submits a pool task that navigates to the Yahoo Finance earnings calendar,
@@ -368,74 +345,137 @@ pub async fn dismiss_yahoo_consent(client: &Client) -> anyhow::Result<()> {
///
/// # Errors
/// Returns an error if the task execution fails, e.g., chromedriver spawn or navigation issues.
pub async fn fetch_earnings_with_pool(
ticker: &str,
pool: &Arc<ChromeDriverPool>,
) -> anyhow::Result<Vec<CompanyEvent>> {
let ticker = ticker.to_string();
let url = format!("https://finance.yahoo.com/calendar/earnings?symbol={}", ticker);
let ticker_cloned = ticker.clone();
pool.execute(url, move |client| {
let ticker = ticker_cloned.clone();
Box::pin(async move {
reject_yahoo_cookies(&client).await?;
extract_earnings_events(&client, &ticker).await
})
}).await
}
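A sketch of a call site, assuming the shared pool has been constructed elsewhere (the ChromeDriverPool constructor is not shown in this diff):

```rust
use std::sync::Arc;

// Hypothetical caller; pool construction is assumed to happen at startup.
async fn example(pool: Arc<ChromeDriverPool>) -> anyhow::Result<()> {
    let events = fetch_earnings_with_pool("AAPL", &pool).await?;
    println!("Fetched {} earnings events", events.len());
    Ok(())
}
```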
/// Extracts earnings events from the currently loaded Yahoo Finance earnings calendar page.
///
/// This function assumes the client is already navigated to the correct URL (e.g.,
/// https://finance.yahoo.com/calendar/earnings?symbol={ticker}) and cookies are handled.
///
/// It waits for the earnings table, extracts rows, parses cells into CompanyEvent structs,
/// and handles date parsing, float parsing, and optional fields.
///
/// # Arguments
/// * `client` - The fantoccini Client with the page loaded.
/// * `ticker` - The stock ticker symbol for the events.
///
/// # Returns
/// A vector of CompanyEvent on success.
///
/// # Errors
/// Returns an error if:
/// - Table or elements not found.
/// - Date or float parsing fails.
/// - WebDriver operations fail.
///
/// # Examples
///
/// ```no_run
/// use fantoccini::Client;
/// use anyhow::Result;
/// use crate::corporate::scraper::extract_earnings_events;
///
/// #[tokio::main]
/// async fn main() -> Result<()> {
///     // Assume `client` is set up and already navigated to the earnings page
///     let events = extract_earnings_events(&client, "AAPL").await?;
/// Ok(())
/// }
/// ```
pub async fn extract_earnings_events(client: &Client, ticker: &str) -> Result<Vec<CompanyEvent>> {
// Wait for the table to load
let table = client
.wait()
.for_element(Locator::Css(r#"table[data-test="cal-table"]"#))
.await
.map_err(|e| anyhow!("Failed to find earnings table: {}", e))?;
// Find all rows in tbody
let rows = table
.find_all(Locator::Css("tbody tr"))
.await
.map_err(|e| anyhow!("Failed to find table rows: {}", e))?;
let mut events = Vec::with_capacity(rows.len());
for row in rows {
let cells = row
.find_all(Locator::Css("td"))
.await
.map_err(|e| anyhow!("Failed to find cells in row: {}", e))?;
if cells.len() < 5 {
continue; // Skip incomplete rows
}
// Extract and parse date
let date_str = cells[0]
.text()
.await
.map_err(|e| anyhow!("Failed to get date text: {}", e))?;
let date = parse_yahoo_date(&date_str)
.map_err(|e| anyhow!("Failed to parse date '{}': {}", date_str, e))?
.format("%Y-%m-%d")
.to_string();
// Extract time, replace "Time Not Supplied" with empty
let time = cells[1]
.text()
.await
.map_err(|e| anyhow!("Failed to get time text: {}", e))?
.replace("Time Not Supplied", "");
// Extract period
let period = cells[2]
.text()
.await
.map_err(|e| anyhow!("Failed to get period text: {}", e))?;
// Parse EPS forecast
let eps_forecast_str = cells[3]
.text()
.await
.map_err(|e| anyhow!("Failed to get EPS forecast text: {}", e))?;
let eps_forecast = parse_float(&eps_forecast_str);
// Parse EPS actual
let eps_actual_str = cells[4]
.text()
.await
.map_err(|e| anyhow!("Failed to get EPS actual text: {}", e))?;
let eps_actual = parse_float(&eps_actual_str);
// Parse surprise % if available
let surprise_pct = if cells.len() > 5 {
let surprise_str = cells[5]
.text()
.await
.map_err(|e| anyhow!("Failed to get surprise text: {}", e))?;
parse_float(&surprise_str)
} else {
"".to_string()
None
};
events.push(CompanyEvent {
ticker: ticker.to_string(),
date,
time,
period: "".to_string(),
period,
eps_forecast,
eps_actual,
revenue_forecast: None,
@@ -445,6 +485,12 @@ pub async fn fetch_earnings_history(client: &Client, ticker: &str) -> anyhow::Re
});
}
if events.is_empty() {
eprintln!("Warning: No earnings events extracted for ticker {}", ticker);
} else {
println!("Extracted {} earnings events for {}", events.len(), ticker);
}
Ok(events)
}
@@ -768,57 +814,6 @@ pub async fn load_isin_lei_csv() -> anyhow::Result<HashMap<String, Vec<String>>>
Ok(map)
}
pub async fn get_primary_isin_and_name(
client: &Client, // Pass your existing Selenium client
ticker: &str,
) -> anyhow::Result<PrimaryInfo> {
// Navigate to the actual quote page (always works)
let quote_url = format!("https://finance.yahoo.com/quote/{}", ticker);
client.goto(&quote_url).await?;
// Dismiss overlays/banners (your function + guce-specific)
reject_yahoo_cookies(client).await?;
// Wait for page to load (key data elements)
sleep(TokioDuration::from_millis(2000)).await;
// Get page HTML and parse
let html = client.source().await?;
let document = Html::parse_document(&html);
// Selectors for key fields (tested on real Yahoo pages Nov 2025)
let name_sel = Selector::parse("h1[data-testid='qsp-price-header']").unwrap_or_else(|_| Selector::parse("h1").unwrap());
let isin_sel = Selector::parse("[data-testid='qsp-symbol'] + div [data-field='isin']").unwrap_or_else(|_| Selector::parse("[data-field='isin']").unwrap());
let exchange_sel = Selector::parse("[data-testid='qsp-market'] span").unwrap_or_else(|_| Selector::parse(".TopNav__Exchange").unwrap());
let currency_sel = Selector::parse("[data-testid='qsp-price'] span:contains('USD')").unwrap_or_else(|_| Selector::parse(".TopNav__Currency").unwrap()); // Adjust for dynamic
let name_elem = document.select(&name_sel).next().map(|e| e.text().collect::<String>().trim().to_string());
let isin_elem = document.select(&isin_sel).next().map(|e| e.text().collect::<String>().trim().to_uppercase());
let exchange_elem = document.select(&exchange_sel).next().map(|e| e.text().collect::<String>().trim().to_string());
let currency_elem = document.select(&currency_sel).next().map(|e| e.text().collect::<String>().trim().to_string());
let name = name_elem.unwrap_or_else(|| ticker.to_string());
let isin = isin_elem.unwrap_or_default();
let exchange_mic = exchange_elem.unwrap_or_default();
let currency = currency_elem.unwrap_or_else(|| "USD".to_string());
// Validate ISIN
let valid_isin = if isin.len() == 12 && isin.chars().all(|c| c.is_alphanumeric()) {
isin
} else {
"".to_string()
};
println!(" → Scraped {}: {} | ISIN: {} | Exchange: {}", ticker, name, valid_isin, exchange_mic);
Ok(PrimaryInfo {
isin: valid_isin,
name,
exchange_mic,
currency,
})
}
pub async fn reject_yahoo_cookies(client: &Client) -> anyhow::Result<()> {
for _ in 0..10 {
let clicked: bool = client

View File

@@ -1,5 +1,5 @@
// src/corporate/storage.rs
use super::{types::*, helpers::*};
use crate::config;
use tokio::fs;
@@ -102,17 +102,6 @@ pub async fn save_prices_for_ticker(ticker: &str, timeframe: &str, mut prices: V
Ok(())
}
pub async fn _load_companies() -> Result<Vec<CompanyMetadata>, anyhow::Error> {
let path = Path::new("src/data/companies.json");
if !path.exists() {
println!("Missing companies.json file at src/data/companies.json");
return Ok(vec![]);
}
let content = fs::read_to_string(path).await?;
let companies: Vec<CompanyMetadata> = serde_json::from_str(&content)?;
Ok(companies)
}
pub fn get_company_dir(lei: &str) -> PathBuf {
PathBuf::from("corporate_prices").join(lei)
}
@@ -132,20 +121,6 @@ pub async fn ensure_company_dirs(isin: &str) -> anyhow::Result<()> {
Ok(())
}
pub async fn save_company_metadata(company: &CompanyMetadata) -> anyhow::Result<()> {
let dir = get_company_dir(&company.lei);
fs::create_dir_all(&dir).await?;
let path = dir.join("metadata.json");
fs::write(&path, serde_json::to_string_pretty(company)?).await?;
Ok(())
}
pub async fn load_company_metadata(lei: &str) -> anyhow::Result<CompanyMetadata> {
let path = get_company_dir(lei).join("metadata.json");
let content = fs::read_to_string(path).await?;
Ok(serde_json::from_str(&content)?)
}
pub async fn save_available_exchanges(isin: &str, exchanges: Vec<AvailableExchange>) -> anyhow::Result<()> {
let dir = get_company_dir(isin);
fs::create_dir_all(&dir).await?;

View File

@@ -1,35 +1,33 @@
// src/corporate/update.rs
use super::{scraper::*, storage::*, helpers::*, types::*, aggregation::*, openfigi::*};
use crate::config::Config;
use crate::scraper::webdriver::ChromeDriverPool;
use chrono::Local;
use std::collections::{HashMap, HashSet};
use std::sync::Arc;
use futures::{stream::{self, StreamExt}};
/// Main function: Full update for all companies (LEI-based) with optimized parallel execution.
///
/// This function coordinates the entire update process:
/// - Loads GLEIF mappings
/// - Builds FIGI-LEI map
/// - Loads existing events
/// - Processes each company: discovers exchanges via FIGI, fetches prices & earnings, aggregates data
/// - Uses the provided shared ChromeDriver pool for efficient parallel scraping
/// - Saves optimized events
///
/// # Arguments
/// * `config` - The application configuration.
/// * `pool` - Shared pool of ChromeDriver instances for scraping.
///
/// # Errors
/// Returns an error if any step in the update process fails.
pub async fn run_full_update(config: &Config, pool: &Arc<ChromeDriverPool>) -> anyhow::Result<()> {
println!("=== Starting LEI-based corporate full update ===");
// 1. Load fresh GLEIF ISIN ↔ LEI mapping
let lei_to_isins: HashMap<String, Vec<String>> = match load_isin_lei_csv().await {
Ok(map) => map,
Err(e) => {
@@ -38,8 +36,16 @@ pub async fn run_full_update(config: &Config) -> anyhow::Result<()> {
}
};
// 2. Load OpenFIGI mapping value lists (cached)
if let Err(e) = load_figi_type_lists().await {
eprintln!("Warning: Could not load OpenFIGI type lists: {}", e);
}
// 3. Build FIGI → LEI map
// Inputs:
// * lei_to_isins: companies grouped by legal entity [LEI -> Vec<ISIN>]
// * figi_to_lei: security metadata keyed by LEI [LEI -> Vec<FigiInfo>]
let figi_to_lei: HashMap<String, Vec<FigiInfo>> = match build_lei_to_figi_infos(&lei_to_isins).await {
Ok(map) => map,
Err(e) => {
eprintln!("Warning: Could not build FIGI→LEI map: {}", e);
@@ -47,7 +53,11 @@ pub async fn run_full_update(config: &Config) -> anyhow::Result<()> {
}
};
// 4. Load or build companies
let mut companies = load_or_build_companies_by_name(&figi_to_lei).await?;
println!("Processing {} companies", companies.len());
// 5. Load existing earnings events (for change detection)
let today = Local::now().format("%Y-%m-%d").to_string();
let mut existing_events = match load_existing_events().await {
Ok(events) => events,
@@ -57,162 +67,47 @@ pub async fn run_full_update(config: &Config) -> anyhow::Result<()> {
}
};
// 5. Use the provided pool (no need to create a new one)
let pool_size = pool.get_number_of_instances(); // Use the size from the shared pool
// Process companies in parallel using the shared pool
/*let results: Vec<_> = stream::iter(companies.into_iter())
.map(|company| {
let pool_clone = pool.clone();
async move {
process_company_data(&company, &pool_clone, &mut existing_events).await
}
})
.buffer_unordered(pool_size)
.collect().await;
// Handle results (e.g., collect changes)
let mut all_changes = Vec::new();
for result in results {
if let Ok(ProcessResult { changes }) = result {
all_changes.extend(changes);
}
}*/
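If the commented-out parallel path is revived as-is, it will not compile: each concurrent task captures `&mut existing_events`, and Rust forbids multiple live mutable borrows. A minimal compilable shape, sketched as a drop-in fragment under the assumption that `process_company_data` and `ProcessResult` exist as referenced above (here iterating the name-keyed map's values):

```rust
// Sketch only: wrap shared state so concurrent tasks can update it.
use futures::stream::{self, StreamExt};
use std::sync::Arc;
use tokio::sync::Mutex;

let existing_events = Arc::new(Mutex::new(existing_events));
let results: Vec<_> = stream::iter(companies.into_values())
    .map(|company| {
        let pool = pool.clone();
        let events = existing_events.clone();
        async move {
            // Lock per task: coarse but sound. Finer-grained designs would
            // return ProcessResult values and merge after the stream completes.
            let mut guard = events.lock().await;
            process_company_data(&company, &pool, &mut guard).await
        }
    })
    .buffer_unordered(pool_size)
    .collect()
    .await;
```

Holding the lock for the whole call serializes the per-company work; the commented-out all_changes loop suggests the intended design is to collect ProcessResult values and merge them after the stream completes.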
// 6. Save the optimized earnings events
save_optimized_events(existing_events).await?;
println!("\n=== Corporate full update completed successfully ===");
//save_changes(&all_changes).await?;
//println!("Corporate update complete — {} changes detected", all_changes.len());
Ok(())
}
/// Assigns LEIs to companies based on their FIGI-derived ISINs.
async fn assign_leis_from_figi(
companies: &mut [CompanyMetadata],
lei_to_isins: &HashMap<String, Vec<String>>
) -> anyhow::Result<()> {
for company in companies {
let figi_infos = company.figi.as_ref().map_or(&[][..], |v| &v[..]);
let isins: Vec<String> = figi_infos
.iter()
.map(|f| f.isin.clone())
.collect::<HashSet<_>>()
.into_iter()
.collect();
// Try to find LEI by any known ISIN
for isin in &isins {
@@ -228,7 +123,7 @@ async fn _enrich_companies_with_leis(
}
}
Ok(())
}
}
pub struct ProcessResult {
pub changes: Vec<CompanyEventChange>,