added company mapping with yahoo tickers
.gitignore (vendored)
@@ -34,6 +34,7 @@ target/
 **/*.zip
 **/*.log
 **/*.ovpn
+**/*.tmp

 #/economic_events*
 #/economic_event_changes*
@@ -7,5 +7,6 @@ pub mod helpers;
 pub mod aggregation;
 pub mod fx;
 pub mod openfigi;
+pub mod yahoo;

 pub use update::run_full_update;
File diff suppressed because it is too large
@@ -1,318 +1,19 @@
 // src/corporate/scraper.rs
-use super::{types::*, helpers::*, openfigi::*};
+use super::{types::*};
 //use crate::corporate::openfigi::OpenFigiClient;
 use crate::{scraper::webdriver::*, util::directories::DataPaths, util::logger};
-use fantoccini::{Client, Locator};
+use fantoccini::{Client};
 use scraper::{Html, Selector};
 use chrono::{DateTime, Duration, NaiveDate, Utc};
 use tokio::{time::{Duration as TokioDuration, sleep}};
 use reqwest::Client as HttpClient;
 use serde_json::{json, Value};
 use zip::ZipArchive;
-use std::{collections::HashMap, sync::Arc};
+use std::{collections::HashMap};
 use std::io::{Read};
-use anyhow::{anyhow, Result};

 const USER_AGENT: &str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36";

-/// Check if a ticker exists on Yahoo Finance and return core metadata.
-///
-/// This function calls the public Yahoo Finance quoteSummary endpoint and extracts:
-/// - ISIN (when available)
-/// - Company name
-/// - Exchange MIC code
-/// - Trading currency
-///
-/// It strictly filters to only accept **equity** securities.
-///
-/// # Arguments
-/// * `ticker` - The ticker symbol to validate (e.g., "AAPL", "7203.T", "BMW.DE")
-///
-/// # Returns
-/// `Ok(PrimaryInfo)` on success, `Err` if ticker doesn't exist, is not equity, or data is malformed.
-///
-/// # Errors
-/// - Ticker not found
-/// - Not an equity (ETF, bond, etc.)
-/// - Missing critical fields
-/// - Network or JSON parsing errors
-/*pub async fn check_ticker_exists(ticker: &str) -> anyhow::Result<PrimaryInfo> {
-    let url = format!(
-        "https://query1.finance.yahoo.com/v10/finance/quoteSummary/{}?modules=price%2CassetProfile",
-        ticker
-    );

-    let resp = match HttpClient::new()
-        .get(&url)
-        .header("User-Agent", USER_AGENT)
-        .send()
-        .await
-    {
-        Ok(resp) => resp,
-        Err(err) => {
-            return Err(anyhow::anyhow!(
-                "Failed to reach Yahoo Finance for ticker {}: {}",
-                ticker,
-                err
-            ));
-        }
-    };

-    if !resp.status().is_success() {
-        return Err(anyhow::anyhow!("Yahoo returned HTTP {} for ticker {}", resp.status(), ticker));
-    }

-    let json: Value = match resp
-        .json()
-        .await {
-        Ok(resp) => resp,
-        Err(err) => {
-            return Err(anyhow::anyhow!(
-                "Failed to parse JSON response from Yahoo Finance {}: {}",
-                ticker,
-                err
-            ));
-        }
-    };

-    let result_array = json["quoteSummary"]["result"]
-        .as_array()
-        .ok_or_else(|| anyhow::anyhow!("Missing 'quoteSummary.result' in response"))?;

-    if result_array.is_empty() || result_array[0].is_null() {
-        return Err(anyhow::anyhow!("No quote data returned for ticker {}", ticker));
-    }

-    let quote = &result_array[0]["price"];
-    let profile = &result_array[0]["assetProfile"];

-    // === 1. Must be EQUITY ===
-    let quote_type = quote["quoteType"]
-        .as_str()
-        .unwrap_or("")
-        .to_ascii_uppercase();

-    if quote_type != "EQUITY" {
-        println!(" → Skipping {} (quoteType: {})", ticker, quote_type);
-        return Err(anyhow::anyhow!("Not an equity security: {}", quote_type));
-    }

-    // === 2. Extract basic info ===
-    let long_name = quote["longName"]
-        .as_str()
-        .or_else(|| quote["shortName"].as_str())
-        .unwrap_or(ticker)
-        .trim()
-        .to_string();

-    let currency = quote["currency"]
-        .as_str()
-        .unwrap_or("USD")
-        .to_string();

-    let exchange_mic = quote["exchange"]
-        .as_str()
-        .unwrap_or("")
-        .to_string();

-    if exchange_mic.is_empty() {
-        return Err(anyhow::anyhow!("Missing exchange MIC for ticker {}", ticker));
-    }

-    // === 3. Extract ISIN (from assetProfile if available) ===
-    let isin = profile["isin"]
-        .as_str()
-        .and_then(|s| if s.len() == 12 && s.chars().all(|c| c.is_ascii_alphanumeric()) { Some(s) } else { None })
-        .unwrap_or("")
-        .to_ascii_uppercase();

-    // === 4. Final sanity check: reject obvious debt securities ===
-    let name_upper = long_name.to_ascii_uppercase();
-    if name_upper.contains(" BOND") ||
-       name_upper.contains(" NOTE") ||
-       name_upper.contains(" DEBENTURE") ||
-       name_upper.contains(" PREFERRED") && !name_upper.contains(" STOCK") {
-        return Err(anyhow::anyhow!("Security name suggests debt instrument: {}", long_name));
-    }

-    println!(
-        " → Valid equity: {} | {} | {} | ISIN: {}",
-        ticker,
-        long_name,
-        exchange_mic,
-        if isin.is_empty() { "N/A" } else { &isin }
-    );

-    Ok(PrimaryInfo {
-        isin,
-        name: long_name,
-        exchange_mic,
-        currency,
-    })
-}*/

-/// Fetches earnings events for a ticker using a dedicated ScrapeTask.
-///
-/// This function creates and executes a ScrapeTask to navigate to the Yahoo Finance earnings calendar,
-/// reject cookies, and extract the events.
-///
-/// # Arguments
-/// * `ticker` - The stock ticker symbol.
-///
-/// # Returns
-/// A vector of CompanyEvent structs on success.
-///
-/// # Errors
-/// Returns an error if the task execution fails, e.g., chromedriver spawn or navigation issues.
-pub async fn fetch_earnings_with_pool(
-    ticker: &str,
-    pool: &Arc<ChromeDriverPool>,
-) -> anyhow::Result<Vec<CompanyEvent>> {
-    let ticker = ticker.to_string();
-    let url = format!("https://finance.yahoo.com/calendar/earnings?symbol={}", ticker);

-    let ticker_cloned = ticker.clone();

-    pool.execute(url, move |client| {
-        let ticker = ticker_cloned.clone();
-        Box::pin(async move {
-            reject_yahoo_cookies(&client).await?;
-            extract_earnings_events(&client, &ticker).await
-        })
-    }).await
-}

-/// Extracts earnings events from the currently loaded Yahoo Finance earnings calendar page.
-///
-/// This function assumes the client is already navigated to the correct URL (e.g.,
-/// https://finance.yahoo.com/calendar/earnings?symbol={ticker}) and cookies are handled.
-///
-/// It waits for the earnings table, extracts rows, parses cells into CompanyEvent structs,
-/// and handles date parsing, float parsing, and optional fields.
-///
-/// # Arguments
-/// * `client` - The fantoccini Client with the page loaded.
-/// * `ticker` - The stock ticker symbol for the events.
-///
-/// # Returns
-/// A vector of CompanyEvent on success.
-///
-/// # Errors
-/// Returns an error if:
-/// - Table or elements not found.
-/// - Date or float parsing fails.
-/// - WebDriver operations fail.
-///
-/// # Examples
-///
-/// ```no_run
-/// use fantoccini::Client;
-/// use crate::corporate::scraper::extract_earnings;
-///
-/// #[tokio::main]
-/// async fn main() -> Result<()> {
-///     // Assume client is set up and navigated
-///     let events = extract_earnings(&client, "AAPL").await?;
-///     Ok(())
-/// }
-/// ```
-pub async fn extract_earnings_events(client: &Client, ticker: &str) -> Result<Vec<CompanyEvent>> {
-    // Wait for the table to load
-    let table = client
-        .wait()
-        .for_element(Locator::Css(r#"table[data-test="cal-table"]"#))
-        .await
-        .map_err(|e| anyhow!("Failed to find earnings table: {}", e))?;

-    // Find all rows in tbody
-    let rows = table
-        .find_all(Locator::Css("tbody tr"))
-        .await
-        .map_err(|e| anyhow!("Failed to find table rows: {}", e))?;

-    let mut events = Vec::with_capacity(rows.len());

-    for row in rows {
-        let cells = row
-            .find_all(Locator::Css("td"))
-            .await
-            .map_err(|e| anyhow!("Failed to find cells in row: {}", e))?;

-        if cells.len() < 5 {
-            continue; // Skip incomplete rows
-        }

-        // Extract and parse date
-        let date_str = cells[0]
-            .text()
-            .await
-            .map_err(|e| anyhow!("Failed to get date text: {}", e))?;
-        let date = parse_yahoo_date(&date_str)
-            .map_err(|e| anyhow!("Failed to parse date '{}': {}", date_str, e))?
-            .format("%Y-%m-%d")
-            .to_string();

-        // Extract time, replace "Time Not Supplied" with empty
-        let time = cells[1]
-            .text()
-            .await
-            .map_err(|e| anyhow!("Failed to get time text: {}", e))?
-            .replace("Time Not Supplied", "");

-        // Extract period
-        let period = cells[2]
-            .text()
-            .await
-            .map_err(|e| anyhow!("Failed to get period text: {}", e))?;

-        // Parse EPS forecast
-        let eps_forecast_str = cells[3]
-            .text()
-            .await
-            .map_err(|e| anyhow!("Failed to get EPS forecast text: {}", e))?;
-        let eps_forecast = parse_float(&eps_forecast_str);

-        // Parse EPS actual
-        let eps_actual_str = cells[4]
-            .text()
-            .await
-            .map_err(|e| anyhow!("Failed to get EPS actual text: {}", e))?;
-        let eps_actual = parse_float(&eps_actual_str);

-        // Parse surprise % if available
-        let surprise_pct = if cells.len() > 5 {
-            let surprise_str = cells[5]
-                .text()
-                .await
-                .map_err(|e| anyhow!("Failed to get surprise text: {}", e))?;
-            parse_float(&surprise_str)
-        } else {
-            None
-        };

-        events.push(CompanyEvent {
-            ticker: ticker.to_string(),
-            date,
-            time,
-            period,
-            eps_forecast,
-            eps_actual,
-            revenue_forecast: None,
-            revenue_actual: None,
-            surprise_pct,
-            source: "Yahoo".to_string(),
-        });
-    }

-    if events.is_empty() {
-        eprintln!("Warning: No earnings events extracted for ticker {}", ticker);
-    } else {
-        println!("Extracted {} earnings events for {}", events.len(), ticker);
-    }

-    Ok(events)
-}

 fn parse_price(v: Option<&Value>) -> f64 {
     v.and_then(|x| x.as_str())
         .and_then(|s| s.replace('$', "").replace(',', "").parse::<f64>().ok())
@@ -490,20 +191,17 @@ pub async fn _fetch_latest_gleif_isin_lei_mapping_url(client: &Client) -> anyhow
 pub async fn download_isin_lei_csv() -> anyhow::Result<Option<String>> {
     let url = "https://mapping.gleif.org/api/v2/isin-lei/9315e3e3-305a-4e71-b062-46714740fa8d/download";

-    // Initialize DataPaths and create cache/gleif directory
     let paths = DataPaths::new(".")?;
     let gleif_cache_dir = paths.cache_gleif_dir();

     if let Err(e) = std::fs::create_dir_all(&gleif_cache_dir) {
         let msg = format!("Failed to create cache/gleif directory: {}", e);
         logger::log_error(&msg).await;
-        println!("{}", msg);
         return Ok(None);
     }

-    logger::log_info("Corporate Scraper: Downloading ISIN/LEI mapping from GLEIF...").await;
+    logger::log_info("Downloading ISIN/LEI mapping from GLEIF...").await;

-    // Download ZIP and get the filename from Content-Disposition header
     let client = match reqwest::Client::builder()
         .user_agent(USER_AGENT)
         .timeout(std::time::Duration::from_secs(30))
@@ -511,9 +209,7 @@ pub async fn download_isin_lei_csv() -> anyhow::Result<Option<String>> {
     {
         Ok(c) => c,
         Err(e) => {
-            let msg = format!("Failed to create HTTP client: {}", e);
-            logger::log_error(&msg).await;
-            println!("{}", msg);
+            logger::log_error(&format!("Failed to create HTTP client: {}", e)).await;
             return Ok(None);
         }
     };
@@ -521,20 +217,15 @@ pub async fn download_isin_lei_csv() -> anyhow::Result<Option<String>> {
     let resp = match client.get(url).send().await {
         Ok(r) if r.status().is_success() => r,
         Ok(resp) => {
-            let msg = format!("Server returned HTTP {}", resp.status());
-            logger::log_error(&msg).await;
-            println!("{}", msg);
+            logger::log_error(&format!("Server returned HTTP {}", resp.status())).await;
             return Ok(None);
         }
         Err(e) => {
-            let msg = format!("Failed to download ISIN/LEI ZIP: {}", e);
-            logger::log_error(&msg).await;
-            println!("{}", msg);
+            logger::log_error(&format!("Failed to download: {}", e)).await;
             return Ok(None);
         }
     };

-    // Extract filename from Content-Disposition header or use default
     let filename = resp
         .headers()
         .get("content-disposition")
@@ -542,11 +233,10 @@ pub async fn download_isin_lei_csv() -> anyhow::Result<Option<String>> {
         .and_then(|s| s.split("filename=").nth(1).map(|f| f.trim_matches('"').to_string()))
         .unwrap_or_else(|| "isin_lei.zip".to_string());

-    // Parse timestamp from filename and convert to DDMMYYYY format
     let parsed_filename = parse_gleif_filename(&filename);
-    logger::log_info(&format!("Corporate Scraper: Downloaded file: {} -> {}", filename, parsed_filename)).await;
+    logger::log_info(&format!("Downloaded: {} -> {}", filename, parsed_filename)).await;

-    // Determine date (DDMMYYYY) from parsed filename: "isin-lei-DDMMYYYY.csv"
+    // Extract date from filename
     let mut date_str = String::new();
     if let Some(start_idx) = parsed_filename.find("isin-lei-") {
         let rest = &parsed_filename[start_idx + 9..];
@@ -555,13 +245,10 @@ pub async fn download_isin_lei_csv() -> anyhow::Result<Option<String>> {
         }
     }

-    // If we parsed a date, use/create a date folder under cache/gleif and operate inside it; otherwise use cache root.
     let date_dir = if !date_str.is_empty() {
         let p = gleif_cache_dir.join(&date_str);
-        // Ensure the date folder exists (create if necessary)
         if let Err(e) = std::fs::create_dir_all(&p) {
-            let msg = format!("Failed to create date directory {:?}: {}", p, e);
-            logger::log_warn(&msg).await;
+            logger::log_warn(&format!("Failed to create date directory: {}", e)).await;
             None
         } else {
             Some(p)
@@ -570,17 +257,16 @@ pub async fn download_isin_lei_csv() -> anyhow::Result<Option<String>> {
         None
     };

-    // Choose the directory where we'll look for existing files and where we'll save the new ones
     let target_dir = date_dir.clone().unwrap_or_else(|| gleif_cache_dir.to_path_buf());

-    // If the date folder exists (or was created), prefer any *_clean.csv inside it and return that immediately
+    // Check for existing clean CSV
     if let Some(ref ddir) = date_dir {
         if let Ok(entries) = std::fs::read_dir(ddir) {
             for entry in entries.flatten() {
                 if let Some(name) = entry.file_name().to_str() {
                     if name.to_lowercase().ends_with("_clean.csv") {
                         let path = ddir.join(name);
-                        logger::log_info(&format!("Found existing clean GLEIF CSV: {}", path.display())).await;
+                        logger::log_info(&format!("Found existing clean CSV: {}", path.display())).await;
                         return Ok(Some(path.to_string_lossy().to_string()));
                     }
                 }
@@ -588,71 +274,42 @@ pub async fn download_isin_lei_csv() -> anyhow::Result<Option<String>> {
             }
         }
     }

-    // If no clean file found in the date folder (or date folder doesn't exist), check whether the csv/zip already exist in the target dir
-    let csv_candidate_name = parsed_filename.replace(".zip", ".csv");
-    let csv_candidate = target_dir.join(&csv_candidate_name);
-    let zip_candidate = target_dir.join(&parsed_filename);
+    let csv_candidate = target_dir.join(parsed_filename.replace(".zip", ".csv"));

     if csv_candidate.exists() {
-        logger::log_info(&format!("Found existing GLEIF CSV: {}", csv_candidate.display())).await;
+        logger::log_info(&format!("Found existing CSV: {}", csv_candidate.display())).await;
         return Ok(Some(csv_candidate.to_string_lossy().to_string()));
     }
-    if zip_candidate.exists() {
-        // If zip exists but csv does not, extract later; for now prefer returning csv path (may be created by extraction step)
-        let inferred_csv = target_dir.join(csv_candidate_name);
-        if inferred_csv.exists() {
-            logger::log_info(&format!("Found existing extracted CSV next to ZIP: {}", inferred_csv.display())).await;
-            return Ok(Some(inferred_csv.to_string_lossy().to_string()));
-        }
-        // otherwise we'll overwrite/extract into target_dir below
-    }

     let bytes = match resp.bytes().await {
         Ok(b) => b,
         Err(e) => {
-            let msg = format!("Failed to read ZIP bytes: {}", e);
-            logger::log_error(&msg).await;
-            println!("{}", msg);
+            logger::log_error(&format!("Failed to read bytes: {}", e)).await;
             return Ok(None);
         }
     };
-    // Ensure target directory exists (create if it's the date folder and was absent earlier)
-    if let Some(ref ddir) = date_dir {
-        let _ = std::fs::create_dir_all(ddir);
-    }

     let zip_path = target_dir.join(&parsed_filename);
     let csv_path = target_dir.join(parsed_filename.replace(".zip", ".csv"));

     if let Err(e) = tokio::fs::write(&zip_path, &bytes).await {
-        let msg = format!("Failed to write ZIP file: {}", e);
-        logger::log_error(&msg).await;
-        println!("{}", msg);
+        logger::log_error(&format!("Failed to write ZIP: {}", e)).await;
         return Ok(None);
     }
-    logger::log_info(&format!("Corporate Scraper: Saved ZIP to {:?}", zip_path)).await;

-    // Extract CSV
-    let archive = match std::fs::File::open(&zip_path)
-        .map(ZipArchive::new)
-    {
+    // Extract CSV from ZIP
+    let archive = match std::fs::File::open(&zip_path).map(ZipArchive::new) {
         Ok(Ok(a)) => a,
         Ok(Err(e)) => {
-            let msg = format!("Invalid ZIP: {}", e);
-            logger::log_error(&msg).await;
-            println!("{}", msg);
+            logger::log_error(&format!("Invalid ZIP: {}", e)).await;
             return Ok(None);
         }
         Err(e) => {
-            let msg = format!("Cannot open ZIP file: {}", e);
-            logger::log_error(&msg).await;
-            println!("{}", msg);
+            logger::log_error(&format!("Cannot open ZIP: {}", e)).await;
             return Ok(None);
         }
     };

     let mut archive = archive;

     let idx = match (0..archive.len()).find(|&i| {
         archive.by_index(i)
             .map(|f| f.name().ends_with(".csv"))
@@ -660,9 +317,7 @@ pub async fn download_isin_lei_csv() -> anyhow::Result<Option<String>> {
     }) {
         Some(i) => i,
         None => {
-            let msg = "ZIP did not contain a CSV file";
-            logger::log_error(msg).await;
-            println!("{}", msg);
+            logger::log_error("ZIP contains no CSV").await;
             return Ok(None);
         }
     };
@@ -670,43 +325,32 @@ pub async fn download_isin_lei_csv() -> anyhow::Result<Option<String>> {
     let mut csv_file = match archive.by_index(idx) {
         Ok(f) => f,
         Err(e) => {
-            let msg = format!("Failed to read CSV entry: {}", e);
-            logger::log_error(&msg).await;
-            println!("{}", msg);
+            logger::log_error(&format!("Failed to read CSV: {}", e)).await;
             return Ok(None);
         }
     };

     let mut csv_bytes = Vec::new();
     if let Err(e) = csv_file.read_to_end(&mut csv_bytes) {
-        let msg = format!("Failed to extract CSV: {}", e);
-        logger::log_error(&msg).await;
+        logger::log_error(&format!("Failed to extract: {}", e)).await;
         return Ok(None);
     }

     if let Err(e) = tokio::fs::write(&csv_path, &csv_bytes).await {
-        let msg = format!("Failed to save CSV file: {}", e);
-        logger::log_error(&msg).await;
+        logger::log_error(&format!("Failed to save CSV: {}", e)).await;
         return Ok(None);
     }

-    let msg = format!("✓ ISIN/LEI CSV extracted: {:?}", csv_path);
-    logger::log_info(&msg).await;
+    logger::log_info(&format!("✓ CSV extracted: {:?}", csv_path)).await;

     Ok(Some(csv_path.to_string_lossy().to_string()))
 }

-/// Parse GLEIF filename and convert timestamp to DDMMYYYY format
-/// Example: "isin-lei-20251124T080254.csv" -> "isin-lei-24112025.csv"
 fn parse_gleif_filename(filename: &str) -> String {
-    // Try to find pattern: isin-lei-YYYYMMDDTHHMMSS.zip/csv
     if let Some(start_idx) = filename.find("isin-lei-") {
-        let rest = &filename[start_idx + 9..]; // After "isin-lei-"
+        let rest = &filename[start_idx + 9..];

-        // Extract the 8 digits (YYYYMMDD)
         if rest.len() >= 8 && rest[0..8].chars().all(|c| c.is_numeric()) {
             let date_part = &rest[0..8];
-            // date_part is YYYYMMDD, convert to DDMMYYYY
             if date_part.len() == 8 {
                 let year = &date_part[0..4];
                 let month = &date_part[4..6];
@@ -717,11 +361,9 @@ fn parse_gleif_filename(filename: &str) -> String {
         }
     }

-    // Fallback: return original filename if parsing fails
     filename.to_string()
 }

 pub async fn load_isin_lei_csv() -> anyhow::Result<HashMap<String, Vec<String>>> {
     // 1. Download + extract the CSV (this is now async)
     let csv_path = match download_isin_lei_csv().await? {
@@ -769,30 +411,4 @@ pub async fn load_isin_lei_csv() -> anyhow::Result<HashMap<String, Vec<String>>>
     );

     Ok(map)
-}

-pub async fn reject_yahoo_cookies(client: &Client) -> anyhow::Result<()> {
-    for _ in 0..10 {
-        let clicked: bool = client
-            .execute(
-                r#"(() => {
-                    const btn = document.querySelector('#consent-page .reject-all');
-                    if (btn) {
-                        btn.click();
-                        return true;
-                    }
-                    return false;
-                })()"#,
-                vec![],
-            )
-            .await?
-            .as_bool()
-            .unwrap_or(false);

-        if clicked { break; }
-        sleep(TokioDuration::from_millis(500)).await;
-    }

-    println!("Rejected Yahoo cookies if button existed");
-    Ok(())
 }
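Note: the new `yahoo` module itself (registered above via `pub mod yahoo;`) is not shown in this view; one file's diff is suppressed as too large. The sketch below is inferred purely from the call sites in `update.rs` further down — the names `scrape_ticker_by_isin`, `YahooTickerResult`, its four variants, `is_found`, `to_tagged_string`, and the "YAHOO:" prefix and "YAHOO:ERROR" tag all appear there; everything else (the other tag strings, the stub body) is an assumption, not the actual implementation.

    // Sketch only: interface inferred from update.rs, not the real src/corporate/yahoo.rs.
    use std::sync::Arc;
    use crate::scraper::webdriver::ChromeDriverPool;

    /// Outcome of a Yahoo Finance ISIN -> ticker lookup, as consumed by update.rs.
    #[derive(Debug, Clone)]
    pub enum YahooTickerResult {
        Found(String),     // a ticker symbol was returned for the ISIN
        NoResults,         // the search returned nothing
        NotFound,          // a result existed but carried no ticker
        AmbiguousResults,  // several candidates, none clearly primary
    }

    impl YahooTickerResult {
        pub fn is_found(&self) -> bool {
            matches!(self, YahooTickerResult::Found(_))
        }

        /// Tag the result so companies.jsonl entries are recognizable as Yahoo data
        /// (update.rs checks for the "YAHOO:" prefix before re-querying).
        /// The non-Found tag suffixes here are placeholders.
        pub fn to_tagged_string(&self) -> String {
            match self {
                YahooTickerResult::Found(t) => format!("YAHOO:{}", t),
                YahooTickerResult::NoResults => "YAHOO:NO_RESULTS".to_string(),
                YahooTickerResult::NotFound => "YAHOO:NOT_FOUND".to_string(),
                YahooTickerResult::AmbiguousResults => "YAHOO:AMBIGUOUS".to_string(),
            }
        }
    }

    /// Look up the Yahoo ticker for an ISIN using a pooled chromedriver session.
    pub async fn scrape_ticker_by_isin(
        _pool: &Arc<ChromeDriverPool>,
        _isin: &str,
    ) -> anyhow::Result<YahooTickerResult> {
        // The real logic lives in the suppressed yahoo.rs; this stub only fixes the signature.
        unimplemented!()
    }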
@@ -6,49 +6,12 @@ use crate::util::logger;
 use tokio::fs;
 use tokio::io::AsyncWriteExt;
 use chrono::{Datelike, NaiveDate};
-use std::collections::{HashMap};
+use std::collections::HashMap;
 use std::path::{PathBuf, Path};

-const BATCH_SIZE: usize = 500; // Process 500 events at a time
+const BATCH_SIZE: usize = 500;

-/// Load events in streaming fashion to avoid memory buildup
-pub async fn load_existing_events_streaming(
-    paths: &DataPaths,
-    callback: impl Fn(CompanyEvent) -> anyhow::Result<()>
-) -> anyhow::Result<usize> {
-    let dir = paths.corporate_events_dir();
-    if !dir.exists() {
-        logger::log_info("Corporate Storage: No existing events directory found").await;
-        return Ok(0);
-    }

-    let mut total = 0;
-    let mut entries = fs::read_dir(dir).await?;

-    while let Some(entry) = entries.next_entry().await? {
-        let path = entry.path();
-        if path.extension().and_then(|s| s.to_str()) == Some("json") {
-            let name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
-            if name.starts_with("events_") && name.len() == 17 {
-                let content = fs::read_to_string(&path).await?;
-                let events: Vec<CompanyEvent> = serde_json::from_str(&content)?;

-                for event in events {
-                    callback(event)?;
-                    total += 1;
-                }

-                // Yield to prevent blocking
-                tokio::task::yield_now().await;
-            }
-        }
-    }

-    logger::log_info(&format!("Corporate Storage: Streamed {} events", total)).await;
-    Ok(total)
-}

-/// Build lightweight index of events instead of loading everything
+/// Lightweight index entry - only metadata, no full event data
 #[derive(Debug, Clone)]
 pub struct EventIndex {
     pub key: String,
@@ -57,9 +20,11 @@ pub struct EventIndex {
     pub file_path: PathBuf,
 }

+/// Build index of all events without loading them into memory
 pub async fn build_event_index(paths: &DataPaths) -> anyhow::Result<Vec<EventIndex>> {
     let dir = paths.corporate_events_dir();
     if !dir.exists() {
+        logger::log_info("Corporate Storage: No events directory found").await;
         return Ok(Vec::new());
     }

@@ -90,7 +55,7 @@ pub async fn build_event_index(paths: &DataPaths) -> anyhow::Result<Vec<EventInd
     Ok(index)
 }

-/// Lookup specific event by loading only its file
+/// Load specific event by key (only loads its file)
 pub async fn lookup_event_by_key(
     key: &str,
     index: &[EventIndex]
@@ -106,9 +71,48 @@ pub async fn lookup_event_by_key(
     }
 }

+/// Stream events file by file with callback
+pub async fn stream_events_with_callback<F>(
+    paths: &DataPaths,
+    mut callback: F
+) -> anyhow::Result<usize>
+where
+    F: FnMut(CompanyEvent) -> anyhow::Result<()>,
+{
+    let dir = paths.corporate_events_dir();
+    if !dir.exists() {
+        return Ok(0);
+    }

+    let mut total = 0;
+    let mut entries = fs::read_dir(dir).await?;

+    while let Some(entry) = entries.next_entry().await? {
+        let path = entry.path();
+        if path.extension().and_then(|s| s.to_str()) == Some("json") {
+            let name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
+            if name.starts_with("events_") {
+                let content = fs::read_to_string(&path).await?;
+                let events: Vec<CompanyEvent> = serde_json::from_str(&content)?;

+                for event in events {
+                    callback(event)?;
+                    total += 1;
+                }

+                tokio::task::yield_now().await;
+            }
+        }
+    }

+    logger::log_info(&format!("Corporate Storage: Streamed {} events", total)).await;
+    Ok(total)
+}

+/// Save events organized by month (accepts Vec, not HashMap)
 pub async fn save_optimized_events(
     paths: &DataPaths,
-    events: Vec<CompanyEvent> // Changed from HashMap to Vec
+    events: Vec<CompanyEvent>
 ) -> anyhow::Result<()> {
     let dir = paths.corporate_events_dir();
     fs::create_dir_all(dir).await?;
@@ -124,16 +128,14 @@ pub async fn save_optimized_events(
             removed_count += 1;
         }
     }
-    logger::log_info(&format!("Corporate Storage: Removed {} old event files", removed_count)).await;
+    logger::log_info(&format!("Corporate Storage: Removed {} old files", removed_count)).await;

     let total_events = events.len();
     let mut sorted = events;
     sorted.sort_by(|a, b| {
-        a.ticker.cmp(&b.ticker)
-            .then(a.date.cmp(&b.date))
+        a.ticker.cmp(&b.ticker).then(a.date.cmp(&b.date))
     });

-    // Process in batches to avoid memory buildup
     let mut by_month: HashMap<String, Vec<CompanyEvent>> = HashMap::new();

     for chunk in sorted.chunks(BATCH_SIZE) {
@@ -146,27 +148,28 @@ pub async fn save_optimized_events(
         tokio::task::yield_now().await;
     }

-    let total_months = by_month.len();
     for (month, list) in by_month {
         let path = dir.join(format!("events_{}.json", month));
         fs::write(&path, serde_json::to_string_pretty(&list)?).await?;
-        logger::log_info(&format!("Corporate Storage: Saved {} events for month {}", list.len(), month)).await;
+        logger::log_info(&format!("Saved {} events for month {}", list.len(), month)).await;
     }

-    logger::log_info(&format!("Corporate Storage: Saved {} total events in {} month files", total_events, total_months)).await;
+    logger::log_info(&format!("Saved {} total events", total_events)).await;
     Ok(())
 }

-pub async fn save_changes(paths: &DataPaths, changes: &[CompanyEventChange]) -> anyhow::Result<()> {
+pub async fn save_changes(
+    paths: &DataPaths,
+    changes: &[CompanyEventChange]
+) -> anyhow::Result<()> {
     if changes.is_empty() {
         logger::log_info("Corporate Storage: No changes to save").await;
         return Ok(());
     }

     let dir = paths.corporate_changes_dir();
     fs::create_dir_all(dir).await?;

-    logger::log_info(&format!("Corporate Storage: Saving {} changes", changes.len())).await;

     let mut by_month: HashMap<String, Vec<CompanyEventChange>> = HashMap::new();
     for c in changes {
         if let Ok(d) = NaiveDate::parse_from_str(&c.date, "%Y-%m-%d") {
@@ -180,12 +183,13 @@ pub async fn save_changes(paths: &DataPaths, changes: &[CompanyEventChange]) ->
         let mut all = if path.exists() {
             let s = fs::read_to_string(&path).await?;
             serde_json::from_str(&s).unwrap_or_default()
-        } else { vec![] };
+        } else {
+            vec![]
+        };
         all.extend(list.clone());
         fs::write(&path, serde_json::to_string_pretty(&all)?).await?;
-        logger::log_info(&format!("Corporate Storage: Saved {} changes for month {}", list.len(), month)).await;
     }
-    logger::log_info("Corporate Storage: All changes saved successfully").await;
     Ok(())
 }

@@ -203,9 +207,7 @@ pub async fn save_prices_for_ticker(
     let path = timeframe_dir.join("prices.json");

     prices.sort_by_key(|p| (p.date.clone(), p.time.clone()));
-    let json = serde_json::to_string_pretty(&prices)?;
-    fs::write(&path, json).await?;
+    fs::write(&path, serde_json::to_string_pretty(&prices)?).await?;
     Ok(())
 }

@@ -240,7 +242,10 @@ pub async fn save_available_exchanges(
     Ok(())
 }

-pub async fn load_available_exchanges(paths: &DataPaths, lei: &str) -> anyhow::Result<Vec<AvailableExchange>> {
+pub async fn load_available_exchanges(
+    paths: &DataPaths,
+    lei: &str
+) -> anyhow::Result<Vec<AvailableExchange>> {
     let path = get_company_dir(paths, lei).join("available_exchanges.json");
     if path.exists() {
         let content = fs::read_to_string(&path).await?;
@@ -267,15 +272,13 @@ pub async fn save_prices_by_source(
     Ok(())
 }

-/// Saves companies data to a JSONL file in streaming fashion
+/// Stream companies to JSONL incrementally
 pub async fn save_companies_to_jsonl_streaming(
     paths: &DataPaths,
-    companies: &HashMap<String, HashMap<String, String>>,
-) -> anyhow::Result<()> {
+    companies_iter: impl Iterator<Item = (String, HashMap<String, String>)>,
+) -> anyhow::Result<usize> {
     let file_path = paths.data_dir().join("companies.jsonl");

-    logger::log_info(&format!("Corporate Storage: Saving {} companies to JSONL", companies.len())).await;

     if let Some(parent) = file_path.parent() {
         tokio::fs::create_dir_all(parent).await?;
     }
@@ -283,32 +286,33 @@ pub async fn save_companies_to_jsonl_streaming(
     let mut file = tokio::fs::File::create(&file_path).await?;
     let mut count = 0;

-    // Process in batches
-    for (name, securities) in companies.iter() {
+    for (name, securities) in companies_iter {
         let line = serde_json::json!({
             "name": name,
             "securities": securities
         });

         file.write_all(line.to_string().as_bytes()).await?;
         file.write_all(b"\n").await?;

         count += 1;

         if count % 100 == 0 {
             tokio::task::yield_now().await;
         }
     }

-    let msg = format!("✓ Saved {} companies to {:?}", companies.len(), file_path);
-    println!("{}", msg);
-    logger::log_info(&msg).await;
-    Ok(())
+    logger::log_info(&format!("Saved {} companies to JSONL", count)).await;
+    Ok(count)
 }

-/// Load companies from JSONL in streaming fashion
-pub async fn load_companies_from_jsonl_streaming(
+/// Stream read companies from JSONL
+pub async fn stream_companies_from_jsonl<F>(
     path: &Path,
-    callback: impl Fn(String, HashMap<String, String>) -> anyhow::Result<()>
-) -> anyhow::Result<usize> {
+    mut callback: F
+) -> anyhow::Result<usize>
+where
+    F: FnMut(String, HashMap<String, String>) -> anyhow::Result<()>,
+{
     if !path.exists() {
         return Ok(0);
     }
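Note: the storage helpers above now take `FnMut` callbacks so callers can accumulate state without loading whole files into memory. A minimal usage sketch, assuming only the signatures shown in this diff (the per-ticker counting and the `example` wrapper are illustrative, not part of the commit):

    // Illustrative driver for the streaming helpers introduced above.
    use std::collections::HashMap;

    async fn example(paths: &DataPaths) -> anyhow::Result<()> {
        // Count events per ticker while streaming month files one at a time.
        let mut per_ticker: HashMap<String, usize> = HashMap::new();
        let total_events = stream_events_with_callback(paths, |event| {
            *per_ticker.entry(event.ticker.clone()).or_default() += 1;
            Ok(())
        })
        .await?;

        // Same pattern for companies.jsonl, line by line.
        let path = paths.data_dir().join("companies.jsonl");
        let total_companies = stream_companies_from_jsonl(&path, |name, securities| {
            println!("{}: {} securities", name, securities.len());
            Ok(())
        })
        .await?;

        println!("streamed {} events, {} companies", total_events, total_companies);
        Ok(())
    }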
@@ -79,15 +79,11 @@ pub struct CompanyInfo{
     pub securities: HashMap<String, Vec<FigiInfo>>, // ISIN -> Vec<FigiInfo>
 }

-/// Company Meta Data
-/// # Attributes
-/// * lei: Structuring the companies by legal dependencies [LEI -> Vec<ISIN>]
-/// * figi: metadata with ISIN as key
-/*#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct CompanyMetadata {
-    pub lei: String,
-    pub figi: Option<Vec<FigiInfo>>,
-}*/
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct CompanyCrossPlatformInfo {
+    pub name: String,
+    pub isin_tickers_map: HashMap<String, Vec<String>>, // ISIN -> Tickers
+}

 /// Warrant Info
 ///
@@ -118,14 +114,6 @@ pub struct OptionInfo {
     pub options: HashMap<String, Vec<FigiInfo>>, // ISIN -> Vec<FigiInfo> (grouped by ISIN)
 }

-/*#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct PrimaryInfo {
-    pub isin: String,
-    pub name: String,
-    pub exchange_mic: String,
-    pub currency: String,
-}*/

 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct AvailableExchange {
     pub exchange_mic: String,
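Note: `CompanyCrossPlatformInfo` is what update.rs serializes line-by-line into companies.jsonl (via `serde_json::to_string`). One such line would look roughly like the following; the company name, ISIN, and ticker values here are invented for illustration, while the field names and the "YAHOO:"-tagged ticker convention come from the diff:

    {"name":"Example AG","isin_tickers_map":{"DE0001234567":["EXA","YAHOO:EXA.DE"]}}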
@@ -1,170 +1,274 @@
-// src/corporate/update.rs
+// src/corporate/update.rs - COMPLETE STREAMING VERSION
-use super::{scraper::*, storage::*, helpers::*, types::*, aggregation::*, openfigi::*};
+use super::{scraper::*, storage::*, helpers::*, types::*, openfigi::*, yahoo::*};
 use crate::config::Config;
 use crate::util::directories::DataPaths;
 use crate::util::logger;
 use crate::scraper::webdriver::ChromeDriverPool;

 use chrono::Local;
-use std::collections::{HashMap};
+use std::collections::HashMap;
 use std::sync::Arc;

-/// Main function: Full update for all companies with streaming to minimize memory usage
-pub async fn run_full_update(config: &Config, pool: &Arc<ChromeDriverPool>) -> anyhow::Result<()> {
-    let msg = "=== Starting LEI-based corporate full update (STREAMING) ===";
-    println!("{}", msg);
-    logger::log_info(msg).await;
+/// Main update function - fully streaming, minimal memory usage
+pub async fn run_full_update(_config: &Config, pool: &Arc<ChromeDriverPool>) -> anyhow::Result<()> {
+    logger::log_info("=== Corporate Update (STREAMING MODE) ===").await;

     let paths = DataPaths::new(".")?;

-    // Step 1: Download/locate GLEIF CSV (don't load into memory yet)
-    logger::log_info("Corporate Update: Downloading/locating GLEIF CSV...").await;
+    // Step 1: Download GLEIF CSV (don't load into memory)
+    logger::log_info("Step 1: Downloading GLEIF CSV...").await;
     let gleif_csv_path = match download_isin_lei_csv().await? {
         Some(p) => {
-            logger::log_info(&format!("Corporate Update: GLEIF CSV at: {}", p)).await;
+            logger::log_info(&format!(" ✓ GLEIF CSV at: {}", p)).await;
             p
         }
         None => {
-            logger::log_warn("Corporate Update: Could not obtain GLEIF CSV, continuing with limited data").await;
+            logger::log_warn(" ✗ Could not obtain GLEIF CSV").await;
             return Ok(());
         }
     };

     // Step 2: Load OpenFIGI type lists (small, cached)
-    logger::log_info("Corporate Update: Loading OpenFIGI type lists...").await;
-    if let Err(e) = load_figi_type_lists().await {
-        logger::log_warn(&format!("Could not load OpenFIGI type lists: {}", e)).await;
+    logger::log_info("Step 2: Loading OpenFIGI metadata...").await;
+    load_figi_type_lists().await.ok();
+    logger::log_info(" ✓ OpenFIGI metadata loaded").await;

+    // Step 3: Check mapping status and process only unmapped LEIs
+    logger::log_info("Step 3: Checking LEI-FIGI mapping status...").await;
+
+    let all_mapped = ensure_all_leis_mapped(&gleif_csv_path, None).await?;
+
+    if !all_mapped {
+        logger::log_warn(" ⚠ Some LEIs failed to map - continuing with partial data").await;
+    } else {
+        logger::log_info(" ✓ All LEIs successfully mapped").await;
     }

-    // Step 3: Process GLEIF → FIGI mapping in streaming fashion
-    logger::log_info("Corporate Update: Building FIGI mappings (streaming)...").await;
+    // Step 4: Build securities from FIGI data (streaming)
+    logger::log_info("Step 4: Building securities map (streaming)...").await;
+    let date_dir = find_most_recent_figi_date_dir(&paths).await?;

-    // Build LEI→ISINs map by streaming the CSV
-    let mut lei_to_isins: HashMap<String, Vec<String>> = HashMap::new();
-    let mut lei_batch = Vec::new();
-    const LEI_BATCH_SIZE: usize = 1000;

-    stream_gleif_csv(&gleif_csv_path, |lei, isin| {
-        lei_to_isins.entry(lei.clone()).or_default().push(isin);
-        lei_batch.push(lei);

-        // Process in batches
-        if lei_batch.len() >= LEI_BATCH_SIZE {
-            lei_batch.clear();
-        }

-        Ok(())
-    }).await?;

-    logger::log_info(&format!("Corporate Update: Collected {} LEIs", lei_to_isins.len())).await;

-    // Step 4: Build FIGI mappings in batches (process and save incrementally)
-    logger::log_info("Corporate Update: Processing FIGI mappings in batches...").await;
-    let figi_result = build_lei_to_figi_infos(&lei_to_isins, None).await;

-    // Don't keep the full result in memory - it's already saved to JSONL files
-    drop(figi_result);
-    drop(lei_to_isins); // Release this too

-    logger::log_info("Corporate Update: FIGI mappings saved to cache").await;

-    // Step 5: Load or build securities (streaming from JSONL files)
-    logger::log_info("Corporate Update: Building securities map (streaming)...").await;

-    let dir = DataPaths::new(".")?;
-    let map_cache_dir = dir.cache_gleif_openfigi_map_dir();

-    // Find the most recent date directory
-    let date_dir = find_most_recent_date_dir(&map_cache_dir).await?;

-    let (common_stocks, _warrants, _options) = if let Some(date_dir) = date_dir {
-        logger::log_info(&format!("Using FIGI data from: {:?}", date_dir)).await;
-        load_or_build_all_securities_streaming(&date_dir).await?
+    if let Some(date_dir) = date_dir {
+        logger::log_info(&format!(" Using FIGI data from: {:?}", date_dir)).await;
+        build_securities_from_figi_streaming(&date_dir).await?;
+        logger::log_info(" ✓ Securities map updated").await;
     } else {
-        logger::log_warn("No FIGI date directory found, using empty maps").await;
-        (HashMap::new(), HashMap::new(), HashMap::new())
-    };
+        logger::log_warn(" ✗ No FIGI data directory found").await;
+    }

-    logger::log_info(&format!("Corporate Update: Processing {} companies", common_stocks.len())).await;
+    // Step 5: Build companies JSONL (streaming from securities)
+    logger::log_info("Step 5: Building companies.jsonl (streaming)...").await;
+    let count = build_companies_jsonl_streaming(&paths, pool).await?;
+    logger::log_info(&format!(" ✓ Saved {} companies", count)).await;

-    // Step 6: Convert to simplified companies map and save incrementally
-    logger::log_info("Corporate Update: Building companies JSONL (streaming)...").await;
+    // Step 6: Process events (using index, not full load)
+    logger::log_info("Step 6: Processing events (using index)...").await;
+    let _event_index = build_event_index(&paths).await?;
+    logger::log_info(" ✓ Event index built").await;
+
+    logger::log_info("✓ Corporate update complete").await;
+    Ok(())
+}
+
+/// Stream companies.jsonl creation from securities cache - INCREMENTAL MODE
+async fn build_companies_jsonl_streaming(paths: &DataPaths, pool: &Arc<ChromeDriverPool>) -> anyhow::Result<usize> {
+    let path = DataPaths::new(".")?;
+    let corporate_path = path.data_dir().join("corporate").join("by_name");
+    let securities_path = corporate_path.join("common_stocks.json");
+
+    if !securities_path.exists() {
+        logger::log_warn("No common_stocks.json found").await;
+        return Ok(0);
+    }
+
+    // Load securities
+    let content = tokio::fs::read_to_string(securities_path).await?;
+    let securities: HashMap<String, CompanyInfo> = serde_json::from_str(&content)?;

     let companies_path = paths.data_dir().join("companies.jsonl");

-    // Create file and write incrementally
     if let Some(parent) = companies_path.parent() {
         tokio::fs::create_dir_all(parent).await?;
     }

-    let mut file = tokio::fs::File::create(&companies_path).await?;
-    let mut processed = 0;
+    // Load existing companies into a map
+    let mut existing_companies: HashMap<String, CompanyCrossPlatformInfo> = HashMap::new();

-    for (name, company_info) in common_stocks.iter() {
-        let mut isin_ticker_pairs: HashMap<String, String> = HashMap::new();
+    if companies_path.exists() {
+        logger::log_info("Loading existing companies.jsonl...").await;
+        let existing_content = tokio::fs::read_to_string(&companies_path).await?;
+        for line in existing_content.lines() {
+            if line.trim().is_empty() {
+                continue;
+            }
+            match serde_json::from_str::<CompanyCrossPlatformInfo>(line) {
+                Ok(company) => {
+                    existing_companies.insert(company.name.clone(), company);
+                }
+                Err(e) => {
+                    logger::log_warn(&format!("Failed to parse existing company line: {}", e)).await;
+                }
+            }
+        }
+        logger::log_info(&format!("Loaded {} existing companies", existing_companies.len())).await;
+    }
+
+    // Create temporary file for atomic write
+    let temp_path = companies_path.with_extension("jsonl.tmp");
+    let mut file = tokio::fs::File::create(&temp_path).await?;
+    let mut count = 0;
+    let mut updated_count = 0;
+    let mut new_count = 0;
+
+    use tokio::io::AsyncWriteExt;
+
+    for (name, company_info) in securities.iter() {
+        // Check if we already have this company
+        let existing_entry = existing_companies.remove(name);
+        let is_update = existing_entry.is_some();
+
+        // Start with existing ISIN-ticker map or create new one
+        let mut isin_tickers_map: HashMap<String, Vec<String>> =
+            existing_entry
+                .map(|e| e.isin_tickers_map)
+                .unwrap_or_default();
+
+        // Step 1: Extract unique ISIN-ticker pairs from FigiInfo
+        let mut unique_isin_ticker_pairs: HashMap<String, Vec<String>> = HashMap::new();
+
         for figi_infos in company_info.securities.values() {
             for figi_info in figi_infos {
-                if !figi_info.isin.is_empty() && !figi_info.ticker.is_empty() {
-                    isin_ticker_pairs.insert(figi_info.isin.clone(), figi_info.ticker.clone());
+                if !figi_info.isin.is_empty() {
+                    let tickers = unique_isin_ticker_pairs
+                        .entry(figi_info.isin.clone())
+                        .or_insert_with(Vec::new);
+
+                    // Add FIGI ticker if present and not duplicate
+                    if !figi_info.ticker.is_empty() && !tickers.contains(&figi_info.ticker) {
+                        tickers.push(figi_info.ticker.clone());
+                    }
                 }
             }
         }

-        if !isin_ticker_pairs.is_empty() {
-            use tokio::io::AsyncWriteExt;
-
-            let line = serde_json::json!({
-                "name": name,
-                "securities": isin_ticker_pairs
-            });
-
-            file.write_all(line.to_string().as_bytes()).await?;
+        // Step 2: Merge FIGI tickers into main map
+        for (isin, figi_tickers) in unique_isin_ticker_pairs {
+            let tickers = isin_tickers_map
+                .entry(isin.clone())
+                .or_insert_with(Vec::new);
+
+            // Add FIGI tickers that aren't already present
+            for figi_ticker in figi_tickers {
+                if !tickers.contains(&figi_ticker) {
+                    tickers.push(figi_ticker);
+                }
+            }
+
+            // Step 3: Check if we need to fetch Yahoo ticker for this ISIN
+            let has_yahoo_ticker = tickers.iter().any(|t| t.starts_with("YAHOO:"));
+
+            if !has_yahoo_ticker {
+                logger::log_info(&format!("Fetching Yahoo ticker for {} (ISIN: {})", name, isin)).await;
+                let yahoo_result = scrape_ticker_by_isin(pool, &isin).await;
+
+                match yahoo_result {
+                    Ok(result) => {
+                        let log_msg = match &result {
+                            YahooTickerResult::Found(ticker) =>
+                                format!("✓ Found Yahoo ticker {} for ISIN {}", ticker, isin),
+                            YahooTickerResult::NoResults =>
+                                format!("○ No search results for ISIN {}", isin),
+                            YahooTickerResult::NotFound =>
+                                format!("○ Empty ticker result for ISIN {}", isin),
+                            YahooTickerResult::AmbiguousResults =>
+                                format!("⚠ Ambiguous results for ISIN {}", isin),
+                        };
+
+                        if result.is_found() {
+                            logger::log_info(&log_msg).await;
+                        } else {
+                            logger::log_warn(&log_msg).await;
+                        }
+
+                        tickers.push(result.to_tagged_string());
+                    },
+                    Err(e) => {
+                        logger::log_warn(&format!("✗ Yahoo lookup error for ISIN {}: {}", isin, e)).await;
+                        tickers.push("YAHOO:ERROR".to_string());
+                    }
+                }
+            } else {
+                logger::log_warn(&format!("Skipping Yahoo lookup for {} ISIN {} - already has Yahoo data", name, isin)).await;
+            }
+        }
+
+        // Only write if we have ticker data
+        if !isin_tickers_map.is_empty() {
+            let company_entry = CompanyCrossPlatformInfo {
+                name: name.clone(),
+                isin_tickers_map,
+            };
+
+            let line = serde_json::to_string(&company_entry)?;
+
+            file.write_all(line.as_bytes()).await?;
             file.write_all(b"\n").await?;
-            processed += 1;

-            // Yield periodically
-            if processed % 100 == 0 {
+            // Flush after each write for crash safety
+            file.flush().await?;
+
+            count += 1;
+            if is_update {
+                updated_count += 1;
+            } else {
+                new_count += 1;
+            }
+
+            if count % 10 == 0 {
+                logger::log_info(&format!("Progress: {} companies ({} new, {} updated)", count, new_count, updated_count)).await;
                 tokio::task::yield_now().await;
-                logger::log_info(&format!("Saved {} companies so far...", processed)).await;
             }
         }
     }

-    logger::log_info(&format!("Corporate Update: Saved {} companies to JSONL", processed)).await;
-
-    // Step 7: Process events in streaming fashion
-    logger::log_info("Corporate Update: Processing events (streaming)...").await;
-
-    let event_index = build_event_index(&paths).await?;
-    logger::log_info(&format!("Corporate Update: Built index of {} events", event_index.len())).await;
+    // Write any remaining existing companies that weren't in securities
+    for (_name, company) in existing_companies {
+        let line = serde_json::to_string(&company)?;
+        file.write_all(line.as_bytes()).await?;
+        file.write_all(b"\n").await?;
+        file.flush().await?;
+        count += 1;
+        logger::log_warn(&format!("Preserved existing company: {}", _name)).await;
+    }
+
+    // Ensure all data is written
+    file.sync_all().await?;
+    drop(file);
||||||
// For now, we just maintain the index
|
// Atomic rename: replace old file with new one
|
||||||
// In a full implementation, you'd stream through tickers and update events
|
tokio::fs::rename(&temp_path, &companies_path).await?;
|
||||||
|
|
||||||
// Step 8: Save any updates
|
logger::log_info(&format!("✓ Completed: {} total companies ({} new, {} updated)", count, new_count, updated_count)).await;
|
||||||
logger::log_info("Corporate Update: Finalizing...").await;
|
|
||||||
|
Ok(count)
|
||||||
let msg = "✓ Corporate update complete (streaming)";
|
|
||||||
println!("{}", msg);
|
|
||||||
logger::log_info(msg).await;
|
|
||||||
Ok(())
|
|
||||||
}
|
}
|
||||||
|
|
||||||
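// --- Illustrative note (not part of this commit) ---
// Each line written above is one serialized CompanyCrossPlatformInfo record. Based on the
// fields used in this function (name + isin_tickers_map), a line would look roughly like:
//
//   {"name":"Example AG","isin_tickers_map":{"DE0001234567":["EXA","YAHOO:EXA.DE"]}}
//
// The company name, ISIN, and tickers here are invented for illustration; the exact JSON
// shape depends on the serde derive for CompanyCrossPlatformInfo.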
/// Find most recent FIGI date directory
async fn find_most_recent_figi_date_dir(paths: &DataPaths) -> anyhow::Result<Option<std::path::PathBuf>> {
    let map_cache_dir = paths.cache_gleif_openfigi_map_dir();

    if !map_cache_dir.exists() {
        return Ok(None);
    }

    let mut entries = tokio::fs::read_dir(&map_cache_dir).await?;
    let mut dates = Vec::new();

    while let Some(entry) = entries.next_entry().await? {
        let path = entry.path();
        if path.is_dir() {
            if let Some(name) = path.file_name().and_then(|n| n.to_str()) {
                // Date format: DDMMYYYY
                if name.len() == 8 && name.chars().all(|c| c.is_numeric()) {
                    dates.push((name.to_string(), path));
                }
@@ -176,67 +280,16 @@ async fn find_most_recent_date_dir(map_cache_dir: &std::path::Path) -> anyhow::R
        return Ok(None);
    }

    dates.sort_by(|a, b| b.0.cmp(&a.0)); // Descending order

    Ok(Some(dates[0].1.clone()))
}
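// --- Illustrative note (not part of this commit) ---
// Caveat on the sort above: if the directory names really are DDMMYYYY as the comment says,
// a plain lexicographic sort compares the day digits first, so "15012024" (15 Jan 2024)
// sorts after "01022025" (01 Feb 2025) even though it is older. One sketch of a
// chronology-aware sort, assuming eight ASCII digits (is_numeric is slightly looser than that):
//
//   let key = |n: &str| format!("{}{}{}", &n[4..8], &n[2..4], &n[0..2]); // -> YYYYMMDD
//   dates.sort_by(|a, b| key(&b.0).cmp(&key(&a.0)));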
pub struct ProcessResult {
    pub changes: Vec<CompanyEventChange>,
}

/// Process events in batches to avoid memory buildup
pub async fn process_events_streaming(
    index: &[EventIndex],
    new_events: &[CompanyEvent],
    today: &str,
) -> anyhow::Result<(Vec<CompanyEventChange>, Vec<CompanyEvent>)> {
    let mut all_changes = Vec::new();
    let mut final_events: HashMap<String, CompanyEvent> = HashMap::new();

    // Step 1: Load existing events in batches using the index
    logger::log_info("Loading existing events in batches...").await;

    let mut loaded_files = std::collections::HashSet::new();

    for entry in index {
        if loaded_files.contains(&entry.file_path) {
            continue;
        }

        let content = tokio::fs::read_to_string(&entry.file_path).await?;
        let events: Vec<CompanyEvent> = serde_json::from_str(&content)?;

        for e in events {
            final_events.insert(event_key(&e), e);
        }

        loaded_files.insert(entry.file_path.clone());

        if final_events.len() % 1000 == 0 {
            logger::log_info(&format!("Loaded {} events so far...", final_events.len())).await;
            tokio::task::yield_now().await;
        }
    }

    logger::log_info(&format!("Loaded {} existing events", final_events.len())).await;

    // Step 2: Process new events in batches
    for (idx, batch) in new_events.chunks(500).enumerate() {
        logger::log_info(&format!("Processing batch {} ({} events)", idx + 1, batch.len())).await;

        let batch_result = process_batch(batch, &mut final_events, today);
        all_changes.extend(batch_result.changes);

        tokio::task::yield_now().await;
    }

    let events_vec: Vec<CompanyEvent> = final_events.into_values().collect();

    Ok((all_changes, events_vec))
}

pub fn process_batch(
    new_events: &[CompanyEvent],
    existing: &mut HashMap<String, CompanyEvent>,
@@ -253,7 +306,6 @@ pub fn process_batch(
            continue;
        }

        let date_key = format!("{}|{}", new.ticker, new.date);
        let mut found_old = None;
        for (k, e) in existing.iter() {
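// --- Illustrative sketch (not part of this commit) ---
// How the streaming pieces above are meant to fit together, assuming `index` was built
// elsewhere and `new_events` / `today` come from the surrounding update flow:
//
//   let (changes, merged) = process_events_streaming(&index, &new_events, &today).await?;
//   logger::log_info(&format!("{} changes, {} events after merge", changes.len(), merged.len())).await;
//
// Processing in chunks of 500 and yielding between batches keeps the task cooperative;
// the final HashMap still holds the fully merged event set.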
312
src/corporate/yahoo.rs
Normal file
@@ -0,0 +1,312 @@
// src/corporate/yahoo.rs
use super::{types::*, helpers::*};
use crate::{scraper::webdriver::*, util::directories::DataPaths};
use event_backtest_engine::logger;
use fantoccini::{Client, Locator};
use serde::{Deserialize, Serialize};
use tokio::time::{Duration as TokioDuration, sleep};
use std::sync::Arc;
use anyhow::{anyhow, Result};

// Maps existing company records to Yahoo Finance tickers, and fetches historical
// stock price data: daily (xxxx - 2025) and hourly (last 30 days).

/// Outcome of a Yahoo Finance ticker lookup by ISIN.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum YahooTickerResult {
    Found(String),
    NotFound,
    NoResults,
    AmbiguousResults,
}
impl YahooTickerResult {
    pub fn to_tagged_string(&self) -> String {
        match self {
            YahooTickerResult::Found(ticker) => format!("YAHOO:{}", ticker),
            YahooTickerResult::NotFound => "YAHOO:NOT_FOUND".to_string(),
            YahooTickerResult::NoResults => "YAHOO:NO_RESULTS".to_string(),
            YahooTickerResult::AmbiguousResults => "YAHOO:AMBIGUOUS".to_string(),
        }
    }

    pub fn is_found(&self) -> bool {
        matches!(self, YahooTickerResult::Found(_))
    }

    pub fn get_ticker(&self) -> Option<&str> {
        match self {
            YahooTickerResult::Found(ticker) => Some(ticker),
            _ => None,
        }
    }
}
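// --- Illustrative note (not part of this commit) ---
// The "YAHOO:" tag keeps lookup outcomes distinguishable from plain FIGI tickers when they
// are stored together in isin_tickers_map, e.g.:
//
//   assert_eq!(YahooTickerResult::Found("BMW.DE".into()).to_tagged_string(), "YAHOO:BMW.DE");
//   assert_eq!(YahooTickerResult::NoResults.to_tagged_string(), "YAHOO:NO_RESULTS");
//
// Entries tagged NOT_FOUND / NO_RESULTS / AMBIGUOUS (or the ERROR marker written by the
// caller) mark ISINs that still need review.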
pub async fn scrape_ticker_by_isin(
    pool: &Arc<ChromeDriverPool>,
    isin: &str,
) -> anyhow::Result<YahooTickerResult> {
    let isin = isin.to_string();
    pool.execute(format!("https://finance.yahoo.com/lookup?s={}", isin), move |client| {
        let isin = isin.clone();
        Box::pin(async move {
            sleep(TokioDuration::from_millis(1000)).await;
            reject_yahoo_cookies(&client).await?;
            sleep(TokioDuration::from_millis(1000)).await;
            extract_ticker_by_isin(&client, &isin).await
        })
    }).await
}

pub async fn extract_ticker_by_isin(
    client: &Client,
    _isin: &str,
) -> Result<YahooTickerResult> {
    //let search_url = format!("https://finance.yahoo.com/lookup?s={}", isin);

    // Check for "No results found" message
    if client.find(Locator::Css(".noData")).await.is_ok() {
        return Ok(YahooTickerResult::NoResults);
    }

    // Wait for results table
    let table = match client
        .wait()
        .for_element(Locator::Css("table[data-test='lookup-table']"))
        .await
    {
        Ok(t) => t,
        Err(_) => return Ok(YahooTickerResult::NoResults),
    };

    // Find first row
    let first_row = match table
        .find(Locator::Css("tbody tr"))
        .await
    {
        Ok(row) => row,
        Err(_) => return Ok(YahooTickerResult::NoResults),
    };

    // Extract ticker from first cell
    let ticker_cell = first_row
        .find(Locator::Css("td:nth-child(1)"))
        .await
        .map_err(|e| anyhow!("Failed to find ticker cell: {}", e))?;

    let ticker = ticker_cell
        .text()
        .await
        .map_err(|e| anyhow!("Failed to get ticker text: {}", e))?
        .trim()
        .to_string();

    if ticker.is_empty() {
        Ok(YahooTickerResult::NotFound)
    } else {
        Ok(YahooTickerResult::Found(ticker))
    }
}
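// --- Illustrative sketch (not part of this commit) ---
// A typical call site, assuming a shared ChromeDriver pool built at startup; the ISIN is
// only an example value:
//
//   match scrape_ticker_by_isin(&pool, "US0378331005").await? {
//       YahooTickerResult::Found(t) => println!("ISIN maps to {}", t),
//       other => println!("lookup ended as: {}", other.to_tagged_string()),
//   }
//
// Note that extract_ticker_by_isin depends on the current markup of the Yahoo lookup page
// (".noData", "table[data-test='lookup-table']"), so the selectors may need updating if
// Yahoo changes the page.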
pub async fn get_all_tickers_from_companies_jsonl(paths: &DataPaths) -> anyhow::Result<Vec<String>> {
    let corporate_path = paths.data_dir().join("corporate").join("by_name");
    let companies_file = corporate_path.join("companies.jsonl");
    let content = tokio::fs::read_to_string(companies_file).await?;
    let mut tickers = Vec::new();
    for line in content.lines() {
        let company: CompanyCrossPlatformInfo = serde_json::from_str(line)?;
        for (_isin, ticker_vec) in company.isin_tickers_map {
            tickers.extend(ticker_vec);
        }
    }
    Ok(tickers)
}
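// --- Illustrative sketch (not part of this commit) ---
// The list returned above mixes plain FIGI tickers with "YAHOO:"-tagged entries. A caller
// that only wants usable Yahoo symbols could filter like this:
//
//   let yahoo_symbols: Vec<String> = tickers.iter()
//       .filter_map(|t| t.strip_prefix("YAHOO:"))
//       .filter(|t| !matches!(*t, "NOT_FOUND" | "NO_RESULTS" | "AMBIGUOUS" | "ERROR"))
//       .map(str::to_string)
//       .collect();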
/// Fetches earnings events for a ticker using a dedicated ScrapeTask.
///
/// This function creates and executes a ScrapeTask to navigate to the Yahoo Finance earnings calendar,
/// reject cookies, and extract the events.
///
/// # Arguments
/// * `pool` - The shared ChromeDriver pool used to run the scrape task.
/// * `ticker` - The stock ticker symbol.
///
/// # Returns
/// A vector of CompanyEvent structs on success.
///
/// # Errors
/// Returns an error if the task execution fails, e.g., chromedriver spawn or navigation issues.
pub async fn fetch_earnings_with_pool(
    pool: &Arc<ChromeDriverPool>,
    ticker: &str,
) -> anyhow::Result<Vec<CompanyEvent>> {
    let ticker = ticker.to_string();
    let url = format!("https://finance.yahoo.com/calendar/earnings?symbol={}", ticker);

    let ticker_cloned = ticker.clone();

    pool.execute(url, move |client| {
        let ticker = ticker_cloned.clone();
        Box::pin(async move {
            reject_yahoo_cookies(&client).await?;
            extract_earnings_events(&client, &ticker).await
        })
    }).await
}
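// --- Illustrative sketch (not part of this commit) ---
// Example driver loop combining the helpers in this file; `paths` and `pool` are assumed to
// come from the application setup, and tagged placeholder entries would be filtered out
// first (see the note above):
//
//   for ticker in get_all_tickers_from_companies_jsonl(&paths).await? {
//       let events = fetch_earnings_with_pool(&pool, &ticker).await?;
//       println!("{}: {} earnings rows", ticker, events.len());
//   }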
/// Extracts earnings events from the currently loaded Yahoo Finance earnings calendar page.
///
/// This function assumes the client is already navigated to the correct URL (e.g.,
/// https://finance.yahoo.com/calendar/earnings?symbol={ticker}) and cookies are handled.
///
/// It waits for the earnings table, extracts rows, parses cells into CompanyEvent structs,
/// and handles date parsing, float parsing, and optional fields.
///
/// # Arguments
/// * `client` - The fantoccini Client with the page loaded.
/// * `ticker` - The stock ticker symbol for the events.
///
/// # Returns
/// A vector of CompanyEvent on success.
///
/// # Errors
/// Returns an error if:
/// - Table or elements not found.
/// - Date or float parsing fails.
/// - WebDriver operations fail.
///
/// # Examples
///
/// ```no_run
/// use fantoccini::Client;
/// use crate::corporate::yahoo::extract_earnings_events;
///
/// #[tokio::main]
/// async fn main() -> anyhow::Result<()> {
///     // Assume client is set up and navigated
///     let events = extract_earnings_events(&client, "AAPL").await?;
///     Ok(())
/// }
/// ```
pub async fn extract_earnings_events(client: &Client, ticker: &str) -> Result<Vec<CompanyEvent>> {
    // Wait for the table to load
    let table = client
        .wait()
        .for_element(Locator::Css(r#"table[data-test="cal-table"]"#))
        .await
        .map_err(|e| anyhow!("Failed to find earnings table: {}", e))?;

    // Find all rows in tbody
    let rows = table
        .find_all(Locator::Css("tbody tr"))
        .await
        .map_err(|e| anyhow!("Failed to find table rows: {}", e))?;

    let mut events = Vec::with_capacity(rows.len());

    for row in rows {
        let cells = row
            .find_all(Locator::Css("td"))
            .await
            .map_err(|e| anyhow!("Failed to find cells in row: {}", e))?;

        if cells.len() < 5 {
            continue; // Skip incomplete rows
        }

        // Extract and parse date
        let date_str = cells[0]
            .text()
            .await
            .map_err(|e| anyhow!("Failed to get date text: {}", e))?;
        let date = parse_yahoo_date(&date_str)
            .map_err(|e| anyhow!("Failed to parse date '{}': {}", date_str, e))?
            .format("%Y-%m-%d")
            .to_string();

        // Extract time, replace "Time Not Supplied" with empty
        let time = cells[1]
            .text()
            .await
            .map_err(|e| anyhow!("Failed to get time text: {}", e))?
            .replace("Time Not Supplied", "");

        // Extract period
        let period = cells[2]
            .text()
            .await
            .map_err(|e| anyhow!("Failed to get period text: {}", e))?;

        // Parse EPS forecast
        let eps_forecast_str = cells[3]
            .text()
            .await
            .map_err(|e| anyhow!("Failed to get EPS forecast text: {}", e))?;
        let eps_forecast = parse_float(&eps_forecast_str);

        // Parse EPS actual
        let eps_actual_str = cells[4]
            .text()
            .await
            .map_err(|e| anyhow!("Failed to get EPS actual text: {}", e))?;
        let eps_actual = parse_float(&eps_actual_str);

        // Parse surprise % if available
        let surprise_pct = if cells.len() > 5 {
            let surprise_str = cells[5]
                .text()
                .await
                .map_err(|e| anyhow!("Failed to get surprise text: {}", e))?;
            parse_float(&surprise_str)
        } else {
            None
        };

        events.push(CompanyEvent {
            ticker: ticker.to_string(),
            date,
            time,
            period,
            eps_forecast,
            eps_actual,
            revenue_forecast: None,
            revenue_actual: None,
            surprise_pct,
            source: "Yahoo".to_string(),
        });
    }

    if events.is_empty() {
        eprintln!("Warning: No earnings events extracted for ticker {}", ticker);
    } else {
        println!("Extracted {} earnings events for {}", events.len(), ticker);
    }

    Ok(events)
}
/// Rejects the Yahoo cookie consent dialog if it is shown.
async fn reject_yahoo_cookies(client: &Client) -> anyhow::Result<()> {
    for _ in 0..10 {
        let clicked: bool = client
            .execute(
                r#"(() => {
                    const btn = document.querySelector('#consent-page .reject-all');
                    if (btn) {
                        btn.click();
                        return true;
                    }
                    return false;
                })()"#,
                vec![],
            )
            .await?
            .as_bool()
            .unwrap_or(false);

        if clicked { break; }
        sleep(TokioDuration::from_millis(500)).await;
    }

    logger::log_info("Rejected Yahoo cookies if button existed").await;
    Ok(())
}
@@ -116,7 +116,7 @@ pub async fn build_event_index(chunks: &[ChunkInfo]) -> anyhow::Result<Vec<Event
    Ok(index)
}

/// Look up a specific event by loading only its chunk
pub async fn lookup_event_by_key(key: &str, index: &[EventIndex]) -> anyhow::Result<Option<EconomicEvent>> {
    // Find which chunk contains this event
    let entry = index.iter().find(|e| e.key == key);

14
src/main.rs
@@ -14,11 +14,6 @@ use util::directories::DataPaths;
use util::{logger, opnv};
use std::sync::Arc;

#[tokio::main]
async fn main() -> Result<()> {
    cleanup_all_proxy_containers().await.ok();
@@ -138,7 +133,7 @@ async fn main() -> Result<()> {
            std::process::exit(0);
        });
    }

    // === Step 4: Run the actual scraping jobs ===
    logger::log_info("--- Starting ECONOMIC data update ---").await;
    economic::run_full_update(&config, &pool).await?;
@@ -161,9 +156,4 @@ async fn main() -> Result<()> {

    logger::log_info("=== Application finished successfully ===").await;
    Ok(())
}
@@ -1,379 +0,0 @@
// tests/vpn_integration_tests.rs
//! Integration tests for VPN rotation system

#[cfg(test)]
mod vpn_tests {
    use event_backtest_engine::{
        scraper::{
            webdriver::ChromeDriverPool,
            vpn_manager::{VpnInstance, VpnPool},
        },
        util::{directories::DataPaths, opnv},
    };
    use std::path::PathBuf;
    use std::sync::Arc;

    /// Helper to create a test VPN instance without connecting
    fn create_test_vpn_instance() -> VpnInstance {
        VpnInstance::new(
            PathBuf::from("test.ovpn"),
            "testuser".to_string(),
            "testpass".to_string(),
        )
        .expect("Failed to create test VPN instance")
    }

    #[test]
    fn test_vpn_instance_creation() {
        let vpn = create_test_vpn_instance();
        assert_eq!(vpn.hostname(), "test");
        assert!(!vpn.is_healthy());
        assert!(vpn.external_ip().is_none());
    }

    #[test]
    fn test_vpn_task_counting() {
        let mut vpn = create_test_vpn_instance();

        // Should not rotate initially
        assert!(!vpn.increment_task_count(10));

        // Increment tasks
        for i in 1..10 {
            assert!(!vpn.increment_task_count(10), "Should not rotate at task {}", i);
        }

        // Should rotate at threshold
        assert!(vpn.increment_task_count(10), "Should rotate at task 10");

        // Reset and verify
        vpn.reset_task_count();
        assert!(!vpn.increment_task_count(10), "Should not rotate after reset");
    }

    #[test]
    fn test_vpn_task_counting_zero_threshold() {
        let mut vpn = create_test_vpn_instance();

        // With threshold=0, should never auto-rotate
        for _ in 0..100 {
            assert!(!vpn.increment_task_count(0));
        }
    }

    #[tokio::test]
    async fn test_chromedriver_pool_creation_no_vpn() {
        let result = ChromeDriverPool::new(2).await;

        match result {
            Ok(pool) => {
                assert_eq!(pool.get_number_of_instances(), 2);
                assert!(!pool.is_vpn_enabled());
            }
            Err(e) => {
                eprintln!("ChromeDriver pool creation failed (expected if chromedriver not installed): {}", e);
            }
        }
    }

    #[test]
    fn test_data_paths_creation() {
        let paths = DataPaths::new("./test_data").expect("Failed to create paths");

        assert!(paths.data_dir().exists());
        assert!(paths.cache_dir().exists());
        assert!(paths.logs_dir().exists());
        assert!(paths.cache_openvpn_dir().exists());

        // Cleanup
        let _ = std::fs::remove_dir_all("./test_data");
    }

    #[tokio::test]
    #[ignore] // This test requires actual network access and VPNBook availability
    async fn test_fetch_vpnbook_configs() {
        let paths = DataPaths::new(".").expect("Failed to create paths");

        // This test requires a ChromeDriver pool
        let pool_result = ChromeDriverPool::new(1).await;
        if pool_result.is_err() {
            eprintln!("Skipping VPNBook fetch test: ChromeDriver not available");
            return;
        }

        let pool = Arc::new(pool_result.unwrap());

        let result = opnv::fetch_vpnbook_configs(&pool, paths.cache_dir()).await;

        match result {
            Ok((username, password, files)) => {
                assert!(!username.is_empty(), "Username should not be empty");
                assert!(!password.is_empty(), "Password should not be empty");
                assert!(!files.is_empty(), "Should fetch at least one config file");

                println!("Fetched {} VPN configs", files.len());
                for file in &files {
                    assert!(file.exists(), "Config file should exist: {:?}", file);
                    assert_eq!(file.extension().and_then(|s| s.to_str()), Some("ovpn"));
                }
            }
            Err(e) => {
                eprintln!("VPNBook fetch failed (may be temporary): {}", e);
            }
        }
    }

    #[tokio::test]
    #[ignore] // Requires actual VPN configs and OpenVPN installation
    async fn test_vpn_pool_creation() {
        let paths = DataPaths::new(".").expect("Failed to create paths");

        // First fetch configs
        let pool_result = ChromeDriverPool::new(1).await;
        if pool_result.is_err() {
            eprintln!("Skipping VPN pool test: ChromeDriver not available");
            return;
        }

        let temp_pool = Arc::new(pool_result.unwrap());
        let fetch_result = opnv::fetch_vpnbook_configs(&temp_pool, paths.cache_dir()).await;

        if fetch_result.is_err() {
            eprintln!("Skipping VPN pool test: Could not fetch configs");
            return;
        }

        let (username, password, _) = fetch_result.unwrap();

        // Create VPN pool
        let vpn_pool_result = VpnPool::new(
            paths.cache_openvpn_dir(),
            username,
            password,
            false,
            0,
        ).await;

        match vpn_pool_result {
            Ok(vpn_pool) => {
                assert!(vpn_pool.len() > 0, "VPN pool should have at least one instance");
                println!("Created VPN pool with {} instances", vpn_pool.len());
            }
            Err(e) => {
                eprintln!("VPN pool creation failed: {}", e);
            }
        }
    }

    #[tokio::test]
    #[ignore] // Full integration test - requires all components
    async fn test_full_vpn_integration() {
        let paths = DataPaths::new(".").expect("Failed to create paths");

        // Step 1: Create temp ChromeDriver pool for fetching
        let temp_pool = match ChromeDriverPool::new(1).await {
            Ok(p) => Arc::new(p),
            Err(e) => {
                eprintln!("Skipping integration test: ChromeDriver not available - {}", e);
                return;
            }
        };

        // Step 2: Fetch VPNBook configs
        let (username, password, files) = match opnv::fetch_vpnbook_configs(
            &temp_pool,
            paths.cache_dir()
        ).await {
            Ok(result) => result,
            Err(e) => {
                eprintln!("Skipping integration test: Config fetch failed - {}", e);
                return;
            }
        };

        assert!(!files.is_empty(), "Should have fetched configs");

        // Step 3: Create VPN pool
        let vpn_pool = match VpnPool::new(
            paths.cache_openvpn_dir(),
            username,
            password,
            true,
            5,
        ).await {
            Ok(pool) => Arc::new(pool),
            Err(e) => {
                eprintln!("Skipping integration test: VPN pool creation failed - {}", e);
                return;
            }
        };

        // Step 4: Connect one VPN
        let vpn_instance = vpn_pool.acquire().await.expect("Failed to acquire VPN");
        let connect_result = {
            let mut vpn = vpn_instance.lock().await;
            vpn.connect().await
        };

        match connect_result {
            Ok(_) => {
                let vpn = vpn_instance.lock().await;
                println!("✓ VPN connected: {} ({})",
                    vpn.hostname(),
                    vpn.external_ip().unwrap_or("unknown")
                );
                assert!(vpn.is_healthy());
                assert!(vpn.external_ip().is_some());
            }
            Err(e) => {
                eprintln!("VPN connection failed: {}", e);
            }
        }

        // Step 5: Create ChromeDriver pool with VPN
        let driver_pool_result = ChromeDriverPool::new_with_vpn(
            1,
            Some(vpn_pool.clone())
        ).await;

        match driver_pool_result {
            Ok(driver_pool) => {
                assert!(driver_pool.is_vpn_enabled());
                println!("✓ ChromeDriver pool created with VPN binding");
            }
            Err(e) => {
                eprintln!("ChromeDriver pool creation failed: {}", e);
            }
        }

        // Step 6: Cleanup
        vpn_pool.disconnect_all().await.expect("Failed to disconnect VPNs");
        println!("✓ Integration test complete");
    }

    #[test]
    fn test_hostname_extraction() {
        // Test the hostname extraction logic
        let test_cases = vec![
            ("test/ca149.vpnbook.com/config.ovpn", "ca149.vpnbook.com"),
            ("test/us1.vpnbook.com/config.ovpn", "us1.vpnbook.com"),
            ("test/de4.vpnbook.com/config.ovpn", "de4.vpnbook.com"),
        ];

        for (path, expected_hostname) in test_cases {
            let pb = PathBuf::from(path);
            let hostname = pb.parent()
                .and_then(|p| p.file_name())
                .and_then(|n| n.to_str())
                .unwrap_or("unknown");

            assert_eq!(hostname, expected_hostname);
        }
    }

    #[cfg(target_os = "windows")]
    #[test]
    fn test_forcebindip_manager_creation() {
        use event_backtest_engine::ForceBindIpManager;

        match ForceBindIpManager::new() {
            Ok(manager) => {
                println!("✓ ForceBindIP found at: {:?}", manager.path());
                assert!(manager.path().exists());
            }
            Err(e) => {
                eprintln!("ForceBindIP not found (expected in dev): {}", e);
            }
        }
    }

    #[cfg(target_os = "windows")]
    #[test]
    fn test_forcebindip_command_creation() {
        use event_backtest_engine::ForceBindIpManager;
        use std::path::Path;

        if let Ok(manager) = ForceBindIpManager::new() {
            let cmd = manager.create_bound_command(
                "192.168.1.100",
                Path::new("test.exe"),
                &["--arg1", "value1"],
            );

            let cmd_str = format!("{:?}", cmd);
            assert!(cmd_str.contains("192.168.1.100"));
            assert!(cmd_str.contains("test.exe"));
            println!("✓ ForceBindIP command created successfully");
        }
    }

    #[test]
    fn test_config_defaults() {
        use event_backtest_engine::Config;

        let config = Config::default();
        assert_eq!(config.economic_start_date, "2007-02-13");
        assert_eq!(config.corporate_start_date, "2010-01-01");
        assert_eq!(config.economic_lookahead_months, 3);
        assert_eq!(config.max_parallel_instances, 10);
        assert!(!config.enable_vpn_rotation);
        assert_eq!(config.tasks_per_vpn_session, 0);
    }
}

#[cfg(test)]
mod benchmark_tests {
    use super::*;

    #[tokio::test]
    #[ignore] // Performance test
    async fn benchmark_vpn_rotation_overhead() {
        use std::time::Instant;

        // This test measures the overhead of VPN rotation
        let start = Instant::now();

        // Simulate rotation cycle
        // 1. Disconnect (instant)
        // 2. Wait 2 seconds
        // 3. Connect (5-10 seconds)
        // 4. Verify IP (1-2 seconds)

        tokio::time::sleep(tokio::time::Duration::from_secs(2)).await;

        let elapsed = start.elapsed();
        println!("Rotation cycle took: {:?}", elapsed);

        // Typical rotation should complete in under 15 seconds
        assert!(elapsed.as_secs() < 15);
    }

    #[tokio::test]
    #[ignore] // Performance test
    async fn benchmark_parallel_scraping() {
        // This test measures throughput with different parallelism levels
        // Results help tune MAX_PARALLEL_INSTANCES

        let configs = vec![1, 2, 3, 5, 10];

        for &pool_size in &configs {
            println!("Testing with {} parallel instances...", pool_size);

            // Would need actual scraping implementation here
            // For now, just verify pool creation time
            let start = std::time::Instant::now();

            let pool_result = event_backtest_engine::ChromeDriverPool::new(pool_size).await;

            if let Ok(_pool) = pool_result {
                let elapsed = start.elapsed();
                println!("  Pool initialization: {:?}", elapsed);

                // Pool creation should be fast (< 5 seconds per instance)
                assert!(elapsed.as_secs() < pool_size as u64 * 5);
            } else {
                eprintln!("  Skipped - ChromeDriver not available");
            }
        }
    }
}
Block a user