Compare commits
10 Commits
b0a471ea84 ... 2416947e9d
| Author | SHA1 | Date |
|---|---|---|
| | 2416947e9d | |
| | 3ab5d0dcc3 | |
| | c2408d9a56 | |
| | f95e9e2427 | |
| | c00bfd8687 | |
| | 0f89c8c0ce | |
| | a6823dc938 | |
| | 58a498e694 | |
| | f7083bf9f0 | |
| | f05df0b5ee | |
.env.example (new file, 48 lines)
@@ -0,0 +1,48 @@
+# WebScraper Configuration File (.env)
+# ====================================
+# This file configures the behavior of the WebScraper application
+# Copy to .env and adjust values as needed
+
+# ===== ECONOMIC DATA =====
+# Start date for economic event scraping
+ECONOMIC_START_DATE=2007-02-13
+
+# How far into the future to look ahead for economic events (in months)
+ECONOMIC_LOOKAHEAD_MONTHS=3
+
+# ===== CORPORATE DATA =====
+# Start date for corporate earnings/data scraping
+CORPORATE_START_DATE=2010-01-01
+
+# ===== PERFORMANCE & CONCURRENCY =====
+# Maximum number of parallel ChromeDriver instances
+# Higher = more concurrent tasks, but higher resource usage
+MAX_PARALLEL_INSTANCES=3
+
+# Maximum tasks per ChromeDriver instance before recycling
+# 0 = unlimited (instance lives for entire application runtime)
+MAX_TASKS_PER_INSTANCE=0
+
+# ===== VPN ROTATION (ProtonVPN Integration) =====
+# Enable automatic VPN rotation between sessions?
+# If false, all traffic goes through system without VPN tunneling
+ENABLE_VPN_ROTATION=false
+
+# Comma-separated list of ProtonVPN servers to rotate through
+# Examples:
+# "US-Free#1,US-Free#2,UK-Free#1"
+# "US,UK,JP,DE,NL"
+# NOTE: Must have ENABLE_VPN_ROTATION=true for this to take effect
+VPN_SERVERS=
+
+# Number of tasks per VPN session before rotating to new server/IP
+# 0 = rotate between economic and corporate phases (one phase = one IP)
+# 5 = rotate every 5 tasks
+# NOTE: Must have ENABLE_VPN_ROTATION=true for this to take effect
+TASKS_PER_VPN_SESSION=0
+
+# ===== LOGGING =====
+# Set via RUST_LOG environment variable:
+# RUST_LOG=info cargo run
+# RUST_LOG=debug cargo run
+# Leave empty or unset for default logging level
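These settings are consumed at startup via `dotenvy`, as the `Config` changes further down this diff show. A minimal standalone sketch of that pattern (the variable names come from the file above; the function itself and its defaults, which mirror the loader in the `Config` hunk below, are illustrative):

```rust
// Sketch only: read two of the variables defined in .env.example,
// falling back to defaults when they are unset (mirrors the Config loader below).
fn read_scraper_env() -> anyhow::Result<(usize, bool)> {
    let max_parallel: usize = dotenvy::var("MAX_PARALLEL_INSTANCES")
        .unwrap_or_else(|_| "10".to_string())
        .parse()?;
    let vpn_rotation: bool = dotenvy::var("ENABLE_VPN_ROTATION")
        .unwrap_or_else(|_| "false".to_string())
        .parse()?;
    Ok((max_parallel, vpn_rotation))
}
```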
.gitignore (vendored, 21 changed lines)
@@ -27,10 +27,17 @@ target/
 # /chromedriver-win64/*
 
-# data folders
-/economic_events*
-/economic_event_changes*
-/corporate_events*
-/corporate_prices*
-/corporate_event_changes*
-/data*
+# data files
+**/*.json
+**/*.jsonl
+**/*.csv
+**/*.zip
+**/*.log
+**/*.ovpn
 
+#/economic_events*
+#/economic_event_changes*
+#/corporate_events*
+#/corporate_prices*
+#/corporate_event_changes*
+#/data*
Cargo.lock (generated, 3 additions)
@@ -671,8 +671,10 @@ dependencies = [
 "fantoccini",
 "flate2",
 "futures",
+ "once_cell",
 "rand 0.9.2",
 "rayon",
+ "regex",
 "reqwest",
 "scraper",
 "serde",
@@ -681,6 +683,7 @@ dependencies = [
 "toml",
 "tracing",
 "tracing-subscriber",
+ "url",
 "yfinance-rs",
 "zip",
 ]
Cargo.toml
@@ -21,6 +21,7 @@ reqwest = { version = "0.12", features = ["json", "gzip", "brotli", "deflate", "
 scraper = "0.19" # HTML parsing for Yahoo earnings pages
 fantoccini = { version = "0.20", features = ["rustls-tls"] } # Headless Chrome for finanzen.net
 yfinance-rs = "0.7.2"
+url = "2.5.7"
 
 # Serialization
 serde = { version = "1.0", features = ["derive"] }
@@ -29,6 +30,9 @@ csv = "1.3"
 zip = "6.0.0"
 flate2 = "1.1.5"
+
+#
+regex = "1.12.2"
 
 # Generating
 rand = "0.9.2"
 
@@ -45,6 +49,7 @@ anyhow = "1.0"
 # Logging (optional but recommended)
 tracing = "0.1"
 tracing-subscriber = { version = "0.3", features = ["fmt", "env-filter"] }
+once_cell = "1.21.3"
 
 # Parallel processing (for batch tickers)
 futures = "0.3"
cache/openfigi/INFO.md (vendored, new file, 15 lines)
@@ -0,0 +1,15 @@
+# Openfigi Data
+
+## Market Security Description
+| Code | Meaning |
+| ---------- | --------------------------------------------------------- |
+| **Comdty** | Commodity (e.g., oil, gold futures, physical commodities) |
+| **Corp** | Corporate bond / corporate debt security |
+| **Curncy** | Currency or FX pair (e.g., EURUSD) |
+| **Equity** | Stocks / shares |
+| **Govt** | Government bond (Treasuries, Bunds, Gilts, etc.) |
+| **Index** | Market indices (S&P 500, DAX, NYSE Composite…) |
+| **M-Mkt** | Money market instruments (commercial paper, CDs, T-bills) |
+| **Mtge** | Mortgage-backed securities (MBS) |
+| **Muni** | Municipal bonds (US state/local government debt) |
+| **Pfd** | Preferred shares |
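These are the same sector codes that `load_market_sectors` (added further down in src/corporate/openfigi.rs) expects to find in cache/openfigi/marketSecDes.json under a "values" array. A small sketch of parsing that assumed layout — the file path and the "values" field name are taken from the code below; this helper itself is illustrative and not part of the diff:

```rust
use anyhow::Context;
use serde_json::Value;

// Sketch: parse the assumed {"values": ["Comdty", "Corp", ...]} layout that
// load_market_sectors (added below) reads from cache/openfigi/marketSecDes.json.
fn parse_market_sectors(content: &str) -> anyhow::Result<Vec<String>> {
    let json: Value = serde_json::from_str(content).context("invalid marketSecDes.json")?;
    Ok(json["values"]
        .as_array()
        .context("'values' field not found")?
        .iter()
        .filter_map(|v| v.as_str().map(str::to_string))
        .collect())
}
```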
data/INFO.md (new file, 15 lines)
@@ -0,0 +1,15 @@
+# Global Data Info
+
+## Exchanges
+
+Source: Wikipedia
+
+## Gleif
+
+Data download (.zip) via website
+
+## OpenFigi
+
+Data scraping via open API
+
+API key: .env
data/economic/INFO.md (new file, 6 lines)
@@ -0,0 +1,6 @@
+# Economic Info
+
+## Sources
+
+* continents: finanzen.net
+* countries: finanzen.net
@@ -12,21 +12,51 @@ pub struct Config {
     pub economic_lookahead_months: u32, // default: 3
     /// Maximum number of parallel scraping tasks (default: 10).
    /// This limits concurrency to protect system load and prevent website spamming.
-    #[serde(default = "default_max_parallel")]
-    pub max_parallel_tasks: usize,
+    #[serde(default = "default_max_parallel_instances")]
+    pub max_parallel_instances: usize,
+
+    pub max_tasks_per_instance: usize,
+
+    /// VPN rotation configuration
+    /// If set to "true", enables automatic VPN rotation between sessions
+    #[serde(default)]
+    pub enable_vpn_rotation: bool,
+
+    /// Comma-separated list of VPN servers/country codes to rotate through.
+    /// Example: "US-Free#1,UK-Free#1,JP-Free#1" or "US,JP,DE"
+    /// If empty, VPN rotation is disabled.
+    #[serde(default)]
+    pub vpn_servers: String,
+
+    /// Number of tasks per session before rotating VPN
+    /// If set to 0, rotates VPN between economic and corporate phases
+    #[serde(default = "default_tasks_per_session")]
+    pub tasks_per_vpn_session: usize,
 }
 
-fn default_max_parallel() -> usize {
+fn default_max_parallel_instances() -> usize {
     10
 }
 
+fn default_tasks_per_session() -> usize {
+    0 // 0 = rotate between economic/corporate
+}
+
+fn default_protonvpn_extension_id() -> String {
+    "ghmbeldphafepmbegfdlkpapadhbakde".to_string()
+}
+
 impl Default for Config {
     fn default() -> Self {
         Self {
             economic_start_date: "2007-02-13".to_string(),
             corporate_start_date: "2010-01-01".to_string(),
             economic_lookahead_months: 3,
-            max_parallel_tasks: default_max_parallel(),
+            max_parallel_instances: default_max_parallel_instances(),
+            max_tasks_per_instance: 0,
+            enable_vpn_rotation: false,
+            vpn_servers: String::new(),
+            tasks_per_vpn_session: default_tasks_per_session(),
         }
     }
 }
@@ -59,19 +89,54 @@ impl Config {
             .parse()
             .context("Failed to parse ECONOMIC_LOOKAHEAD_MONTHS as u32")?;
 
-        let max_parallel_tasks: usize = dotenvy::var("MAX_PARALLEL_TASKS")
+        let max_parallel_instances: usize = dotenvy::var("MAX_PARALLEL_INSTANCES")
             .unwrap_or_else(|_| "10".to_string())
             .parse()
-            .context("Failed to parse MAX_PARALLEL_TASKS as usize")?;
+            .context("Failed to parse MAX_PARALLEL_INSTANCES as usize")?;
+
+        let max_tasks_per_instance: usize = dotenvy::var("MAX_TASKS_PER_INSTANCE")
+            .unwrap_or_else(|_| "0".to_string())
+            .parse()
+            .context("Failed to parse MAX_TASKS_PER_INSTANCE as usize")?;
+
+        let enable_vpn_rotation = dotenvy::var("ENABLE_VPN_ROTATION")
+            .unwrap_or_else(|_| "false".to_string())
+            .parse::<bool>()
+            .context("Failed to parse ENABLE_VPN_ROTATION as bool")?;
+
+        let vpn_servers = dotenvy::var("VPN_SERVERS")
+            .unwrap_or_else(|_| String::new());
+
+        let tasks_per_vpn_session: usize = dotenvy::var("TASKS_PER_VPN_SESSION")
+            .unwrap_or_else(|_| "0".to_string())
+            .parse()
+            .context("Failed to parse TASKS_PER_VPN_SESSION as usize")?;
+
         Ok(Self {
             economic_start_date,
             corporate_start_date,
             economic_lookahead_months,
-            max_parallel_tasks,
+            max_parallel_instances,
+            max_tasks_per_instance,
+            enable_vpn_rotation,
+            vpn_servers,
+            tasks_per_vpn_session,
         })
     }
 
+    /// Get the list of VPN servers configured for rotation
+    pub fn get_vpn_servers(&self) -> Vec<String> {
+        if self.vpn_servers.is_empty() {
+            Vec::new()
+        } else {
+            self.vpn_servers
+                .split(',')
+                .map(|s| s.trim().to_string())
+                .filter(|s| !s.is_empty())
+                .collect()
+        }
+    }
+
     pub fn target_end_date(&self) -> String {
         let now = chrono::Local::now().naive_local().date();
         let future = now + chrono::Duration::days(30 * self.economic_lookahead_months as i64);
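The new `get_vpn_servers` helper above is what turns the comma-separated `VPN_SERVERS` value into a list. A minimal usage sketch, assuming `Config` is in scope (its module path is not shown in this diff) and using an illustrative server string:

```rust
// Sketch: exercising Config::get_vpn_servers (added above) with an illustrative value.
fn main() {
    let mut config = Config::default();
    config.vpn_servers = "US-Free#1, UK-Free#1,, JP-Free#1".to_string();
    assert_eq!(
        config.get_vpn_servers(),
        vec!["US-Free#1", "UK-Free#1", "JP-Free#1"]
    );
}
```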
@@ -1,6 +1,7 @@
 // src/corporate/aggregation.rs
 use super::types::CompanyPrice;
 use super::storage::*;
+use crate::util::directories::DataPaths;
 use tokio::fs;
 use std::collections::HashMap;
 
@@ -16,8 +17,8 @@ struct DayData {
 }
 
 /// Aggregate price data from multiple exchanges, converting all to USD
-pub async fn aggregate_best_price_data(lei: &str) -> anyhow::Result<()> {
-    let company_dir = get_company_dir(lei);
+pub async fn aggregate_best_price_data(paths: &DataPaths, lei: &str) -> anyhow::Result<()> {
+    let company_dir = get_company_dir(paths, lei);
 
     for timeframe in ["daily", "5min"].iter() {
         let source_dir = company_dir.join(timeframe);
@@ -1,21 +1,24 @@
+use crate::util::directories::DataPaths;
+use crate::util::logger;
+
 // src/corporate/openfigi.rs
 use super::{types::*};
 use reqwest::Client as HttpClient;
 use reqwest::header::{HeaderMap, HeaderValue};
 use serde_json::{json, Value};
+use csv::{ReaderBuilder, StringRecord, WriterBuilder};
+use chrono::NaiveDate;
 use std::collections::{HashMap, HashSet};
-use std::fs::{File, OpenOptions};
-use std::io::{BufRead, BufReader, Write};
-use std::path::Path;
+use std::path::{Path};
 use std::time::Instant;
 use tokio::time::{sleep, Duration};
 use tokio::fs as tokio_fs;
+use tokio::io::AsyncWriteExt;
 use anyhow::{Context, anyhow};
 
 #[derive(Clone)]
 pub struct OpenFigiClient {
     client: HttpClient,
-    api_key: Option<String>,
     has_key: bool,
 }
 
@@ -27,7 +30,7 @@ impl OpenFigiClient {
     /// # Errors
     ///
     /// Returns an error if the HTTP client cannot be built or if the API key header is invalid.
-    pub fn new() -> anyhow::Result<Self> {
+    pub async fn new() -> anyhow::Result<Self> {
         let api_key = dotenvy::var("OPENFIGI_API_KEY").ok();
         let has_key = api_key.is_some();
 
@@ -43,12 +46,13 @@ impl OpenFigiClient {
 
         let client = builder.build().context("Failed to build HTTP client")?;
 
-        println!(
+        let msg = format!(
            "OpenFIGI client initialized: {}",
            if has_key { "with API key" } else { "no key (limited mode)" }
        );
+        logger::log_info(&msg).await;
 
-        Ok(Self { client, api_key, has_key })
+        Ok(Self { client, has_key })
     }
 
     /// Maps a batch of ISINs to FigiInfo structs, filtering for equities only.
@@ -104,17 +108,43 @@ impl OpenFigiClient {
            .map(|isin| json!({
                "idType": "ID_ISIN",
                "idValue": isin,
-                "marketSecDes": "Equity",
+                //"marketSecDes": "Equity",
            }))
            .collect();
 
-        let resp = self.client
+        // Retry logic with exponential backoff for transient failures
+        let mut retry_count = 0;
+        let max_retries = 5;
+        let mut backoff_ms = 1000u64;
+
+        loop {
+            let resp_result = self.client
                .post("https://api.openfigi.com/v3/mapping")
                .header("Content-Type", "application/json")
                .json(&jobs)
                .send()
-                .await
-                .context("Failed to send mapping request")?;
+                .await;
+
+            let resp = match resp_result {
+                Ok(r) => r,
+                Err(e) => {
+                    retry_count += 1;
+                    if retry_count >= max_retries {
+                        let err_msg = format!("Failed to send mapping request after {} retries: {}", max_retries, e);
+                        logger::log_error(&err_msg).await;
+                        return Err(anyhow!(err_msg));
+                    }
+                    let warn_msg = format!("Transient error sending mapping request (attempt {}/{}): {}", retry_count, max_retries, e);
+                    eprintln!("{}", warn_msg);
+                    logger::log_warn(&warn_msg).await;
+                    let retry_msg = format!(" Retrying in {}ms...", backoff_ms);
+                    println!("{}", retry_msg);
+                    logger::log_info(&retry_msg).await;
+                    sleep(Duration::from_millis(backoff_ms)).await;
+                    backoff_ms = (backoff_ms * 2).min(60000); // Cap at 60s
+                    continue;
+                }
+            };
+
            let status = resp.status();
            let headers = resp.headers().clone();
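The hunk above inlines a retry-with-exponential-backoff loop around the mapping request. A compact, generic sketch of the same pattern — the doubling step and 60-second cap mirror the values in the hunk, but this helper itself is illustrative and not part of the codebase:

```rust
use tokio::time::{sleep, Duration};

// Illustrative helper: retry an async operation with doubling backoff, capped at 60s,
// mirroring the inline loop above (initial backoff 1000 ms).
async fn retry_with_backoff<T, E, F, Fut>(mut op: F, max_retries: u32) -> Result<T, E>
where
    F: FnMut() -> Fut,
    Fut: std::future::Future<Output = Result<T, E>>,
{
    let mut backoff_ms = 1000u64;
    let mut attempt = 0;
    loop {
        match op().await {
            Ok(v) => return Ok(v),
            Err(e) => {
                attempt += 1;
                if attempt >= max_retries {
                    return Err(e);
                }
                sleep(Duration::from_millis(backoff_ms)).await;
                backoff_ms = (backoff_ms * 2).min(60_000);
            }
        }
    }
}
```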
@@ -127,13 +157,29 @@ impl OpenFigiClient {
                    .and_then(|v| v.to_str().ok())
                    .and_then(|s| s.parse::<u64>().ok())
                    .unwrap_or(10);
-                println!("Rate limited—backing off {}s", reset_sec);
+                let rate_msg = format!("Rate limited—backing off {}s", reset_sec);
+                println!("{}", rate_msg);
+                logger::log_warn(&rate_msg).await;
                sleep(Duration::from_secs(reset_sec.max(10))).await;
                continue; // Retry the same chunk
            } else if status == 401 {
                return Err(anyhow!("Invalid OpenFIGI API key: {}", body));
            } else if status == 413 {
                return Err(anyhow!("Payload too large—reduce chunk size: {}", body));
+            } else if status.is_server_error() {
+                // Transient server error, retry with backoff
+                retry_count += 1;
+                if retry_count >= max_retries {
+                    let err_msg = format!("OpenFIGI server error {} after {} retries: {}", status, max_retries, body);
+                    logger::log_error(&err_msg).await;
+                    return Err(anyhow!(err_msg));
+                }
+                let warn_msg = format!("Server error {} (attempt {}/{}), retrying in {}ms...", status, retry_count, max_retries, backoff_ms);
+                eprintln!("{}", warn_msg);
+                logger::log_warn(&warn_msg).await;
+                sleep(Duration::from_millis(backoff_ms)).await;
+                backoff_ms = (backoff_ms * 2).min(60000);
+                continue;
            }
            return Err(anyhow!("OpenFIGI error {}: {}", status, body));
        }
@@ -146,9 +192,8 @@ impl OpenFigiClient {
            for item in data {
                let sec_type = item["securityType"].as_str().unwrap_or("");
                let market_sec = item["marketSector"].as_str().unwrap_or("");
-                if market_sec == "Equity" &&
-                   (sec_type.contains("Stock") || sec_type.contains("Share") || sec_type.contains("Equity") ||
-                    sec_type.contains("Common") || sec_type.contains("Preferred") || sec_type == "ADR" || sec_type == "GDR") {
+                // Capture all security types, let caller filter by market sector if needed
                let figi = match item["figi"].as_str() {
                    Some(f) => f.to_string(),
                    None => continue,
@@ -159,20 +204,22 @@ impl OpenFigiClient {
                    figi,
                    name: item["name"].as_str().unwrap_or("").to_string(),
                    ticker: item["ticker"].as_str().unwrap_or("").to_string(),
-                    mic_code: item["exchCode"].as_str().unwrap_or("").to_string(),
-                    currency: item["currency"].as_str().unwrap_or("").to_string(),
-                    compositeFIGI: item["compositeFIGI"].as_str().unwrap_or("").to_string(),
-                    securityType: sec_type.to_string(),
-                    marketSector: market_sec.to_string(),
-                    shareClassFIGI: item["shareClassFIGI"].as_str().unwrap_or("").to_string(),
-                    securityType2: item["securityType2"].as_str().unwrap_or("").to_string(),
-                    securityDescription: item["securityDescription"].as_str().unwrap_or("").to_string(),
+                    exch_code: item["exchCode"].as_str().unwrap_or("").to_string(),
+                    composite_figi: item["compositeFIGI"].as_str().unwrap_or("").to_string(),
+                    security_type: sec_type.to_string(),
+                    market_sector: market_sec.to_string(),
+                    share_class_figi: item["shareClassFIGI"].as_str().unwrap_or("").to_string(),
+                    security_type2: item["securityType2"].as_str().unwrap_or("").to_string(),
+                    security_description: item["securityDescription"].as_str().unwrap_or("").to_string(),
                };
 
                all_figi_infos.push(figi_info);
                }
            }
        }
 
+            // Successfully processed this chunk, break out of retry loop
+            break;
        }
 
        req_count += 1;
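The rename above switches the `FigiInfo` initializer from camelCase to snake_case field names. The struct definition itself lives in the corporate `types` module and is not part of this diff; a plausible shape, inferred purely from the initializer above (all fields assumed to be `String`, derives assumed from how the values are cloned and serialized elsewhere in the diff), would be:

```rust
// Inferred sketch of FigiInfo after this change — the real definition in
// src/corporate/types.rs is not shown in the diff and may differ.
#[derive(Clone, serde::Serialize, serde::Deserialize)]
pub struct FigiInfo {
    pub figi: String,
    pub name: String,
    pub ticker: String,
    pub exch_code: String,
    pub composite_figi: String,
    pub security_type: String,
    pub market_sector: String,
    pub share_class_figi: String,
    pub security_type2: String,
    pub security_description: String,
}
```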
@@ -195,32 +242,158 @@ impl OpenFigiClient {
 
        Ok(all_figi_infos)
    }
 
-    /// Checks if the client has an API key configured.
-    pub fn has_key(&self) -> bool {
-        self.has_key
-    }
-
-    /// Returns a reference to the underlying HTTP client.
-    pub fn get_figi_client(&self) -> &HttpClient {
-        &self.client
-    }
 }
 
-/// Builds a LEI-to-FigiInfo map from the LEI-ISIN mapping, filtering for equities via OpenFIGI.
+/// Extracts the date from a GLEIF CSV filename in the format "isin-lei-DDMMYYYY.csv".
 ///
-/// Attempts to load existing entries from "data/corporate/by_lei/lei_to_figi.jsonl" (JSON Lines format,
-/// one LEI entry per line: {"lei": "ABC", "figis": [FigiInfo...]}). For any missing LEIs (compared to
-/// `lei_to_isins`), fetches their FigiInfos and appends to the .jsonl file incrementally.
+/// # Arguments
 ///
-/// This design allows resumption after interruptions: on restart, already processed LEIs are skipped,
-/// and only remaining ones are fetched. Processes LEIs in sorted order for deterministic behavior.
+/// * `filename` - The GLEIF CSV filename (e.g., "isin-lei-24112025.csv")
 ///
-/// If no API key is present, skips building new entries and returns the loaded map (possibly partial).
+/// # Returns
+///
+/// A string in the format "DDMMYYYY" (e.g., "24112025") if successfully parsed, otherwise the original filename.
+fn extract_gleif_date_from_filename(filename: &str) -> String {
+    // Pattern: isin-lei-DDMMYYYY.csv
+    if let Some(start_idx) = filename.find("isin-lei-") {
+        let rest = &filename[start_idx + 9..]; // Skip "isin-lei-"
+        if rest.len() >= 8 && rest[0..8].chars().all(|c| c.is_numeric()) {
+            return rest[0..8].to_string();
+        }
+    }
+    filename.to_string()
+}
+
+/// Loads the list of market sectors from cache/openfigi/marketSecDes.json
+///
+/// # Returns
+///
+/// Vec of market sector strings (e.g., ["Comdty", "Corp", "Curncy", "Equity", ...])
+/// If the file doesn't exist or can't be parsed, returns a sensible default list.
+async fn load_market_sectors() -> anyhow::Result<Vec<String>> {
+    let dir = DataPaths::new(".")?;
+    let cache_file = dir.cache_openfigi_dir().join("marketSecDes.json");
+
+    if !cache_file.exists() {
+        // Return default if file doesn't exist
+        let warn_msg = format!("Warning: {} not found, using default sectors", cache_file.display());
+        eprintln!("{}", warn_msg);
+        logger::log_warn(&warn_msg).await;
+        return Ok(vec![
+            "Comdty".to_string(),
+            "Corp".to_string(),
+            "Curncy".to_string(),
+            "Equity".to_string(),
+            "Govt".to_string(),
+            "Index".to_string(),
+            "M-Mkt".to_string(),
+            "Mtge".to_string(),
+            "Muni".to_string(),
+            "Pfd".to_string(),
+        ]);
+    }
+
+    let content = tokio_fs::read_to_string(&cache_file).await
+        .context("Failed to read marketSecDes.json")?;
+
+    let json: Value = serde_json::from_str(&content)
+        .context("Failed to parse marketSecDes.json")?;
+
+    let sectors: Vec<String> = json["values"]
+        .as_array()
+        .ok_or_else(|| anyhow!("'values' field not found in marketSecDes.json"))?
+        .iter()
+        .filter_map(|v| v.as_str().map(|s| s.to_string()))
+        .collect();
+
+    if sectors.is_empty() {
+        return Err(anyhow!("No sectors found in marketSecDes.json"));
+    }
+
+    let msg = format!("Loaded {} market sectors from cache", sectors.len());
+    logger::log_info(&msg).await;
+    Ok(sectors)
+}
+
+/// Finds the most recent GLEIF CSV file in the cache/gleif directory.
+///
+/// Returns the extracted date in format "DDMMYYYY" from the filename.
+/// If no GLEIF file is found, returns None.
+async fn find_most_recent_gleif_date(gleif_cache_dir: &Path) -> anyhow::Result<Option<String>> {
+    // First check for subdirectories named as DDMMYYYY and pick the most recent date
+    let mut dir_entries = tokio_fs::read_dir(gleif_cache_dir)
+        .await
+        .context("Failed to read gleif cache directory")?;
+
+    let mut found_dates: Vec<NaiveDate> = Vec::new();
+
+    while let Some(entry) = dir_entries.next_entry().await? {
+        let path = entry.path();
+        if path.is_dir() {
+            if let Some(name) = path.file_name().and_then(|n| n.to_str()) {
+                // Expect folder name in DDMMYYYY
+                if name.len() == 8 && name.chars().all(|c| c.is_numeric()) {
+                    if let Ok(nd) = NaiveDate::parse_from_str(name, "%d%m%Y") {
+                        found_dates.push(nd);
+                    }
+                }
+            }
+        }
+    }
+
+    if !found_dates.is_empty() {
+        found_dates.sort();
+        if let Some(most_recent) = found_dates.last() {
+            let date_str = most_recent.format("%d%m%Y").to_string();
+            let msg = format!(" Found GLEIF data dated (from subdirs): {}", date_str);
+            logger::log_info(&msg).await;
+            return Ok(Some(date_str));
+        }
+    }
+
+    // Fallback: look for CSV files in the directory as before
+    let mut entries = tokio_fs::read_dir(gleif_cache_dir)
+        .await
+        .context("Failed to read gleif cache directory")?;
+    let mut csv_files = Vec::new();
+
+    while let Some(entry) = entries.next_entry().await? {
+        let path = entry.path();
+        if let Some(filename) = path.file_name() {
+            let filename_str = filename.to_string_lossy();
+            if filename_str.ends_with(".csv") && filename_str.contains("isin-lei-") {
+                csv_files.push(filename_str.to_string());
+            }
+        }
+    }
+
+    if csv_files.is_empty() {
+        return Ok(None);
+    }
+
+    // Sort files in reverse order (most recent first) based on date in filename
+    csv_files.sort();
+    csv_files.reverse();
+
+    let most_recent = &csv_files[0];
+    let date = extract_gleif_date_from_filename(most_recent);
+
+    let msg = format!(" Found GLEIF data dated: {}", date);
+    logger::log_info(&msg).await;
+    Ok(Some(date))
+}
+
+/// Builds a LEI-to-FigiInfo map with automatic retry on transient failures.
+///
+/// This is a wrapper around build_lei_to_figi_infos_internal that handles transient errors
+/// by automatically retrying after a delay if the mapping process fails. The mapping can
+/// resume from where it left off since already-processed LEIs are saved incrementally.
 ///
 /// # Arguments
 ///
 /// * `lei_to_isins` - HashMap of LEI to Vec<ISIN> (used for fetching missing entries).
+/// * `gleif_date` - Optional date in format "DDMMYYYY". If None, uses the most recent GLEIF file.
 ///
 /// # Returns
 ///
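`extract_gleif_date_from_filename` (added above) is a plain string scan; per its doc comment, the GLEIF export `isin-lei-24112025.csv` maps to `24112025`, and anything that doesn't match is returned unchanged. A tiny illustrative check — note the function is private, so this only compiles inside the same module (e.g., as a unit test there):

```rust
// Illustrative check of the behaviour documented above.
fn check_gleif_date_extraction() {
    assert_eq!(extract_gleif_date_from_filename("isin-lei-24112025.csv"), "24112025");
    assert_eq!(extract_gleif_date_from_filename("unrelated.csv"), "unrelated.csv");
}
```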
@@ -228,31 +401,159 @@ impl OpenFigiClient {
 ///
 /// # Errors
 ///
-/// Returns an error if file I/O fails, JSON serialization/deserialization fails,
-/// or if OpenFIGI queries fail during fetching.
-pub async fn build_lei_to_figi_infos(lei_to_isins: &HashMap<String, Vec<String>>) -> anyhow::Result<HashMap<String, Vec<FigiInfo>>> {
-    let data_dir = Path::new("data/corporate/by_lei");
-    tokio_fs::create_dir_all(data_dir).await.context("Failed to create data directory")?;
-
-    let path = data_dir.join("lei_to_figi.jsonl");
-    let mut lei_to_figis: HashMap<String, Vec<FigiInfo>> = load_lei_to_figi_jsonl(&path)?;
-
-    let client = OpenFigiClient::new()?;
+/// Returns an error only on fatal errors (file I/O, invalid API key, etc.).
+/// Transient errors are retried automatically.
+pub async fn build_lei_to_figi_infos(lei_to_isins: &HashMap<String, Vec<String>>, gleif_date: Option<&str>) -> anyhow::Result<HashMap<String, Vec<FigiInfo>>> {
+    let mut retry_count = 0;
+    let max_retries = 3;
+
+    loop {
+        match build_lei_to_figi_infos_internal(lei_to_isins, gleif_date).await {
+            Ok(map) => {
+                if !map.is_empty() {
+                    let msg = format!("✓ LEI→FIGI mapping completed successfully with {} entries", map.len());
+                    logger::log_info(&msg).await;
+                }
+                return Ok(map);
+            }
+            Err(e) => {
+                let error_msg = e.to_string();
+
+                // Check if this is a fatal error or transient
+                let is_fatal = error_msg.contains("Invalid OpenFIGI API key")
+                    || error_msg.contains("No GLEIF CSV file found")
+                    || error_msg.contains("Failed to create");
+
+                if is_fatal {
+                    let err = format!("Fatal error in LEI→FIGI mapping: {}", e);
+                    eprintln!("{}", err);
+                    logger::log_error(&err).await;
+                    return Err(e);
+                }
+
+                retry_count += 1;
+                if retry_count >= max_retries {
+                    let err = format!("LEI→FIGI mapping failed after {} retries: {}", max_retries, e);
+                    eprintln!("{}", err);
+                    logger::log_error(&err).await;
+                    return Err(e);
+                }
+
+                let wait_secs = 60 * retry_count;
+                let warn_msg = format!("Transient error in LEI→FIGI mapping (attempt {}/{}): {}", retry_count, max_retries, e);
+                eprintln!("{}", warn_msg);
+                logger::log_warn(&warn_msg).await;
+                let retry_msg = format!("Retrying mapping in {}s...", wait_secs);
+                println!("{}", retry_msg);
+                logger::log_info(&retry_msg).await;
+                sleep(Duration::from_secs(wait_secs as u64)).await;
+            }
+        }
+    }
+}
+
+/// Internal implementation of LEI-to-FigiInfo mapping.
+///
+/// This is the actual worker function that performs the mapping. It handles already-processed
+/// LEIs gracefully but will fail on transient errors, which are caught and retried by the
+/// wrapper function build_lei_to_figi_infos.
+///
+/// Tracks three outcomes:
+/// 1. Hit with marketSector: saved to sector-specific folder
+/// 2. Hit without marketSector: saved to "uncategorized" folder
+/// 3. No_hit (empty results): LEI marked for removal from GLEIF CSV
+async fn build_lei_to_figi_infos_internal(lei_to_isins: &HashMap<String, Vec<String>>, gleif_date: Option<&str>) -> anyhow::Result<HashMap<String, Vec<FigiInfo>>> {
+    let dir = DataPaths::new(".")?;
+    let gleif_cache_dir = dir.cache_gleif_dir();
+    let map_cache_dir = dir.cache_gleif_openfigi_map_dir();
+
+    // Determine the GLEIF date to use
+    let date = if let Some(d) = gleif_date {
+        let msg = format!("Using provided GLEIF date: {}", d);
+        logger::log_info(&msg).await;
+        d.to_string()
+    } else {
+        // Find the most recent GLEIF file
+        logger::log_info("Searching for most recent GLEIF file...").await;
+        match find_most_recent_gleif_date(&gleif_cache_dir).await? {
+            Some(d) => d,
+            None => {
+                let err = "No GLEIF CSV file found in cache/gleif directory";
+                logger::log_error(err).await;
+                return Err(anyhow!(err));
+            },
+        }
+    };
+
+    // Create date-based subdirectory in the gleif cache
+    let gleif_date_dir = gleif_cache_dir.join(&date);
+
+    // Create date-based subdirectory in the mapping cache
+    let msg = format!("Creating date directory for: {}", date);
+    logger::log_info(&msg).await;
+    let date_dir = map_cache_dir.join(&date);
+    tokio_fs::create_dir_all(&date_dir).await.context("Failed to create date directory")?;
+
+    // Load market sectors dynamically from cache
+    logger::log_info("Loading market sectors...").await;
+    let sector_dirs = load_market_sectors().await?;
+    let mut sector_maps: HashMap<String, HashMap<String, Vec<FigiInfo>>> = HashMap::new();
+
+    // Create uncategorized folder
+    let msg = format!("Creating {} sector directories...", sector_dirs.len());
+    logger::log_info(&msg).await;
+    let uncategorized_dir = date_dir.join("uncategorized");
+    tokio_fs::create_dir_all(&uncategorized_dir).await.context("Failed to create uncategorized directory")?;
+    let uncategorized_path = uncategorized_dir.join("lei_to_figi.jsonl");
+    let uncategorized_map = load_lei_to_figi_jsonl(&uncategorized_path).await?;
+    sector_maps.insert("uncategorized".to_string(), uncategorized_map);
+
+    for sector in &sector_dirs {
+        let sector_dir = date_dir.join(sector);
+        tokio_fs::create_dir_all(&sector_dir).await.context("Failed to create sector directory")?;
+
+        // Load existing mappings for this sector
+        let path = sector_dir.join("lei_to_figi.jsonl");
+        let lei_map = load_lei_to_figi_jsonl(&path).await?;
+        sector_maps.insert(sector.clone(), lei_map);
+    }
+
+    let client = OpenFigiClient::new().await?;
     if !client.has_key {
-        println!("No API key—using partial LEI→FIGI map with {} entries", lei_to_figis.len());
-        return Ok(lei_to_figis);
+        let total_entries: usize = sector_maps.values().map(|m| m.len()).sum();
+        let msg = format!("No API key—using partial LEI→FIGI maps with {} total entries", total_entries);
+        logger::log_warn(&msg).await;
+        return Ok(sector_maps.get("Equity").cloned().unwrap_or_default());
     }
 
     // Sort LEIs for deterministic processing order
+    logger::log_info("Starting LEI→FIGI mapping process...").await;
     let mut leis: Vec<_> = lei_to_isins.keys().cloned().collect();
     leis.sort();
 
-    let mut processed = lei_to_figis.len();
+    let mut processed = sector_maps.values().map(|m| m.len()).sum::<usize>();
     let total = leis.len();
+    let mut no_hit_leis = Vec::new(); // Track LEIs with no data found (no_hit)
+    let mut leis_to_delete_batch = Vec::new(); // Batch delete every 100 LEIs
+
+    let msg = format!("Total LEIs to process: {}, already processed: {}", total, processed);
+    logger::log_info(&msg).await;
 
     for lei in leis {
-        if lei_to_figis.contains_key(&lei) {
-            continue; // Skip already processed
+        // Check if LEI is already processed in any sector (including uncategorized)
+        let mut already_processed = false;
+        for sector_map in sector_maps.values() {
+            if sector_map.contains_key(&lei) {
+                already_processed = true;
+                break;
+            }
+        }
+
+        if already_processed {
+            continue;
         }
 
         let isins = match lei_to_isins.get(&lei) {
@@ -261,30 +562,117 @@ pub async fn build_lei_to_figi_infos(lei_to_isins: &HashMap<String, Vec<String>>
        };
 
        let unique_isins: Vec<_> = isins.iter().cloned().collect::<HashSet<_>>().into_iter().collect();
-        let equity_figi_infos = client.map_isins_to_figi_infos(&unique_isins).await?;
+        let debug_msg = format!("Processing LEI {} with {} ISINs...", lei, unique_isins.len());
+        logger::log_info(&debug_msg).await;
 
-        let mut figis = equity_figi_infos;
+        let all_figi_infos = client.map_isins_to_figi_infos(&unique_isins).await?;
+
+        // Case 1: no_hit - API succeeded but returned no data
+        if all_figi_infos.is_empty() {
+            let no_hit_msg = format!(" no_hit: LEI {} returned no FIGIs", lei);
+            logger::log_warn(&no_hit_msg).await;
+            no_hit_leis.push(lei.clone());
+            leis_to_delete_batch.push(lei.clone());
+
+            // Delete every 100 no_hit LEIs to prevent progress loss on interrupt
+            if leis_to_delete_batch.len() >= 100 {
+                let batch_msg = format!("Batch deleting {} LEIs from GLEIF CSV...", leis_to_delete_batch.len());
+                logger::log_info(&batch_msg).await;
+                if let Err(e) = remove_leis_batch_from_gleif_csv(&gleif_date_dir, &leis_to_delete_batch).await {
+                    let warn_msg = format!("Warning: Failed to batch remove LEIs from GLEIF CSV: {}", e);
+                    eprintln!("{}", warn_msg);
+                    logger::log_warn(&warn_msg).await;
+                }
+                leis_to_delete_batch.clear();
+            }
+
+            continue;
+        }
+
+        let hit_msg = format!(" hit: LEI {} found {} FIGIs", lei, all_figi_infos.len());
+        logger::log_info(&hit_msg).await;
+
+        // Organize results by marketSector
+        let mut figis_by_sector: HashMap<String, Vec<FigiInfo>> = HashMap::new();
+        let mut uncategorized_figis = Vec::new();
+
+        for figi_info in all_figi_infos {
+            let sector = figi_info.market_sector.clone();
+
+            if sector.is_empty() {
+                // Case 2: Hit but no marketSecDes - save to uncategorized
+                uncategorized_figis.push(figi_info);
+            } else {
+                // Case 1: Hit with marketSector - organize by sector
+                figis_by_sector.entry(sector).or_insert_with(Vec::new).push(figi_info);
+            }
+        }
+
+        // Save uncategorized FIGIs if any
+        if !uncategorized_figis.is_empty() {
+            uncategorized_figis.sort_by_key(|f| f.figi.clone());
+            uncategorized_figis.dedup_by_key(|f| f.figi.clone());
+
+            append_lei_to_figi_jsonl(&uncategorized_path, &lei, &uncategorized_figis).await
+                .context("Failed to append to uncategorized JSONL")?;
+
+            if let Some(uncategorized_map) = sector_maps.get_mut("uncategorized") {
+                uncategorized_map.insert(lei.clone(), uncategorized_figis);
+            }
+        }
+
+        // Save to appropriate sector files
+        for (sector, mut figis) in figis_by_sector {
            if !figis.is_empty() {
                figis.sort_by_key(|f| f.figi.clone());
                figis.dedup_by_key(|f| f.figi.clone());
+
+                // Save to sector's JSONL file
+                let sector_dir = date_dir.join(&sector);
+                let path = sector_dir.join("lei_to_figi.jsonl");
+                append_lei_to_figi_jsonl(&path, &lei, &figis).await.context("Failed to append to JSONL")?;
+
+                // Update in-memory sector map
+                if let Some(sector_map) = sector_maps.get_mut(&sector) {
+                    sector_map.insert(lei.clone(), figis);
+                }
            }
        }
 
-        // Append to .jsonl incrementally
-        append_lei_to_figi_jsonl(&path, &lei, &figis).context("Failed to append to JSONL")?;
-
-        // Insert into in-memory map
-        lei_to_figis.insert(lei.clone(), figis);
-
        processed += 1;
        if processed % 100 == 0 {
-            println!("Processed {}/{} LEIs → {} total equity FIGIs", processed, total, lei_to_figis.values().map(|v| v.len()).sum::<usize>());
+            let totals: Vec<String> = sector_dirs.iter().map(|s| {
+                let count = sector_maps.get(s).map(|m| m.len()).unwrap_or(0);
+                format!("{}:{}", s, count)
+            }).collect();
+            let progress_msg = format!("Processed {}/{} LEIs → [{}] no_hit: {}", processed, total, totals.join(", "), no_hit_leis.len());
+            println!("{}", progress_msg);
+            logger::log_info(&progress_msg).await;
        }
 
        tokio::time::sleep(Duration::from_millis(100)).await;
    }
 
-    println!("Completed LEI→FIGI map: {} mappings (equity-only)", lei_to_figis.len());
-    Ok(lei_to_figis)
+    // Delete any remaining LEIs in the batch
+    if !leis_to_delete_batch.is_empty() {
+        let batch_msg = format!("Final batch: Deleting {} LEIs from GLEIF CSV...", leis_to_delete_batch.len());
+        logger::log_info(&batch_msg).await;
+        if let Err(e) = remove_leis_batch_from_gleif_csv(gleif_cache_dir, &leis_to_delete_batch).await {
+            let warn_msg = format!("Warning: Failed to delete final batch from GLEIF CSV: {}", e);
+            eprintln!("{}", warn_msg);
+            logger::log_warn(&warn_msg).await;
+        }
+    }
+
+    // Log final summary for no_hit LEIs (they've already been removed incrementally)
+    if !no_hit_leis.is_empty() {
+        let no_hit_summary = format!("no_hit (removed in batches from GLEIF): {} LEIs", no_hit_leis.len());
+        println!("{}", no_hit_summary);
+        logger::log_info(&no_hit_summary).await;
+    }
+
+    // Return Equity sector as the main result
+    Ok(sector_maps.get("Equity").cloned().unwrap_or_default())
 }
 
 /// Loads LEI-to-FigiInfo map from a JSON Lines file.
@@ -302,18 +690,16 @@ pub async fn build_lei_to_figi_infos(lei_to_isins: &HashMap<String, Vec<String>>
 /// # Errors
 ///
 /// Returns an error if the file cannot be opened or if any line fails to parse as JSON.
-fn load_lei_to_figi_jsonl(path: &Path) -> anyhow::Result<HashMap<String, Vec<FigiInfo>>> {
+async fn load_lei_to_figi_jsonl(path: &Path) -> anyhow::Result<HashMap<String, Vec<FigiInfo>>> {
     let mut map = HashMap::new();
 
     if !path.exists() {
         return Ok(map);
     }
 
-    let file = File::open(path).context("Failed to open JSONL file for reading")?;
-    let reader = BufReader::new(file);
+    let content = tokio_fs::read_to_string(path).await.context("Failed to read JSONL file")?;
 
-    for (line_num, line) in reader.lines().enumerate() {
-        let line = line.context(format!("Failed to read line {}", line_num + 1))?;
+    for (line_num, line) in content.lines().enumerate() {
         if line.trim().is_empty() {
             continue;
         }
@@ -325,7 +711,9 @@ fn load_lei_to_figi_jsonl(path: &Path) -> anyhow::Result<HashMap<String, Vec<Fig
         map.insert(lei, figis);
     }
 
-    println!("Loaded LEI→FIGI map with {} entries from {}", map.len(), path.display());
+    let msg = format!("Loaded LEI→FIGI map with {} entries from {}", map.len(), path.display());
+
+    logger::log_info(&msg).await;
     Ok(map)
 }
 
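Each line of these `lei_to_figi.jsonl` files is a single JSON object of the form `{"lei": ..., "figis": [...]}`, as the replaced doc comment and `append_lei_to_figi_jsonl` (next hunk) show. A hedged sketch that builds one such line the same way the append function does — the LEI value is a made-up placeholder, and real `figis` entries carry the snake_case `FigiInfo` fields listed earlier:

```rust
use serde_json::json;

// Sketch: one JSONL record in the shape written by append_lei_to_figi_jsonl.
// "EXAMPLELEI0000000000" is a placeholder, not a real LEI.
fn main() {
    let entry = json!({
        "lei": "EXAMPLELEI0000000000",
        "figis": [],  // real entries hold FigiInfo objects
    });
    println!("{}", serde_json::to_string(&entry).unwrap());
}
```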
@@ -340,20 +728,207 @@ fn load_lei_to_figi_jsonl(path: &Path) -> anyhow::Result<HashMap<String, Vec<Fig
|
|||||||
/// # Errors
|
/// # Errors
|
||||||
///
|
///
|
||||||
/// Returns an error if the file cannot be opened for append or if serialization fails.
|
/// Returns an error if the file cannot be opened for append or if serialization fails.
|
||||||
fn append_lei_to_figi_jsonl(path: &Path, lei: &str, figis: &[FigiInfo]) -> anyhow::Result<()> {
|
async fn append_lei_to_figi_jsonl(path: &Path, lei: &str, figis: &[FigiInfo]) -> anyhow::Result<()> {
|
||||||
let mut file = OpenOptions::new()
|
|
||||||
.create(true)
|
|
||||||
.append(true)
|
|
||||||
.open(path)
|
|
||||||
.context("Failed to open JSONL file for append")?;
|
|
||||||
|
|
||||||
let entry = json!({
|
let entry = json!({
|
||||||
"lei": lei,
|
"lei": lei,
|
||||||
"figis": figis,
|
"figis": figis,
|
||||||
});
|
});
|
||||||
|
|
||||||
let line = serde_json::to_string(&entry).context("Failed to serialize entry")? + "\n";
|
let line = serde_json::to_string(&entry).context("Failed to serialize entry")? + "\n";
|
||||||
file.write_all(line.as_bytes()).context("Failed to write to JSONL file")?;
|
|
||||||
|
let mut file = tokio_fs::OpenOptions::new()
|
||||||
|
.create(true)
|
||||||
|
.append(true)
|
||||||
|
.open(path)
|
||||||
|
.await
|
||||||
|
.context("Failed to open JSONL file for append")?;
|
||||||
|
|
||||||
|
file.write_all(line.as_bytes())
|
||||||
|
.await
|
||||||
|
.context("Failed to write to JSONL file")?;
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Removes multiple invalid LEIs from the GLEIF CSV file in a single batch operation.
|
||||||
|
///
|
||||||
|
/// This function is more efficient than removing LEIs one at a time.
|
||||||
|
/// It reads the GLEIF CSV once, filters out all specified LEIs, and overwrites the file once.
|
||||||
|
///
|
||||||
|
/// # Arguments
|
||||||
|
///
|
||||||
|
/// * `gleif_cache_dir` - Path to the cache/gleif directory
|
||||||
|
/// * `leis_to_remove` - Vec of LEI strings to remove
|
||||||
|
///
|
||||||
|
/// # Returns
|
||||||
|
/// Ok(()) if successful, Err if file operations fail.
|
||||||
|
async fn remove_leis_batch_from_gleif_csv(gleif_cache_dir: &Path, leis_to_remove: &[String]) -> anyhow::Result<()> {
|
||||||
|
if leis_to_remove.is_empty() {
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
|
||||||
|
// Find the most recent GLEIF CSV file
|
||||||
|
let mut entries = tokio_fs::read_dir(gleif_cache_dir)
|
||||||
|
.await
|
||||||
|
.context("Failed to read gleif cache directory")?;
|
||||||
|
|
||||||
|
let mut csv_files = Vec::new();
|
||||||
|
|
||||||
|
while let Some(entry) = entries.next_entry().await? {
|
||||||
|
let path = entry.path();
|
||||||
|
if let Some(filename) = path.file_name() {
|
||||||
|
let filename_str = filename.to_string_lossy();
|
||||||
|
if filename_str.ends_with(".csv") && filename_str.contains("isin-lei-") {
|
||||||
|
csv_files.push(path);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if csv_files.is_empty() {
|
||||||
|
logger::log_warn("No GLEIF CSV files found for batch removal operation").await;
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
|
||||||
|
// Prefer an original (non-_clean) GLEIF CSV if available; otherwise use the most recent file.
|
||||||
|
csv_files.sort();
|
||||||
|
csv_files.reverse();
|
||||||
|
|
||||||
|
let mut gleif_file: &std::path::PathBuf = &csv_files[0];
|
||||||
|
// Try to find the most recent filename that does NOT end with "_clean.csv"
|
||||||
|
if let Some(non_clean) = csv_files.iter().find(|p| {
|
||||||
|
p.file_name()
|
||||||
|
.and_then(|n| n.to_str())
|
||||||
|
.map(|s| !s.to_lowercase().ends_with("_clean.csv"))
|
||||||
|
.unwrap_or(false)
|
||||||
|
}) {
|
||||||
|
gleif_file = non_clean;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Prepare clean file path: insert "_clean" before extension
|
||||||
|
let orig_path = gleif_file;
|
||||||
|
let file_name = orig_path.file_name().and_then(|n| n.to_str()).unwrap_or("gleif.csv");
|
||||||
|
let mut stem = orig_path.file_stem().and_then(|s| s.to_str()).unwrap_or("isin-lei").to_string();
|
||||||
|
let parent = orig_path.parent().unwrap_or_else(|| Path::new("."));
|
||||||
|
// Avoid creating a double "_clean_clean.csv". If stem already ends with "_clean", keep it.
|
||||||
|
if stem.to_lowercase().ends_with("_clean") {
|
||||||
|
// stem is already clean; keep same filename (no double suffix)
|
||||||
|
// e.g., stem="isin-lei-24112025_clean" -> clean_name="isin-lei-24112025_clean.csv"
|
||||||
|
} else {
|
||||||
|
stem = format!("{}_clean", stem);
|
||||||
|
}
|
||||||
|
|
||||||
|
let clean_name = format!("{}.csv", stem);
|
||||||
|
let clean_path = parent.join(&clean_name);
|
||||||
|
|
||||||
|
// If a clean file already exists, operate on it; otherwise read original and write clean file
|
||||||
|
let source_path = if clean_path.exists() { &clean_path } else { orig_path };
|
||||||
|
|
||||||
|
let debug_msg = format!("Reading GLEIF source for batch removal: {} (writing to {})", source_path.display(), clean_path.display());
|
||||||
|
logger::log_info(&debug_msg).await;
|
||||||
|
|
||||||
|
// Cleanup any accidental double-clean files in the same directory: if a file ends with
|
||||||
|
// "_clean_clean.csv" replace it with single "_clean.csv" or remove it if target exists.
|
||||||
|
if let Ok(mut dir_entries) = tokio_fs::read_dir(parent).await {
|
||||||
|
while let Ok(Some(entry)) = dir_entries.next_entry().await {
|
||||||
|
if let Some(name) = entry.file_name().to_str().map(|s| s.to_string()) {
|
||||||
|
if name.to_lowercase().ends_with("_clean_clean.csv") {
|
||||||
|
let offending = entry.path();
|
||||||
|
let candidate = offending.file_name().and_then(|n| n.to_str()).unwrap_or("");
|
||||||
|
let target_name = candidate.replacen("_clean_clean.csv", "_clean.csv", 1);
|
||||||
|
let target_path = parent.join(target_name);
|
||||||
|
|
||||||
|
if !target_path.exists() {
|
||||||
|
// Rename offending -> target
|
||||||
|
let _ = tokio_fs::rename(&offending, &target_path).await;
|
||||||
|
let msg = format!("Renamed {} -> {}", offending.display(), target_path.display());
|
||||||
|
logger::log_info(&msg).await;
|
||||||
|
} else {
|
||||||
|
// Target exists already; remove offending duplicate
|
||||||
|
let _ = tokio_fs::remove_file(&offending).await;
|
||||||
|
let msg = format!("Removed duplicate {}", offending.display());
|
||||||
|
logger::log_info(&msg).await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Read file into memory and parse with csv crate for robust handling of quoted fields
|
||||||
|
let content = tokio_fs::read_to_string(source_path)
|
||||||
|
.await
|
||||||
|
.context("Failed to read GLEIF CSV source")?;
|
||||||
|
|
||||||
|
// Convert LEIs to remove into a HashSet (normalized)
|
||||||
|
let remove_set: std::collections::HashSet<String> = leis_to_remove
|
||||||
|
.iter()
|
||||||
|
.map(|s| s.trim().trim_matches('"').to_uppercase())
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
// Build CSV reader: try with headers first; allow flexible records
|
||||||
|
let mut reader = ReaderBuilder::new()
|
||||||
|
.has_headers(true)
|
||||||
|
.flexible(true)
|
||||||
|
.from_reader(content.as_bytes());
|
||||||
|
|
||||||
|
// Remember headers (if present) and then iterate records.
|
||||||
|
let headers_record = match reader.headers() {
|
||||||
|
Ok(h) => Some(h.clone()),
|
||||||
|
Err(_) => None,
|
||||||
|
};
|
||||||
|
|
||||||
|
// We'll collect kept records and count original rows
|
||||||
|
let mut kept_records: Vec<StringRecord> = Vec::new();
|
||||||
|
let mut original_count: usize = 0;
|
||||||
|
let mut removed_count: usize = 0;
|
||||||
|
|
||||||
|
// For robustness, search all columns for a matching LEI instead of relying on a single column index.
|
||||||
|
for result in reader.records() {
|
||||||
|
let record = result.context("Failed to parse CSV record")?;
|
||||||
|
original_count += 1;
|
||||||
|
|
||||||
|
// Check every field for a match in the remove set
|
||||||
|
let mut matched = false;
|
||||||
|
for field in record.iter() {
|
||||||
|
let norm = field.trim().trim_matches('"').to_uppercase();
|
||||||
|
if remove_set.contains(&norm) {
|
||||||
|
matched = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if matched {
|
||||||
|
removed_count += 1;
|
||||||
|
} else {
|
||||||
|
kept_records.push(record.clone());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let new_count = kept_records.len();
|
||||||
|
|
||||||
|
// Write back using csv writer to preserve quoting/format into clean file
|
||||||
|
let mut wtr = WriterBuilder::new().has_headers(true).from_writer(vec![]);
|
||||||
|
// If original had headers, write them back
|
||||||
|
if let Some(headers) = headers_record {
|
||||||
|
wtr.write_record(headers.iter())?;
|
||||||
|
}
|
||||||
|
|
||||||
|
for rec in &kept_records {
|
||||||
|
wtr.write_record(rec.iter())?;
|
||||||
|
}
|
||||||
|
|
||||||
|
let out_bytes = wtr.into_inner().context("Failed to finalize CSV writer")?;
|
||||||
|
let out_str = String::from_utf8(out_bytes).context("CSV output not valid UTF-8")?;
|
||||||
|
|
||||||
|
tokio_fs::write(&clean_path, out_str)
|
||||||
|
.await
|
||||||
|
.context("Failed to write filtered GLEIF CSV clean file")?;
|
||||||
|
|
||||||
|
let success_msg = format!(
|
||||||
|
"✓ Batch attempted to remove {} LEIs from GLEIF CSV (was {} records, now {} records, removed {} rows) -> {}",
|
||||||
|
leis_to_remove.len(), original_count, new_count, removed_count, clean_path.display()
|
||||||
|
);
|
||||||
|
println!("{}", success_msg);
|
||||||
|
logger::log_info(&success_msg).await;
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
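For reference, the filtering step above boils down to the pattern sketched below: read the CSV with the csv crate, drop every record where some field (trimmed and upper-cased) appears in the removal set, and write the survivors back out. This is an editorial sketch, not part of the diff; the helper name filter_rows and its inputs are hypothetical, and it leans on the same anyhow and csv APIs used in the code above.

// Editorial sketch, not part of the diff. `filter_rows` is a hypothetical helper.
use std::collections::HashSet;
use anyhow::Context;
use csv::{ReaderBuilder, WriterBuilder};

fn filter_rows(csv_text: &str, remove: &HashSet<String>) -> anyhow::Result<String> {
    let mut reader = ReaderBuilder::new()
        .has_headers(true)
        .flexible(true)
        .from_reader(csv_text.as_bytes());
    let headers = reader.headers().context("CSV has no header row")?.clone();

    let mut wtr = WriterBuilder::new().from_writer(vec![]);
    wtr.write_record(headers.iter())?;

    for result in reader.records() {
        let record = result.context("Failed to parse CSV record")?;
        // Same matching rule as above: any field, trimmed and upper-cased,
        // that appears in the removal set disqualifies the whole row.
        let drop_row = record
            .iter()
            .any(|field| remove.contains(&field.trim().trim_matches('"').to_uppercase()));
        if !drop_row {
            wtr.write_record(record.iter())?;
        }
    }

    let bytes = wtr.into_inner().context("Failed to finalize CSV writer")?;
    Ok(String::from_utf8(bytes).context("CSV output not valid UTF-8")?)
}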
@@ -385,23 +960,26 @@ pub async fn load_or_build_all_securities(
|
|||||||
HashMap<String, HashMap<String, OptionInfo>>
|
HashMap<String, HashMap<String, OptionInfo>>
|
||||||
)> {
|
)> {
|
||||||
// Load existing data
|
// Load existing data
|
||||||
let mut companies = load_from_cache("data/corporate/by_name/common_stocks.json").await?
|
let mut commons = load_from_cache("data/corporate/by_name/common_stocks.json").await?
|
||||||
.unwrap_or_else(HashMap::new);
|
.unwrap_or_else(HashMap::new);
|
||||||
let mut warrants = load_from_cache("data/corporate/by_name/warrants.json").await?
|
let mut warrants = load_from_cache("data/corporate/by_name/warrants.json").await?
|
||||||
.unwrap_or_else(HashMap::new);
|
.unwrap_or_else(HashMap::new);
|
||||||
let mut options = load_from_cache("data/corporate/by_name/options.json").await?
|
let mut options = load_from_cache("data/corporate/by_name/options.json").await?
|
||||||
.unwrap_or_else(HashMap::new);
|
.unwrap_or_else(HashMap::new);
|
||||||
|
/*let mut preferred = load_from_cache("data/corporate/by_name/preferred.json").await?
|
||||||
|
.unwrap_or_else(HashMap::new);*/
|
||||||
|
|
||||||
|
|
||||||
println!("Loaded existing data:");
|
println!("Loaded existing data:");
|
||||||
println!(" - Companies: {}", companies.len());
|
println!(" - Companies: {}", commons.len());
|
||||||
println!(" - Warrants: {}", warrants.len());
|
println!(" - Warrants: {}", warrants.len());
|
||||||
println!(" - Options: {}", options.len());
|
println!(" - Options: {}", options.len());
|
||||||
|
|
||||||
let mut stats = ProcessingStats::new(companies.len(), warrants.len(), options.len());
|
let mut stats = ProcessingStats::new(commons.len(), warrants.len(), options.len());
|
||||||
|
|
||||||
println!("Processing {} LEI entries from FIGI data...", figi_to_lei.len());
|
println!("Processing {} LEI entries from FIGI data...", figi_to_lei.len());
|
||||||
|
|
||||||
for (lei, figi_infos) in figi_to_lei.iter() {
|
for (_lei, figi_infos) in figi_to_lei.iter() {
|
||||||
if figi_infos.is_empty() {
|
if figi_infos.is_empty() {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@@ -412,7 +990,7 @@ pub async fn load_or_build_all_securities(
|
|||||||
let mut option_securities = Vec::new();
|
let mut option_securities = Vec::new();
|
||||||
|
|
||||||
for figi_info in figi_infos {
|
for figi_info in figi_infos {
|
||||||
match figi_info.securityType.as_str() {
|
match figi_info.security_type.as_str() {
|
||||||
"Common Stock" => common_stocks.push(figi_info.clone()),
|
"Common Stock" => common_stocks.push(figi_info.clone()),
|
||||||
"Equity WRT" => warrant_securities.push(figi_info.clone()),
|
"Equity WRT" => warrant_securities.push(figi_info.clone()),
|
||||||
"Equity Option" => option_securities.push(figi_info.clone()),
|
"Equity Option" => option_securities.push(figi_info.clone()),
|
||||||
@@ -422,7 +1000,7 @@ pub async fn load_or_build_all_securities(
|
|||||||
|
|
||||||
// Process common stocks -> companies
|
// Process common stocks -> companies
|
||||||
if !common_stocks.is_empty() {
|
if !common_stocks.is_empty() {
|
||||||
process_common_stocks(&mut companies, &common_stocks, &mut stats);
|
process_common_stocks(&mut commons, &common_stocks, &mut stats);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Process warrants
|
// Process warrants
|
||||||
@@ -436,14 +1014,14 @@ pub async fn load_or_build_all_securities(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
stats.print_summary(companies.len(), warrants.len(), options.len());
|
stats.print_summary(commons.len(), warrants.len(), options.len());
|
||||||
|
|
||||||
// Save all three HashMaps
|
// Save all three HashMaps
|
||||||
save_to_cache("data/corporate/by_name/common_stocks.json", &companies).await?;
|
save_to_cache("data/corporate/by_name/common_stocks.json", &commons).await?;
|
||||||
save_to_cache("data/corporate/by_name/warrants.json", &warrants).await?;
|
save_to_cache("data/corporate/by_name/warrants.json", &warrants).await?;
|
||||||
save_to_cache("data/corporate/by_name/options.json", &options).await?;
|
save_to_cache("data/corporate/by_name/options.json", &options).await?;
|
||||||
|
|
||||||
Ok((companies, warrants, options))
|
Ok((commons, warrants, options))
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Statistics tracker for processing
|
/// Statistics tracker for processing
|
||||||
@@ -804,10 +1382,11 @@ where
|
|||||||
pub async fn load_figi_type_lists() -> anyhow::Result<()> {
|
pub async fn load_figi_type_lists() -> anyhow::Result<()> {
|
||||||
println!("Loading OpenFIGI mapping value lists...");
|
println!("Loading OpenFIGI mapping value lists...");
|
||||||
|
|
||||||
let client = OpenFigiClient::new()?;
|
let client = OpenFigiClient::new().await?;
|
||||||
|
|
||||||
// Create cache directory
|
// Create cache directory
|
||||||
let cache_dir = Path::new("data/openfigi");
|
let dir = DataPaths::new(".")?;
|
||||||
|
let cache_dir = dir.cache_openfigi_dir();
|
||||||
tokio_fs::create_dir_all(cache_dir).await
|
tokio_fs::create_dir_all(cache_dir).await
|
||||||
.context("Failed to create data/openfigi directory")?;
|
.context("Failed to create data/openfigi directory")?;
|
||||||
|
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
// src/corporate/scraper.rs
|
// src/corporate/scraper.rs
|
||||||
use super::{types::*, helpers::*, openfigi::*};
|
use super::{types::*, helpers::*, openfigi::*};
|
||||||
//use crate::corporate::openfigi::OpenFigiClient;
|
//use crate::corporate::openfigi::OpenFigiClient;
|
||||||
use crate::{scraper::webdriver::*};
|
use crate::{scraper::webdriver::*, util::directories::DataPaths, util::logger};
|
||||||
use fantoccini::{Client, Locator};
|
use fantoccini::{Client, Locator};
|
||||||
use scraper::{Html, Selector};
|
use scraper::{Html, Selector};
|
||||||
use chrono::{DateTime, Duration, NaiveDate, Utc};
|
use chrono::{DateTime, Duration, NaiveDate, Utc};
|
||||||
@@ -15,160 +15,6 @@ use anyhow::{anyhow, Result};
|
|||||||
|
|
||||||
const USER_AGENT: &str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36";
|
const USER_AGENT: &str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36";
|
||||||
|
|
||||||
/// Discover all exchanges where this ISIN trades by querying Yahoo Finance and enriching with OpenFIGI API calls.
|
|
||||||
///
|
|
||||||
/// # Arguments
|
|
||||||
/// * `isin` - The ISIN to search for.
|
|
||||||
/// * `known_ticker` - A known ticker symbol for fallback or initial check.
|
|
||||||
///
|
|
||||||
/// # Returns
|
|
||||||
/// A vector of FigiInfo structs containing enriched data from API calls.
|
|
||||||
///
|
|
||||||
/// # Errors
|
|
||||||
/// Returns an error if HTTP requests fail, JSON parsing fails, or OpenFIGI API responds with an error.
|
|
||||||
pub async fn discover_available_exchanges(isin: &str, known_ticker: &str) -> anyhow::Result<Vec<FigiInfo>> {
|
|
||||||
println!(" Discovering exchanges for ISIN {}", isin);
|
|
||||||
|
|
||||||
let mut potential: Vec<(String, PrimaryInfo)> = Vec::new();
|
|
||||||
|
|
||||||
// Try the primary ticker first
|
|
||||||
if let Ok(info) = check_ticker_exists(known_ticker).await {
|
|
||||||
potential.push((known_ticker.to_string(), info));
|
|
||||||
}
|
|
||||||
|
|
||||||
// Search for ISIN directly on Yahoo to find other listings
|
|
||||||
let search_url = format!(
|
|
||||||
"https://query2.finance.yahoo.com/v1/finance/search?q={}"esCount=20&newsCount=0",
|
|
||||||
isin
|
|
||||||
);
|
|
||||||
|
|
||||||
let resp = HttpClient::new()
|
|
||||||
.get(&search_url)
|
|
||||||
.header("User-Agent", USER_AGENT)
|
|
||||||
.send()
|
|
||||||
.await?;
|
|
||||||
|
|
||||||
let json = resp.json::<Value>().await?;
|
|
||||||
|
|
||||||
if let Some(quotes) = json["quotes"].as_array() {
|
|
||||||
for quote in quotes {
|
|
||||||
// First: filter by quoteType directly from search results (faster rejection)
|
|
||||||
let quote_type = quote["quoteType"].as_str().unwrap_or("");
|
|
||||||
if quote_type.to_uppercase() != "EQUITY" {
|
|
||||||
continue; // Skip bonds, ETFs, mutual funds, options, etc.
|
|
||||||
}
|
|
||||||
|
|
||||||
if let Some(symbol) = quote["symbol"].as_str() {
|
|
||||||
// Avoid duplicates
|
|
||||||
if potential.iter().any(|(s, _)| s == symbol) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Double-check with full quote data (some search results are misleading)
|
|
||||||
if let Ok(info) = check_ticker_exists(symbol).await {
|
|
||||||
potential.push((symbol.to_string(), info));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if potential.is_empty() {
|
|
||||||
return Ok(vec![]);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Enrich with OpenFIGI API
|
|
||||||
let client = OpenFigiClient::new()?;
|
|
||||||
|
|
||||||
let mut discovered_figis = Vec::new();
|
|
||||||
|
|
||||||
if !client.has_key() {
|
|
||||||
// Fallback without API key - create FigiInfo with default/empty fields
|
|
||||||
for (symbol, info) in potential {
|
|
||||||
println!(" Found equity listing: {} on {} ({}) - no FIGI (fallback mode)", symbol, info.exchange_mic, info.currency);
|
|
||||||
let figi_info = FigiInfo {
|
|
||||||
isin: info.isin,
|
|
||||||
figi: String::new(),
|
|
||||||
name: info.name,
|
|
||||||
ticker: symbol,
|
|
||||||
mic_code: info.exchange_mic,
|
|
||||||
currency: info.currency,
|
|
||||||
compositeFIGI: String::new(),
|
|
||||||
securityType: String::new(),
|
|
||||||
marketSector: String::new(),
|
|
||||||
shareClassFIGI: String::new(),
|
|
||||||
securityType2: String::new(),
|
|
||||||
securityDescription: String::new(),
|
|
||||||
};
|
|
||||||
discovered_figis.push(figi_info);
|
|
||||||
}
|
|
||||||
return Ok(discovered_figis);
|
|
||||||
}
|
|
||||||
|
|
||||||
// With API key, batch the mapping requests
|
|
||||||
let chunk_size = 100;
|
|
||||||
for chunk in potential.chunks(chunk_size) {
|
|
||||||
let mut jobs = vec![];
|
|
||||||
for (symbol, info) in chunk {
|
|
||||||
jobs.push(json!({
|
|
||||||
"idType": "TICKER",
|
|
||||||
"idValue": symbol,
|
|
||||||
"micCode": info.exchange_mic,
|
|
||||||
"marketSecDes": "Equity",
|
|
||||||
}));
|
|
||||||
}
|
|
||||||
|
|
||||||
let resp = client.get_figi_client()
|
|
||||||
.post("https://api.openfigi.com/v3/mapping")
|
|
||||||
.header("Content-Type", "application/json")
|
|
||||||
.json(&jobs)
|
|
||||||
.send()
|
|
||||||
.await?;
|
|
||||||
|
|
||||||
if !resp.status().is_success() {
|
|
||||||
return Err(anyhow::anyhow!("OpenFIGI mapping failed with status: {}", resp.status()));
|
|
||||||
}
|
|
||||||
|
|
||||||
let parsed: Vec<Value> = resp.json().await?;
|
|
||||||
|
|
||||||
for (i, item) in parsed.iter().enumerate() {
|
|
||||||
let (symbol, info) = &chunk[i];
|
|
||||||
if let Some(data) = item["data"].as_array() {
|
|
||||||
if let Some(entry) = data.first() {
|
|
||||||
let market_sec = entry["marketSector"].as_str().unwrap_or("");
|
|
||||||
if market_sec != "Equity" {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
println!(" Found equity listing: {} on {} ({}) - FIGI: {}", symbol, info.exchange_mic, info.currency, entry["figi"]);
|
|
||||||
let figi_info = FigiInfo {
|
|
||||||
isin: info.isin.clone(),
|
|
||||||
figi: entry["figi"].as_str().unwrap_or("").to_string(),
|
|
||||||
name: entry["name"].as_str().unwrap_or(&info.name).to_string(),
|
|
||||||
ticker: symbol.clone(),
|
|
||||||
mic_code: info.exchange_mic.clone(),
|
|
||||||
currency: info.currency.clone(),
|
|
||||||
compositeFIGI: entry["compositeFIGI"].as_str().unwrap_or("").to_string(),
|
|
||||||
securityType: entry["securityType"].as_str().unwrap_or("").to_string(),
|
|
||||||
marketSector: market_sec.to_string(),
|
|
||||||
shareClassFIGI: entry["shareClassFIGI"].as_str().unwrap_or("").to_string(),
|
|
||||||
securityType2: entry["securityType2"].as_str().unwrap_or("").to_string(),
|
|
||||||
securityDescription: entry["securityDescription"].as_str().unwrap_or("").to_string(),
|
|
||||||
};
|
|
||||||
discovered_figis.push(figi_info);
|
|
||||||
} else {
|
|
||||||
println!(" No data returned for ticker {} on MIC {}", symbol, info.exchange_mic);
|
|
||||||
}
|
|
||||||
} else if let Some(error) = item["error"].as_str() {
|
|
||||||
println!(" OpenFIGI error for ticker {}: {}", symbol, error);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Respect rate limit (6 seconds between requests with key)
|
|
||||||
sleep(TokioDuration::from_secs(6)).await;
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(discovered_figis)
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Check if a ticker exists on Yahoo Finance and return core metadata.
|
/// Check if a ticker exists on Yahoo Finance and return core metadata.
|
||||||
///
|
///
|
||||||
/// This function calls the public Yahoo Finance quoteSummary endpoint and extracts:
|
/// This function calls the public Yahoo Finance quoteSummary endpoint and extracts:
|
||||||
@@ -190,7 +36,7 @@ pub async fn discover_available_exchanges(isin: &str, known_ticker: &str) -> any
|
|||||||
/// - Not an equity (ETF, bond, etc.)
|
/// - Not an equity (ETF, bond, etc.)
|
||||||
/// - Missing critical fields
|
/// - Missing critical fields
|
||||||
/// - Network or JSON parsing errors
|
/// - Network or JSON parsing errors
|
||||||
pub async fn check_ticker_exists(ticker: &str) -> anyhow::Result<PrimaryInfo> {
|
/*pub async fn check_ticker_exists(ticker: &str) -> anyhow::Result<PrimaryInfo> {
|
||||||
let url = format!(
|
let url = format!(
|
||||||
"https://query1.finance.yahoo.com/v10/finance/quoteSummary/{}?modules=price%2CassetProfile",
|
"https://query1.finance.yahoo.com/v10/finance/quoteSummary/{}?modules=price%2CassetProfile",
|
||||||
ticker
|
ticker
|
||||||
@@ -303,34 +149,7 @@ pub async fn check_ticker_exists(ticker: &str) -> anyhow::Result<PrimaryInfo> {
|
|||||||
exchange_mic,
|
exchange_mic,
|
||||||
currency,
|
currency,
|
||||||
})
|
})
|
||||||
}
|
}*/
|
||||||
|
|
||||||
/// Convert Yahoo's exchange name to MIC code (best effort)
|
|
||||||
fn exchange_name_to_mic(name: &str) -> String {
|
|
||||||
match name {
|
|
||||||
"NMS" | "NasdaqGS" | "NASDAQ" => "XNAS",
|
|
||||||
"NYQ" | "NYSE" => "XNYS",
|
|
||||||
"LSE" | "London" => "XLON",
|
|
||||||
"FRA" | "Frankfurt" | "GER" | "XETRA" => "XFRA",
|
|
||||||
"PAR" | "Paris" => "XPAR",
|
|
||||||
"AMS" | "Amsterdam" => "XAMS",
|
|
||||||
"MIL" | "Milan" => "XMIL",
|
|
||||||
"JPX" | "Tokyo" => "XJPX",
|
|
||||||
"HKG" | "Hong Kong" => "XHKG",
|
|
||||||
"SHH" | "Shanghai" => "XSHG",
|
|
||||||
"SHZ" | "Shenzhen" => "XSHE",
|
|
||||||
"TOR" | "Toronto" => "XTSE",
|
|
||||||
"ASX" | "Australia" => "XASX",
|
|
||||||
"SAU" | "Saudi" => "XSAU",
|
|
||||||
"SWX" | "Switzerland" => "XSWX",
|
|
||||||
"BSE" | "Bombay" => "XBSE",
|
|
||||||
"NSE" | "NSI" => "XNSE",
|
|
||||||
"TAI" | "Taiwan" => "XTAI",
|
|
||||||
"SAO" | "Sao Paulo" => "BVMF",
|
|
||||||
"MCE" | "Madrid" => "XMAD",
|
|
||||||
_ => name, // Fallback to name itself
|
|
||||||
}.to_string()
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Fetches earnings events for a ticker using a dedicated ScrapeTask.
|
/// Fetches earnings events for a ticker using a dedicated ScrapeTask.
|
||||||
///
|
///
|
||||||
@@ -670,60 +489,164 @@ pub async fn _fetch_latest_gleif_isin_lei_mapping_url(client: &Client) -> anyhow
|
|||||||
|
|
||||||
pub async fn download_isin_lei_csv() -> anyhow::Result<Option<String>> {
|
pub async fn download_isin_lei_csv() -> anyhow::Result<Option<String>> {
|
||||||
let url = "https://mapping.gleif.org/api/v2/isin-lei/9315e3e3-305a-4e71-b062-46714740fa8d/download";
|
let url = "https://mapping.gleif.org/api/v2/isin-lei/9315e3e3-305a-4e71-b062-46714740fa8d/download";
|
||||||
let zip_path = "data/gleif/isin_lei.zip";
|
|
||||||
let csv_path = "data/gleif/isin_lei.csv";
|
|
||||||
|
|
||||||
if let Err(e) = std::fs::create_dir_all("data") {
|
// Initialize DataPaths and create cache/gleif directory
|
||||||
println!("Failed to create data directory: {e}");
|
let paths = DataPaths::new(".")?;
|
||||||
|
let gleif_cache_dir = paths.cache_gleif_dir();
|
||||||
|
|
||||||
|
if let Err(e) = std::fs::create_dir_all(&gleif_cache_dir) {
|
||||||
|
let msg = format!("Failed to create cache/gleif directory: {}", e);
|
||||||
|
logger::log_error(&msg).await;
|
||||||
|
println!("{}", msg);
|
||||||
return Ok(None);
|
return Ok(None);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Download ZIP
|
logger::log_info("Corporate Scraper: Downloading ISIN/LEI mapping from GLEIF...").await;
|
||||||
let bytes = match reqwest::Client::builder()
|
|
||||||
|
// Download ZIP and get the filename from Content-Disposition header
|
||||||
|
let client = match reqwest::Client::builder()
|
||||||
.user_agent(USER_AGENT)
|
.user_agent(USER_AGENT)
|
||||||
.timeout(std::time::Duration::from_secs(30))
|
.timeout(std::time::Duration::from_secs(30))
|
||||||
.build()
|
.build()
|
||||||
.and_then(|c| Ok(c))
|
|
||||||
{
|
{
|
||||||
Ok(client) => match client.get(url).send().await {
|
Ok(c) => c,
|
||||||
Ok(resp) if resp.status().is_success() => match resp.bytes().await {
|
|
||||||
Ok(b) => b,
|
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
println!("Failed to read ZIP bytes: {e}");
|
let msg = format!("Failed to create HTTP client: {}", e);
|
||||||
return Ok(None);
|
logger::log_error(&msg).await;
|
||||||
}
|
println!("{}", msg);
|
||||||
},
|
|
||||||
Ok(resp) => {
|
|
||||||
println!("Server returned HTTP {}", resp.status());
|
|
||||||
return Ok(None);
|
|
||||||
}
|
|
||||||
Err(e) => {
|
|
||||||
println!("Failed to download ISIN/LEI ZIP: {e}");
|
|
||||||
return Ok(None);
|
|
||||||
}
|
|
||||||
},
|
|
||||||
Err(e) => {
|
|
||||||
println!("Failed to create HTTP client: {e}");
|
|
||||||
return Ok(None);
|
return Ok(None);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
if let Err(e) = tokio::fs::write(zip_path, &bytes).await {
|
let resp = match client.get(url).send().await {
|
||||||
println!("Failed to write ZIP file: {e}");
|
Ok(r) if r.status().is_success() => r,
|
||||||
|
Ok(resp) => {
|
||||||
|
let msg = format!("Server returned HTTP {}", resp.status());
|
||||||
|
logger::log_error(&msg).await;
|
||||||
|
println!("{}", msg);
|
||||||
return Ok(None);
|
return Ok(None);
|
||||||
}
|
}
|
||||||
|
Err(e) => {
|
||||||
|
let msg = format!("Failed to download ISIN/LEI ZIP: {}", e);
|
||||||
|
logger::log_error(&msg).await;
|
||||||
|
println!("{}", msg);
|
||||||
|
return Ok(None);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// Extract filename from Content-Disposition header or use default
|
||||||
|
let filename = resp
|
||||||
|
.headers()
|
||||||
|
.get("content-disposition")
|
||||||
|
.and_then(|h| h.to_str().ok())
|
||||||
|
.and_then(|s| s.split("filename=").nth(1).map(|f| f.trim_matches('"').to_string()))
|
||||||
|
.unwrap_or_else(|| "isin_lei.zip".to_string());
|
||||||
|
|
||||||
|
// Parse timestamp from filename and convert to DDMMYYYY format
|
||||||
|
let parsed_filename = parse_gleif_filename(&filename);
|
||||||
|
logger::log_info(&format!("Corporate Scraper: Downloaded file: {} -> {}", filename, parsed_filename)).await;
|
||||||
|
|
||||||
|
// Determine date (DDMMYYYY) from parsed filename: "isin-lei-DDMMYYYY.csv"
|
||||||
|
let mut date_str = String::new();
|
||||||
|
if let Some(start_idx) = parsed_filename.find("isin-lei-") {
|
||||||
|
let rest = &parsed_filename[start_idx + 9..];
|
||||||
|
if rest.len() >= 8 {
|
||||||
|
date_str = rest[0..8].to_string();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// If we parsed a date, use/create a date folder under cache/gleif and operate inside it; otherwise use cache root.
|
||||||
|
let date_dir = if !date_str.is_empty() {
|
||||||
|
let p = gleif_cache_dir.join(&date_str);
|
||||||
|
// Ensure the date folder exists (create if necessary)
|
||||||
|
if let Err(e) = std::fs::create_dir_all(&p) {
|
||||||
|
let msg = format!("Failed to create date directory {:?}: {}", p, e);
|
||||||
|
logger::log_warn(&msg).await;
|
||||||
|
None
|
||||||
|
} else {
|
||||||
|
Some(p)
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
};
|
||||||
|
|
||||||
|
// Choose the directory where we'll look for existing files and where we'll save the new ones
|
||||||
|
let target_dir = date_dir.clone().unwrap_or_else(|| gleif_cache_dir.to_path_buf());
|
||||||
|
|
||||||
|
// If the date folder exists (or was created), prefer any *_clean.csv inside it and return that immediately
|
||||||
|
if let Some(ref ddir) = date_dir {
|
||||||
|
if let Ok(entries) = std::fs::read_dir(ddir) {
|
||||||
|
for entry in entries.flatten() {
|
||||||
|
if let Some(name) = entry.file_name().to_str() {
|
||||||
|
if name.to_lowercase().ends_with("_clean.csv") {
|
||||||
|
let path = ddir.join(name);
|
||||||
|
logger::log_info(&format!("Found existing clean GLEIF CSV: {}", path.display())).await;
|
||||||
|
return Ok(Some(path.to_string_lossy().to_string()));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// If no clean file found in the date folder (or date folder doesn't exist), check whether the csv/zip already exist in the target dir
|
||||||
|
let csv_candidate_name = parsed_filename.replace(".zip", ".csv");
|
||||||
|
let csv_candidate = target_dir.join(&csv_candidate_name);
|
||||||
|
let zip_candidate = target_dir.join(&parsed_filename);
|
||||||
|
|
||||||
|
if csv_candidate.exists() {
|
||||||
|
logger::log_info(&format!("Found existing GLEIF CSV: {}", csv_candidate.display())).await;
|
||||||
|
return Ok(Some(csv_candidate.to_string_lossy().to_string()));
|
||||||
|
}
|
||||||
|
if zip_candidate.exists() {
|
||||||
|
// If zip exists but csv does not, extract later; for now prefer returning csv path (may be created by extraction step)
|
||||||
|
let inferred_csv = target_dir.join(csv_candidate_name);
|
||||||
|
if inferred_csv.exists() {
|
||||||
|
logger::log_info(&format!("Found existing extracted CSV next to ZIP: {}", inferred_csv.display())).await;
|
||||||
|
return Ok(Some(inferred_csv.to_string_lossy().to_string()));
|
||||||
|
}
|
||||||
|
// otherwise we'll overwrite/extract into target_dir below
|
||||||
|
}
|
||||||
|
|
||||||
|
let bytes = match resp.bytes().await {
|
||||||
|
Ok(b) => b,
|
||||||
|
Err(e) => {
|
||||||
|
let msg = format!("Failed to read ZIP bytes: {}", e);
|
||||||
|
logger::log_error(&msg).await;
|
||||||
|
println!("{}", msg);
|
||||||
|
return Ok(None);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
// Ensure target directory exists (create if it's the date folder and was absent earlier)
|
||||||
|
if let Some(ref ddir) = date_dir {
|
||||||
|
let _ = std::fs::create_dir_all(ddir);
|
||||||
|
}
|
||||||
|
|
||||||
|
let zip_path = target_dir.join(&parsed_filename);
|
||||||
|
let csv_path = target_dir.join(parsed_filename.replace(".zip", ".csv"));
|
||||||
|
|
||||||
|
if let Err(e) = tokio::fs::write(&zip_path, &bytes).await {
|
||||||
|
let msg = format!("Failed to write ZIP file: {}", e);
|
||||||
|
logger::log_error(&msg).await;
|
||||||
|
println!("{}", msg);
|
||||||
|
return Ok(None);
|
||||||
|
}
|
||||||
|
logger::log_info(&format!("Corporate Scraper: Saved ZIP to {:?}", zip_path)).await;
|
||||||
|
|
||||||
// Extract CSV
|
// Extract CSV
|
||||||
let archive = match std::fs::File::open(zip_path)
|
let archive = match std::fs::File::open(&zip_path)
|
||||||
.map(ZipArchive::new)
|
.map(ZipArchive::new)
|
||||||
{
|
{
|
||||||
Ok(Ok(a)) => a,
|
Ok(Ok(a)) => a,
|
||||||
Ok(Err(e)) => {
|
Ok(Err(e)) => {
|
||||||
println!("Invalid ZIP: {e}");
|
let msg = format!("Invalid ZIP: {}", e);
|
||||||
|
logger::log_error(&msg).await;
|
||||||
|
println!("{}", msg);
|
||||||
return Ok(None);
|
return Ok(None);
|
||||||
}
|
}
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
println!("Cannot open ZIP file: {e}");
|
let msg = format!("Cannot open ZIP file: {}", e);
|
||||||
|
logger::log_error(&msg).await;
|
||||||
|
println!("{}", msg);
|
||||||
return Ok(None);
|
return Ok(None);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
@@ -737,7 +660,9 @@ pub async fn download_isin_lei_csv() -> anyhow::Result<Option<String>> {
|
|||||||
}) {
|
}) {
|
||||||
Some(i) => i,
|
Some(i) => i,
|
||||||
None => {
|
None => {
|
||||||
println!("ZIP did not contain a CSV file");
|
let msg = "ZIP did not contain a CSV file";
|
||||||
|
logger::log_error(msg).await;
|
||||||
|
println!("{}", msg);
|
||||||
return Ok(None);
|
return Ok(None);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
@@ -745,23 +670,55 @@ pub async fn download_isin_lei_csv() -> anyhow::Result<Option<String>> {
|
|||||||
let mut csv_file = match archive.by_index(idx) {
|
let mut csv_file = match archive.by_index(idx) {
|
||||||
Ok(f) => f,
|
Ok(f) => f,
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
println!("Failed to read CSV entry: {e}");
|
let msg = format!("Failed to read CSV entry: {}", e);
|
||||||
|
logger::log_error(&msg).await;
|
||||||
|
println!("{}", msg);
|
||||||
return Ok(None);
|
return Ok(None);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
let mut csv_bytes = Vec::new();
|
let mut csv_bytes = Vec::new();
|
||||||
if let Err(e) = csv_file.read_to_end(&mut csv_bytes) {
|
if let Err(e) = csv_file.read_to_end(&mut csv_bytes) {
|
||||||
println!("Failed to extract CSV: {e}");
|
let msg = format!("Failed to extract CSV: {}", e);
|
||||||
|
logger::log_error(&msg).await;
|
||||||
return Ok(None);
|
return Ok(None);
|
||||||
}
|
}
|
||||||
|
|
||||||
if let Err(e) = tokio::fs::write(csv_path, &csv_bytes).await {
|
if let Err(e) = tokio::fs::write(&csv_path, &csv_bytes).await {
|
||||||
println!("Failed to save CSV file: {e}");
|
let msg = format!("Failed to save CSV file: {}", e);
|
||||||
|
logger::log_error(&msg).await;
|
||||||
return Ok(None);
|
return Ok(None);
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(Some(csv_path.to_string()))
|
let msg = format!("✓ ISIN/LEI CSV extracted: {:?}", csv_path);
|
||||||
|
logger::log_info(&msg).await;
|
||||||
|
|
||||||
|
Ok(Some(csv_path.to_string_lossy().to_string()))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Parse GLEIF filename and convert timestamp to DDMMYYYY format
|
||||||
|
/// Example: "isin-lei-20251124T080254.csv" -> "isin-lei-24112025.csv"
|
||||||
|
fn parse_gleif_filename(filename: &str) -> String {
|
||||||
|
// Try to find pattern: isin-lei-YYYYMMDDTHHMMSS.zip/csv
|
||||||
|
if let Some(start_idx) = filename.find("isin-lei-") {
|
||||||
|
let rest = &filename[start_idx + 9..]; // After "isin-lei-"
|
||||||
|
|
||||||
|
// Extract the 8 digits (YYYYMMDD)
|
||||||
|
if rest.len() >= 8 && rest[0..8].chars().all(|c| c.is_numeric()) {
|
||||||
|
let date_part = &rest[0..8];
|
||||||
|
// date_part is YYYYMMDD, convert to DDMMYYYY
|
||||||
|
if date_part.len() == 8 {
|
||||||
|
let year = &date_part[0..4];
|
||||||
|
let month = &date_part[4..6];
|
||||||
|
let day = &date_part[6..8];
|
||||||
|
let extension = if filename.ends_with(".zip") { ".zip" } else { ".csv" };
|
||||||
|
return format!("isin-lei-{}{}{}{}", day, month, year, extension);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Fallback: return original filename if parsing fails
|
||||||
|
filename.to_string()
|
||||||
}
|
}
|
||||||
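The documented conversion above (isin-lei-20251124T080254.csv -> isin-lei-24112025.csv) can be pinned down with a small unit test. This is an editorial sketch, not part of the diff, and assumes parse_gleif_filename remains a private function in this module.

// Editorial sketch, not part of the diff.
#[cfg(test)]
mod gleif_filename_tests {
    use super::parse_gleif_filename;

    #[test]
    fn converts_gleif_timestamp_to_ddmmyyyy() {
        assert_eq!(
            parse_gleif_filename("isin-lei-20251124T080254.csv"),
            "isin-lei-24112025.csv"
        );
        assert_eq!(
            parse_gleif_filename("isin-lei-20251124T080254.zip"),
            "isin-lei-24112025.zip"
        );
        // Names that do not match the pattern fall back to the original string.
        assert_eq!(parse_gleif_filename("mapping.csv"), "mapping.csv");
    }
}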
|
|
||||||
|
|
||||||
|
|||||||
@@ -1,20 +1,24 @@
|
|||||||
// src/corporate/storage.rs
|
// src/corporate/storage.rs
|
||||||
use super::{types::*, helpers::*};
|
use super::{types::*, helpers::*};
|
||||||
use crate::config;
|
use crate::util::directories::DataPaths;
|
||||||
|
use crate::util::logger;
|
||||||
|
|
||||||
use tokio::fs;
|
use tokio::fs;
|
||||||
|
use tokio::io::AsyncWriteExt;
|
||||||
use chrono::{Datelike, NaiveDate};
|
use chrono::{Datelike, NaiveDate};
|
||||||
use std::collections::{HashMap, HashSet};
|
use std::collections::{HashMap};
|
||||||
use std::path::{Path, PathBuf};
|
use std::path::{PathBuf};
|
||||||
|
|
||||||
pub async fn load_existing_events() -> anyhow::Result<HashMap<String, CompanyEvent>> {
|
pub async fn load_existing_events(paths: &DataPaths) -> anyhow::Result<HashMap<String, CompanyEvent>> {
|
||||||
let mut map = HashMap::new();
|
let mut map = HashMap::new();
|
||||||
let dir = std::path::Path::new("corporate_events");
|
let dir = paths.corporate_events_dir();
|
||||||
if !dir.exists() {
|
if !dir.exists() {
|
||||||
|
logger::log_info("Corporate Storage: No existing events directory found").await;
|
||||||
return Ok(map);
|
return Ok(map);
|
||||||
}
|
}
|
||||||
|
|
||||||
let mut entries = fs::read_dir(dir).await?;
|
let mut entries = fs::read_dir(dir).await?;
|
||||||
|
let mut loaded_count = 0;
|
||||||
while let Some(entry) = entries.next_entry().await? {
|
while let Some(entry) = entries.next_entry().await? {
|
||||||
let path = entry.path();
|
let path = entry.path();
|
||||||
if path.extension().and_then(|s| s.to_str()) == Some("json") {
|
if path.extension().and_then(|s| s.to_str()) == Some("json") {
|
||||||
@@ -25,25 +29,32 @@ pub async fn load_existing_events() -> anyhow::Result<HashMap<String, CompanyEve
|
|||||||
for event in events {
|
for event in events {
|
||||||
map.insert(event_key(&event), event);
|
map.insert(event_key(&event), event);
|
||||||
}
|
}
|
||||||
|
loaded_count += 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
logger::log_info(&format!("Corporate Storage: Loaded {} events from {} files", map.len(), loaded_count)).await;
|
||||||
Ok(map)
|
Ok(map)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub async fn save_optimized_events(events: HashMap<String, CompanyEvent>) -> anyhow::Result<()> {
|
pub async fn save_optimized_events(paths: &DataPaths, events: HashMap<String, CompanyEvent>) -> anyhow::Result<()> {
|
||||||
let dir = std::path::Path::new("corporate_events");
|
let dir = paths.corporate_events_dir();
|
||||||
fs::create_dir_all(dir).await?;
|
fs::create_dir_all(dir).await?;
|
||||||
|
|
||||||
|
logger::log_info("Corporate Storage: Removing old event files...").await;
|
||||||
|
let mut removed_count = 0;
|
||||||
let mut entries = fs::read_dir(dir).await?;
|
let mut entries = fs::read_dir(dir).await?;
|
||||||
while let Some(entry) = entries.next_entry().await? {
|
while let Some(entry) = entries.next_entry().await? {
|
||||||
let path = entry.path();
|
let path = entry.path();
|
||||||
let name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
|
let name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
|
||||||
if name.starts_with("events_") && path.extension().map(|e| e == "json").unwrap_or(false) {
|
if name.starts_with("events_") && path.extension().map(|e| e == "json").unwrap_or(false) {
|
||||||
fs::remove_file(&path).await?;
|
fs::remove_file(&path).await?;
|
||||||
|
removed_count += 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
logger::log_info(&format!("Corporate Storage: Removed {} old event files", removed_count)).await;
|
||||||
|
|
||||||
|
let total_events = events.len();
|
||||||
let mut sorted: Vec<_> = events.into_values().collect();
|
let mut sorted: Vec<_> = events.into_values().collect();
|
||||||
sorted.sort_by_key(|e| (e.ticker.clone(), e.date.clone()));
|
sorted.sort_by_key(|e| (e.ticker.clone(), e.date.clone()));
|
||||||
|
|
||||||
@@ -55,18 +66,26 @@ pub async fn save_optimized_events(events: HashMap<String, CompanyEvent>) -> any
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
let total_months = by_month.len();
|
||||||
for (month, list) in by_month {
|
for (month, list) in by_month {
|
||||||
let path = dir.join(format!("events_{}.json", month));
|
let path = dir.join(format!("events_{}.json", month));
|
||||||
fs::write(&path, serde_json::to_string_pretty(&list)?).await?;
|
fs::write(&path, serde_json::to_string_pretty(&list)?).await?;
|
||||||
|
logger::log_info(&format!("Corporate Storage: Saved {} events for month {}", list.len(), month)).await;
|
||||||
}
|
}
|
||||||
|
logger::log_info(&format!("Corporate Storage: Saved {} total events in {} month files", total_events, total_months)).await;
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
pub async fn save_changes(changes: &[CompanyEventChange]) -> anyhow::Result<()> {
|
pub async fn save_changes(paths: &DataPaths, changes: &[CompanyEventChange]) -> anyhow::Result<()> {
|
||||||
if changes.is_empty() { return Ok(()); }
|
if changes.is_empty() {
|
||||||
let dir = std::path::Path::new("corporate_event_changes");
|
logger::log_info("Corporate Storage: No changes to save").await;
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
let dir = paths.corporate_changes_dir();
|
||||||
fs::create_dir_all(dir).await?;
|
fs::create_dir_all(dir).await?;
|
||||||
|
|
||||||
|
logger::log_info(&format!("Corporate Storage: Saving {} changes", changes.len())).await;
|
||||||
|
|
||||||
let mut by_month: HashMap<String, Vec<CompanyEventChange>> = HashMap::new();
|
let mut by_month: HashMap<String, Vec<CompanyEventChange>> = HashMap::new();
|
||||||
for c in changes {
|
for c in changes {
|
||||||
if let Ok(d) = NaiveDate::parse_from_str(&c.date, "%Y-%m-%d") {
|
if let Ok(d) = NaiveDate::parse_from_str(&c.date, "%Y-%m-%d") {
|
||||||
@@ -81,14 +100,16 @@ pub async fn save_changes(changes: &[CompanyEventChange]) -> anyhow::Result<()>
|
|||||||
let s = fs::read_to_string(&path).await?;
|
let s = fs::read_to_string(&path).await?;
|
||||||
serde_json::from_str(&s).unwrap_or_default()
|
serde_json::from_str(&s).unwrap_or_default()
|
||||||
} else { vec![] };
|
} else { vec![] };
|
||||||
all.extend(list);
|
all.extend(list.clone());
|
||||||
fs::write(&path, serde_json::to_string_pretty(&all)?).await?;
|
fs::write(&path, serde_json::to_string_pretty(&all)?).await?;
|
||||||
|
logger::log_info(&format!("Corporate Storage: Saved {} changes for month {}", list.len(), month)).await;
|
||||||
}
|
}
|
||||||
|
logger::log_info("Corporate Storage: All changes saved successfully").await;
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
pub async fn save_prices_for_ticker(ticker: &str, timeframe: &str, mut prices: Vec<CompanyPrice>) -> anyhow::Result<()> {
|
pub async fn save_prices_for_ticker(paths: &DataPaths, ticker: &str, timeframe: &str, mut prices: Vec<CompanyPrice>) -> anyhow::Result<()> {
|
||||||
let base_dir = Path::new("corporate_prices");
|
let base_dir = paths.corporate_prices_dir();
|
||||||
let company_dir = base_dir.join(ticker.replace(".", "_"));
|
let company_dir = base_dir.join(ticker.replace(".", "_"));
|
||||||
let timeframe_dir = company_dir.join(timeframe);
|
let timeframe_dir = company_dir.join(timeframe);
|
||||||
|
|
||||||
@@ -102,35 +123,35 @@ pub async fn save_prices_for_ticker(ticker: &str, timeframe: &str, mut prices: V
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn get_company_dir(lei: &str) -> PathBuf {
|
pub fn get_company_dir(paths: &DataPaths, lei: &str) -> PathBuf {
|
||||||
PathBuf::from("corporate_prices").join(lei)
|
paths.corporate_prices_dir().join(lei)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub async fn ensure_company_dirs(isin: &str) -> anyhow::Result<()> {
|
pub async fn ensure_company_dirs(paths: &DataPaths, isin: &str) -> anyhow::Result<()> {
|
||||||
let base = get_company_dir(isin);
|
let base = get_company_dir(paths, isin);
|
||||||
let paths = [
|
let paths_to_create = [
|
||||||
base.clone(),
|
base.clone(),
|
||||||
base.join("5min"),
|
base.join("5min"),
|
||||||
base.join("daily"),
|
base.join("daily"),
|
||||||
base.join("aggregated").join("5min"),
|
base.join("aggregated").join("5min"),
|
||||||
base.join("aggregated").join("daily"),
|
base.join("aggregated").join("daily"),
|
||||||
];
|
];
|
||||||
for p in paths {
|
for p in paths_to_create {
|
||||||
fs::create_dir_all(&p).await?;
|
fs::create_dir_all(&p).await?;
|
||||||
}
|
}
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
pub async fn save_available_exchanges(isin: &str, exchanges: Vec<AvailableExchange>) -> anyhow::Result<()> {
|
pub async fn save_available_exchanges(paths: &DataPaths, isin: &str, exchanges: Vec<AvailableExchange>) -> anyhow::Result<()> {
|
||||||
let dir = get_company_dir(isin);
|
let dir = get_company_dir(paths, isin);
|
||||||
fs::create_dir_all(&dir).await?;
|
fs::create_dir_all(&dir).await?;
|
||||||
let path = dir.join("available_exchanges.json");
|
let path = dir.join("available_exchanges.json");
|
||||||
fs::write(&path, serde_json::to_string_pretty(&exchanges)?).await?;
|
fs::write(&path, serde_json::to_string_pretty(&exchanges)?).await?;
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
pub async fn load_available_exchanges(lei: &str) -> anyhow::Result<Vec<AvailableExchange>> {
|
pub async fn load_available_exchanges(paths: &DataPaths, lei: &str) -> anyhow::Result<Vec<AvailableExchange>> {
|
||||||
let path = get_company_dir(lei).join("available_exchanges.json");
|
let path = get_company_dir(paths, lei).join("available_exchanges.json");
|
||||||
if path.exists() {
|
if path.exists() {
|
||||||
let content = fs::read_to_string(&path).await?;
|
let content = fs::read_to_string(&path).await?;
|
||||||
Ok(serde_json::from_str(&content)?)
|
Ok(serde_json::from_str(&content)?)
|
||||||
@@ -140,13 +161,14 @@ pub async fn load_available_exchanges(lei: &str) -> anyhow::Result<Vec<Available
|
|||||||
}
|
}
|
||||||
|
|
||||||
pub async fn save_prices_by_source(
|
pub async fn save_prices_by_source(
|
||||||
|
paths: &DataPaths,
|
||||||
lei: &str,
|
lei: &str,
|
||||||
source_ticker: &str,
|
source_ticker: &str,
|
||||||
timeframe: &str,
|
timeframe: &str,
|
||||||
prices: Vec<CompanyPrice>,
|
prices: Vec<CompanyPrice>,
|
||||||
) -> anyhow::Result<()> {
|
) -> anyhow::Result<()> {
|
||||||
let source_safe = source_ticker.replace(".", "_").replace("/", "_");
|
let source_safe = source_ticker.replace(".", "_").replace("/", "_");
|
||||||
let dir = get_company_dir(lei).join(timeframe).join(&source_safe);
|
let dir = get_company_dir(paths, lei).join(timeframe).join(&source_safe);
|
||||||
fs::create_dir_all(&dir).await?;
|
fs::create_dir_all(&dir).await?;
|
||||||
let path = dir.join("prices.json");
|
let path = dir.join("prices.json");
|
||||||
let mut prices = prices;
|
let mut prices = prices;
|
||||||
@@ -156,14 +178,15 @@ pub async fn save_prices_by_source(
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Update available_exchanges.json with fetch results
|
/// Update available_exchanges.json with fetch results
|
||||||
pub async fn update_available_exchange(
|
/*pub async fn update_available_exchange(
|
||||||
|
paths: &DataPaths,
|
||||||
isin: &str,
|
isin: &str,
|
||||||
ticker: &str,
|
ticker: &str,
|
||||||
exchange_mic: &str,
|
exchange_mic: &str,
|
||||||
has_daily: bool,
|
has_daily: bool,
|
||||||
has_5min: bool,
|
has_5min: bool,
|
||||||
) -> anyhow::Result<()> {
|
) -> anyhow::Result<()> {
|
||||||
let mut exchanges = load_available_exchanges(isin).await?;
|
let mut exchanges = load_available_exchanges(paths, isin).await?;
|
||||||
|
|
||||||
if let Some(entry) = exchanges.iter_mut().find(|e| e.ticker == ticker) {
|
if let Some(entry) = exchanges.iter_mut().find(|e| e.ticker == ticker) {
|
||||||
// Update existing entry
|
// Update existing entry
|
||||||
@@ -181,39 +204,8 @@ pub async fn update_available_exchange(
|
|||||||
exchanges.push(new_entry);
|
exchanges.push(new_entry);
|
||||||
}
|
}
|
||||||
|
|
||||||
save_available_exchanges(isin, exchanges).await
|
save_available_exchanges(paths, isin, exchanges).await
|
||||||
}
|
}*/
|
||||||
|
|
||||||
/// Add a newly discovered exchange before fetching
|
|
||||||
///
|
|
||||||
/// # Arguments
|
|
||||||
/// * `isin` - The ISIN associated with the exchange.
|
|
||||||
/// * `figi_info` - The FigiInfo containing ticker, mic_code, and currency.
|
|
||||||
///
|
|
||||||
/// # Returns
|
|
||||||
/// Ok(()) on success.
|
|
||||||
///
|
|
||||||
/// # Errors
|
|
||||||
/// Returns an error if loading or saving available exchanges fails.
|
|
||||||
pub async fn add_discovered_exchange(
|
|
||||||
isin: &str,
|
|
||||||
figi_info: &FigiInfo,
|
|
||||||
) -> anyhow::Result<()> {
|
|
||||||
let mut exchanges = load_available_exchanges(isin).await?;
|
|
||||||
|
|
||||||
// Only add if not already present
|
|
||||||
if !exchanges.iter().any(|e| e.ticker == figi_info.ticker && e.exchange_mic == figi_info.mic_code) {
|
|
||||||
let new_entry = AvailableExchange::new(
|
|
||||||
figi_info.ticker.clone(),
|
|
||||||
figi_info.mic_code.clone(),
|
|
||||||
figi_info.currency.clone(),
|
|
||||||
);
|
|
||||||
exchanges.push(new_entry);
|
|
||||||
save_available_exchanges(isin, exchanges).await?;
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Infer currency from ticker suffix
|
/// Infer currency from ticker suffix
|
||||||
fn infer_currency_from_ticker(ticker: &str) -> String {
|
fn infer_currency_from_ticker(ticker: &str) -> String {
|
||||||
@@ -235,3 +227,41 @@ fn infer_currency_from_ticker(ticker: &str) -> String {
|
|||||||
|
|
||||||
"USD".to_string() // Default
|
"USD".to_string() // Default
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Saves companies data to a JSONL file.
|
||||||
|
///
|
||||||
|
/// # Arguments
|
||||||
|
/// * `paths` - Reference to DataPaths for directory management
|
||||||
|
/// * `companies` - HashMap of company names to their securities (ISIN, Ticker pairs)
|
||||||
|
///
|
||||||
|
/// # Errors
|
||||||
|
/// Returns an error if file operations or serialization fails.
|
||||||
|
pub async fn save_companies_to_jsonl(
|
||||||
|
paths: &DataPaths,
|
||||||
|
companies: &HashMap<String, HashMap<String, String>>,
|
||||||
|
) -> anyhow::Result<()> {
|
||||||
|
let file_path = paths.data_dir().join("companies.jsonl");
|
||||||
|
|
||||||
|
logger::log_info(&format!("Corporate Storage: Saving {} companies to JSONL", companies.len())).await;
|
||||||
|
|
||||||
|
// Create parent directory if it doesn't exist
|
||||||
|
if let Some(parent) = file_path.parent() {
|
||||||
|
tokio::fs::create_dir_all(parent).await?;
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut file = tokio::fs::File::create(&file_path).await?;
|
||||||
|
|
||||||
|
for (name, securities) in companies.iter() {
|
||||||
|
let line = serde_json::json!({
|
||||||
|
"name": name,
|
||||||
|
"securities": securities
|
||||||
|
});
|
||||||
|
file.write_all(line.to_string().as_bytes()).await?;
|
||||||
|
file.write_all(b"\n").await?;
|
||||||
|
}
|
||||||
|
|
||||||
|
let msg = format!("✓ Saved {} companies to {:?}", companies.len(), file_path);
|
||||||
|
println!("{}", msg);
|
||||||
|
logger::log_info(&msg).await;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
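Each line written by save_companies_to_jsonl is one JSON object holding a company name and its ISIN -> ticker map. The sketch below is editorial and shows only the shape; "Example Corp", "US0000000000" and "EXMPL" are made-up placeholders, not values from the data.

// Editorial sketch, not part of the diff: one companies.jsonl line, built the same way.
use std::collections::HashMap;

fn main() {
    let mut securities: HashMap<String, String> = HashMap::new();
    securities.insert("US0000000000".to_string(), "EXMPL".to_string());

    let line = serde_json::json!({ "name": "Example Corp", "securities": securities });
    // Emits one JSON object per line, e.g.:
    // {"name":"Example Corp","securities":{"US0000000000":"EXMPL"}}
    println!("{}", line);
}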
@@ -1,6 +1,5 @@
|
|||||||
use std::collections::HashMap;
|
|
||||||
|
|
||||||
// src/corporate/types.rs
|
// src/corporate/types.rs
|
||||||
|
use std::collections::HashMap;
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
|
|
||||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
|
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
|
||||||
@@ -53,24 +52,19 @@ pub struct FigiInfo {
|
|||||||
pub figi: String,
|
pub figi: String,
|
||||||
pub name: String,
|
pub name: String,
|
||||||
pub ticker: String,
|
pub ticker: String,
|
||||||
pub mic_code: String,
|
pub exch_code: String,
|
||||||
pub currency: String,
|
#[serde(rename = "compositeFIGI")]
|
||||||
pub compositeFIGI: String,
|
pub composite_figi: String,
|
||||||
pub securityType: String,
|
#[serde(rename = "securityType")]
|
||||||
pub marketSector: String,
|
pub security_type: String,
|
||||||
pub shareClassFIGI: String,
|
#[serde(rename = "marketSector")]
|
||||||
pub securityType2: String,
|
pub market_sector: String,
|
||||||
pub securityDescription: String,
|
#[serde(rename = "shareClassFIGI")]
|
||||||
}
|
pub share_class_figi: String,
|
||||||
|
#[serde(rename = "securityType2")]
|
||||||
/// Company Meta Data
|
pub security_type2: String,
|
||||||
/// # Attributes
|
#[serde(rename = "securityDescription")]
|
||||||
/// * lei: Structuring the companies by legal dependencies [LEI -> Vec<ISIN>]
|
pub security_description: String,
|
||||||
/// * figi: metadata with ISIN as key
|
|
||||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
||||||
pub struct CompanyMetadata {
|
|
||||||
pub lei: String,
|
|
||||||
pub figi: Option<Vec<FigiInfo>>,
|
|
||||||
}
|
}
|
||||||
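The #[serde(rename = "...")] attributes introduced above let the Rust fields move to snake_case while the JSON keys for those fields stay camelCase, so the renamed fields round-trip against existing cache files unchanged. A minimal editorial illustration of that pattern, using a made-up two-field struct rather than the real FigiInfo, with a placeholder FIGI value:

use serde::{Deserialize, Serialize};

// Made-up struct showing the rename pattern; not the real FigiInfo.
#[derive(Debug, PartialEq, Serialize, Deserialize)]
struct Renamed {
    #[serde(rename = "securityType")]
    security_type: String,
    #[serde(rename = "compositeFIGI")]
    composite_figi: String,
}

fn main() -> Result<(), serde_json::Error> {
    // "BBG000000000" is a placeholder, not a real FIGI.
    let json = r#"{"securityType":"Common Stock","compositeFIGI":"BBG000000000"}"#;
    let parsed: Renamed = serde_json::from_str(json)?;
    assert_eq!(parsed.security_type, "Common Stock");

    // Serializing writes the camelCase keys back out, unchanged.
    assert_eq!(serde_json::to_string(&parsed)?, json);
    Ok(())
}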
|
|
||||||
/// Company Info
|
/// Company Info
|
||||||
@@ -85,6 +79,15 @@ pub struct CompanyInfo{
|
|||||||
pub securities: HashMap<String, Vec<FigiInfo>>, // ISIN -> Vec<FigiInfo>
|
pub securities: HashMap<String, Vec<FigiInfo>>, // ISIN -> Vec<FigiInfo>
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Company Meta Data
|
||||||
|
/// # Attributes
|
||||||
|
/// * lei: Structuring the companies by legal dependencies [LEI -> Vec<ISIN>]
|
||||||
|
/// * figi: metadata with ISIN as key
|
||||||
|
/*#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
|
pub struct CompanyMetadata {
|
||||||
|
pub lei: String,
|
||||||
|
pub figi: Option<Vec<FigiInfo>>,
|
||||||
|
}*/
|
||||||
|
|
||||||
/// Warrant Info
|
/// Warrant Info
|
||||||
///
|
///
|
||||||
@@ -115,13 +118,13 @@ pub struct OptionInfo {
|
|||||||
pub options: HashMap<String, Vec<FigiInfo>>, // ISIN -> Vec<FigiInfo> (grouped by ISIN)
|
pub options: HashMap<String, Vec<FigiInfo>>, // ISIN -> Vec<FigiInfo> (grouped by ISIN)
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
/*#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
pub struct PrimaryInfo {
|
pub struct PrimaryInfo {
|
||||||
pub isin: String,
|
pub isin: String,
|
||||||
pub name: String,
|
pub name: String,
|
||||||
pub exchange_mic: String,
|
pub exchange_mic: String,
|
||||||
pub currency: String,
|
pub currency: String,
|
||||||
}
|
}*/
|
||||||
|
|
||||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
pub struct AvailableExchange {
|
pub struct AvailableExchange {
|
||||||
@@ -137,27 +140,3 @@ pub struct AvailableExchange {
|
|||||||
#[serde(default)]
|
#[serde(default)]
|
||||||
pub fetch_count: u32, // How many times successfully fetched
|
pub fetch_count: u32, // How many times successfully fetched
|
||||||
}
|
}
|
||||||
|
|
||||||
impl AvailableExchange {
|
|
||||||
pub fn new(ticker: String, exchange_mic: String, currency: String) -> Self {
|
|
||||||
Self {
|
|
||||||
exchange_mic,
|
|
||||||
ticker,
|
|
||||||
has_daily: false,
|
|
||||||
has_5min: false,
|
|
||||||
last_successful_fetch: None,
|
|
||||||
currency,
|
|
||||||
discovered_at: Some(chrono::Local::now().format("%Y-%m-%d").to_string()),
|
|
||||||
fetch_count: 0,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn record_success(&mut self, has_daily: bool, has_5min: bool) {
|
|
||||||
let today = chrono::Local::now().format("%Y-%m-%d").to_string();
|
|
||||||
|
|
||||||
self.has_daily |= has_daily;
|
|
||||||
self.has_5min |= has_5min;
|
|
||||||
self.last_successful_fetch = Some(today);
|
|
||||||
self.fetch_count += 1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -1,6 +1,8 @@
|
|||||||
// src/corporate/update.rs
|
// src/corporate/update.rs
|
||||||
use super::{scraper::*, storage::*, helpers::*, types::*, aggregation::*, openfigi::*};
|
use super::{scraper::*, storage::*, helpers::*, types::*, aggregation::*, openfigi::*};
|
||||||
use crate::config::Config;
|
use crate::config::Config;
|
||||||
|
use crate::util::directories::DataPaths;
|
||||||
|
use crate::util::logger;
|
||||||
use crate::scraper::webdriver::ChromeDriverPool;
|
use crate::scraper::webdriver::ChromeDriverPool;
|
||||||
|
|
||||||
use chrono::Local;
|
use chrono::Local;
|
||||||
@@ -24,50 +26,109 @@ use std::sync::Arc;
|
|||||||
/// # Errors
|
/// # Errors
|
||||||
/// Returns an error if any step in the update process fails.
|
/// Returns an error if any step in the update process fails.
|
||||||
pub async fn run_full_update(config: &Config, pool: &Arc<ChromeDriverPool>) -> anyhow::Result<()> {
|
pub async fn run_full_update(config: &Config, pool: &Arc<ChromeDriverPool>) -> anyhow::Result<()> {
|
||||||
println!("=== Starting LEI-based corporate full update ===");
|
let msg = "=== Starting LEI-based corporate full update ===";
|
||||||
|
println!("{}", msg);
|
||||||
|
logger::log_info(msg).await;
|
||||||
|
|
||||||
|
// Initialize paths
|
||||||
|
let paths = DataPaths::new(".")?;
|
||||||
|
|
||||||
// 1. Load fresh GLEIF ISIN ↔ LEI mapping
|
// 1. Load fresh GLEIF ISIN ↔ LEI mapping
|
||||||
|
logger::log_info("Corporate Update: Loading GLEIF ISIN ↔ LEI mapping...").await;
|
||||||
let lei_to_isins: HashMap<String, Vec<String>> = match load_isin_lei_csv().await {
|
let lei_to_isins: HashMap<String, Vec<String>> = match load_isin_lei_csv().await {
|
||||||
Ok(map) => map,
|
Ok(map) => {
|
||||||
|
let msg = format!("Corporate Update: Loaded GLEIF mapping with {} LEI entries", map.len());
|
||||||
|
println!("{}", msg);
|
||||||
|
logger::log_info(&msg).await;
|
||||||
|
map
|
||||||
|
}
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
eprintln!("Warning: Could not load GLEIF ISIN↔LEI mapping: {}", e);
|
let msg = format!("Corporate Update: Warning - Could not load GLEIF ISIN↔LEI mapping: {}", e);
|
||||||
|
eprintln!("{}", msg);
|
||||||
|
logger::log_warn(&msg).await;
|
||||||
HashMap::new()
|
HashMap::new()
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
// 2. Load OpenFIGI mapping value lists (cached)
|
// 2. Load OpenFIGI mapping value lists (cached)
|
||||||
|
logger::log_info("Corporate Update: Loading OpenFIGI type lists...").await;
|
||||||
if let Err(e) = load_figi_type_lists().await {
|
if let Err(e) = load_figi_type_lists().await {
|
||||||
eprintln!("Warning: Could not load OpenFIGI type lists: {}", e);
|
let msg = format!("Corporate Update: Warning - Could not load OpenFIGI type lists: {}", e);
|
||||||
|
eprintln!("{}", msg);
|
||||||
|
logger::log_warn(&msg).await;
|
||||||
}
|
}
|
||||||
|
logger::log_info("Corporate Update: OpenFIGI type lists loaded").await;
|
||||||
|
|
||||||
// 3. Build FIGI → LEI map
|
// 3. Build FIGI → LEI map
|
||||||
// # Attributes
|
logger::log_info("Corporate Update: Building FIGI → LEI map...").await;
|
||||||
// * lei: Structuring the companies by legal dependencies [LEI -> Vec<ISIN>]
|
let figi_to_lei:HashMap<String, Vec<FigiInfo>> = match build_lei_to_figi_infos(&lei_to_isins, None).await {
|
||||||
// * figi: metadata with ISIN as key
|
Ok(map) => {
|
||||||
let figi_to_lei:HashMap<String, Vec<FigiInfo>> = match build_lei_to_figi_infos(&lei_to_isins).await {
|
let msg = format!("Corporate Update: Built FIGI map with {} entries", map.len());
|
||||||
Ok(map) => map,
|
println!("{}", msg);
|
||||||
|
logger::log_info(&msg).await;
|
||||||
|
map
|
||||||
|
}
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
eprintln!("Warning: Could not build FIGI→LEI map: {}", e);
|
let msg = format!("Corporate Update: Warning - Could not build FIGI→LEI map: {}", e);
|
||||||
|
eprintln!("{}", msg);
|
||||||
|
logger::log_warn(&msg).await;
|
||||||
HashMap::new()
|
HashMap::new()
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
// 4. Load or build companies
|
// 4. Load or build companies
|
||||||
let mut companies = load_or_build_all_securities(&figi_to_lei).await?;
|
logger::log_info("Corporate Update: Loading/building company securities...").await;
|
||||||
println!("Processing {} companies", companies.0.len());
|
let securities = load_or_build_all_securities(&figi_to_lei).await?;
|
||||||
|
let msg = format!("Corporate Update: Processing {} companies", securities.0.len());
|
||||||
|
println!("{}", msg);
|
||||||
|
logger::log_info(&msg).await;
|
||||||
|
|
||||||
|
// HashMap<Name, HashMap<ISIN, Ticker>> - unique pairs only
|
||||||
|
let companies: HashMap<String, HashMap<String, String>> = securities.0
|
||||||
|
.iter()
|
||||||
|
.fold(HashMap::new(), |mut acc, security| {
|
||||||
|
let mut isin_ticker_pairs: HashMap<String, String> = HashMap::new();
|
||||||
|
|
||||||
|
// Collect all unique ISIN-Ticker pairs
|
||||||
|
for figi_infos in security.1.securities.values() {
|
||||||
|
for figi_info in figi_infos {
|
||||||
|
if !figi_info.isin.is_empty() && !figi_info.ticker.is_empty() {
|
||||||
|
isin_ticker_pairs.insert(figi_info.isin.clone(), figi_info.ticker.clone());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Only add if there are pairs
|
||||||
|
if !isin_ticker_pairs.is_empty() {
|
||||||
|
acc.insert(security.1.name.clone(), isin_ticker_pairs);
|
||||||
|
}
|
||||||
|
acc
|
||||||
|
});
|
||||||
|
|
||||||
|
logger::log_info(&format!("Corporate Update: Saving {} companies to JSONL", companies.len())).await;
|
||||||
|
save_companies_to_jsonl(&paths, &companies).await.expect("Failed to save companies List.");
|
||||||
|
logger::log_info("Corporate Update: Companies saved successfully").await;
|
||||||
|
|
||||||
// 5. Load existing earnings events (for change detection)
|
// 5. Load existing earnings events (for change detection)
|
||||||
let today = Local::now().format("%Y-%m-%d").to_string();
|
logger::log_info("Corporate Update: Loading existing events...").await;
|
||||||
let mut existing_events = match load_existing_events().await {
|
let existing_events = match load_existing_events(&paths).await {
|
||||||
Ok(events) => events,
|
Ok(events) => {
|
||||||
|
let msg = format!("Corporate Update: Loaded {} existing events", events.len());
|
||||||
|
println!("{}", msg);
|
||||||
|
logger::log_info(&msg).await;
|
||||||
|
events
|
||||||
|
}
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
eprintln!("Warning: Could not load existing events: {}", e);
|
let msg = format!("Corporate Update: Warning - Could not load existing events: {}", e);
|
||||||
|
eprintln!("{}", msg);
|
||||||
|
logger::log_warn(&msg).await;
|
||||||
HashMap::new()
|
HashMap::new()
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
// 5. Use the provided pool (no need to create a new one)
|
// 5. Use the provided pool (no need to create a new one)
|
||||||
let pool_size = pool.get_number_of_instances(); // Use the size from the shared pool
|
let pool_size = pool.get_number_of_instances(); // Use the size from the shared pool
|
||||||
|
logger::log_info(&format!("Corporate Update: Using pool size: {}", pool_size)).await;
|
||||||
|
|
||||||
// Process companies in parallel using the shared pool
|
// Process companies in parallel using the shared pool
|
||||||
/*let results: Vec<_> = stream::iter(companies.into_iter())
|
/*let results: Vec<_> = stream::iter(companies.into_iter())
|
||||||
@@ -88,10 +149,14 @@ pub async fn run_full_update(config: &Config, pool: &Arc<ChromeDriverPool>) -> a
|
|||||||
}
|
}
|
||||||
}*/
|
}*/
|
||||||
|
|
||||||
save_optimized_events(existing_events).await?;
|
logger::log_info(&format!("Corporate Update: Saving {} events to optimized storage", existing_events.len())).await;
|
||||||
|
save_optimized_events(&paths, existing_events).await?;
|
||||||
|
logger::log_info("Corporate Update: Events saved successfully").await;
|
||||||
//save_changes(&all_changes).await?;
|
//save_changes(&all_changes).await?;
|
||||||
|
|
||||||
//println!("Corporate update complete — {} changes detected", all_changes.len());
|
let msg = "✓ Corporate update complete";
|
||||||
|
println!("{}", msg);
|
||||||
|
logger::log_info(msg).await;
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -7,39 +7,10 @@ const EXTRACTION_JS: &str = include_str!("extraction_script.js");

pub async fn goto_and_prepare(client: &Client) -> anyhow::Result<()> {
    client.goto("https://www.finanzen.net/termine/wirtschaftsdaten/").await?;
-   //dismiss_overlays(client).await?;
-
-   /*if let Ok(tab) = client.find(fantoccini::Locator::Css(r#"div[data-sg-tab-item="teletrader-dates-three-stars"]"#)).await {
-       tab.click().await?;
-       println!("High importance tab selected");
-       sleep(Duration::from_secs(2)).await;
-   }*/
    Ok(())
}

-/*pub async fn dismiss_overlays(client: &Client) -> anyhow::Result<()> {
-    for _ in 0..10 {
-        let removed: bool = client
-            .execute(
-                r#"(() => {
-                    const iframe = document.querySelector('iframe[title="Contentpass First Layer"]');
-                    if (iframe && iframe.parentNode) {
-                        iframe.parentNode.removeChild(iframe);
-                        return true;
-                    }
-                    return false;
-                })()"#,
-                vec![],
-            )
-            .await?
-            .as_bool()
-            .unwrap_or(false);
-        if removed { break; }
-        sleep(Duration::from_millis(500)).await;
-    }
-    Ok(())
-}*/

pub async fn set_date_range(client: &Client, start: &str, end: &str) -> anyhow::Result<()> {
    let script = format!(
        r#"
@@ -1,12 +1,14 @@

// src/economic/storage.rs
use super::types::*;
use super::helpers::*;
+use crate::util::directories::DataPaths;
+use crate::util::logger;
use tokio::fs;
use chrono::{NaiveDate, Datelike};
use std::collections::HashMap;

-pub async fn scan_existing_chunks() -> anyhow::Result<Vec<ChunkInfo>> {
+pub async fn scan_existing_chunks(paths: &DataPaths) -> anyhow::Result<Vec<ChunkInfo>> {
-   let dir = std::path::Path::new("data/economic/events");
+   let dir = paths.economic_events_dir();
    let mut chunks = Vec::new();

    if dir.exists() {

@@ -29,6 +31,7 @@ pub async fn scan_existing_chunks() -> anyhow::Result<Vec<ChunkInfo>> {
        }
    }
    chunks.sort_by_key(|c| c.start_date.clone());
+   logger::log_info(&format!("Economic Storage: Scanned {} event chunks", chunks.len())).await;
    Ok(chunks)
}

@@ -41,25 +44,28 @@ pub async fn load_existing_events(chunks: &[ChunkInfo]) -> anyhow::Result<HashMa
            map.insert(event_key(&e), e);
        }
    }
+   logger::log_info(&format!("Economic Storage: Loaded {} events from {} chunks", map.len(), chunks.len())).await;
    Ok(map)
}

-pub async fn save_optimized_chunks(events: HashMap<String, EconomicEvent>) -> anyhow::Result<()> {
+pub async fn save_optimized_chunks(paths: &DataPaths, events: HashMap<String, EconomicEvent>) -> anyhow::Result<()> {
-   let dir = std::path::Path::new("data/economic/events");
+   let dir = paths.economic_events_dir();
    fs::create_dir_all(dir).await?;

    // Delete all old chunk files to prevent duplicates and overlaps
-   println!("Removing old chunks...");
+   logger::log_info("Economic Storage: Removing old chunk files...").await;

    let mut entries = fs::read_dir(dir).await?;
+   let mut removed_count = 0;
    while let Some(entry) = entries.next_entry().await? {
        let path = entry.path();
        if let Some(name) = path.file_name().and_then(|n| n.to_str()) {
            if name.starts_with("chunk_") && path.extension().map(|e| e == "json").unwrap_or(false) {
                fs::remove_file(&path).await?;
+               removed_count += 1;
            }
        }
    }
+   logger::log_info(&format!("Economic Storage: Removed {} old chunk files", removed_count)).await;

    let mut sorted: Vec<_> = events.into_values().collect();
    sorted.sort_by_key(|e| e.date.clone());

@@ -77,6 +83,7 @@ pub async fn save_optimized_chunks(events: HashMap<String, EconomicEvent>) -> an
    if !chunk.is_empty() {
        save_chunk(&chunk, dir).await?;
    }
+   logger::log_info(&format!("Economic Storage: Saved all event chunks to {:?}", dir)).await;
    Ok(())
}

@@ -85,14 +92,20 @@ async fn save_chunk(events: &[EconomicEvent], dir: &std::path::Path) -> anyhow::
    let end = events.iter().map(|e| &e.date).max().unwrap().clone();
    let path = dir.join(format!("chunk_{}_{}.json", start, end));
    fs::write(&path, serde_json::to_string_pretty(events)?).await?;
+   logger::log_info(&format!("Economic Storage: Saved chunk {} - {} ({} events)", start, end, events.len())).await;
    Ok(())
}

-pub async fn save_changes(changes: &[EventChange]) -> anyhow::Result<()> {
+pub async fn save_changes(paths: &DataPaths, changes: &[EventChange]) -> anyhow::Result<()> {
-   if changes.is_empty() { return Ok(()); }
-   let dir = std::path::Path::new("economic_event_changes");
+   if changes.is_empty() {
+       logger::log_info("Economic Storage: No changes to save").await;
+       return Ok(());
+   }
+   let dir = paths.economic_changes_dir();
    fs::create_dir_all(dir).await?;

+   logger::log_info(&format!("Economic Storage: Saving {} changes to {:?}", changes.len(), dir)).await;

    let mut by_month: HashMap<String, Vec<EventChange>> = HashMap::new();
    for c in changes {
        if let Ok(d) = NaiveDate::parse_from_str(&c.date, "%Y-%m-%d") {

@@ -107,8 +120,10 @@ pub async fn save_changes(changes: &[EventChange]) -> anyhow::Result<()> {
            let s = fs::read_to_string(&path).await?;
            serde_json::from_str(&s).unwrap_or_default()
        } else { vec![] };
-       all.extend(list);
+       all.extend(list.clone());
        fs::write(&path, serde_json::to_string_pretty(&all)?).await?;
+       logger::log_info(&format!("Economic Storage: Saved {} changes for month {}", list.len(), month)).await;
    }
+   logger::log_info("Economic Storage: All changes saved successfully").await;
    Ok(())
}
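The change log is bucketed by calendar month before being merged into the per-month JSON files above. The exact bucket key sits between the hunks shown here, so the following is only a minimal standalone sketch of that grouping step, assuming a "YYYY-MM" key and a stripped-down stand-in for the EventChange type (only the date field, which is all the shown code touches):

    use chrono::{Datelike, NaiveDate};
    use std::collections::HashMap;

    // Simplified stand-in for super::types::EventChange; only `date` is needed here.
    struct EventChange {
        date: String,
    }

    // Group changes by month so each month can be appended to its own JSON file.
    fn group_by_month(changes: &[EventChange]) -> HashMap<String, Vec<&EventChange>> {
        let mut by_month: HashMap<String, Vec<&EventChange>> = HashMap::new();
        for c in changes {
            if let Ok(d) = NaiveDate::parse_from_str(&c.date, "%Y-%m-%d") {
                // Assumed "YYYY-MM" bucket key; the real key format is not visible in the hunk.
                let key = format!("{}-{:02}", d.year(), d.month());
                by_month.entry(key).or_default().push(c);
            }
        }
        by_month
    }

Changes with unparseable dates are silently skipped, matching the `if let Ok(d)` guard in the diff.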
@@ -1,7 +1,6 @@

// src/economic/update.rs
use super::{scraper::*, storage::*, helpers::*, types::*};
-use crate::{config::Config, scraper::webdriver::ScrapeTask};
+use crate::{config::Config, scraper::webdriver::{ScrapeTask, ChromeDriverPool}, util::directories::DataPaths, util::logger};
-use crate::scraper::webdriver::ChromeDriverPool;
use chrono::{Local};
use std::sync::Arc;

@@ -14,38 +13,69 @@ use std::sync::Arc;
/// # Errors
/// Returns an error if scraping, loading, or saving fails.
pub async fn run_full_update(config: &Config, pool: &Arc<ChromeDriverPool>) -> anyhow::Result<()> {
+   let paths = DataPaths::new(".")?;
+   logger::log_info("Economic Update: Initializing...").await;

    let today_str = chrono::Local::now().date_naive().format("%Y-%m-%d").to_string();
    let end_date = config.target_end_date();

-   let chunks = scan_existing_chunks().await?;
+   logger::log_info(&format!("Economic Update: Scanning existing chunks from {:?}", paths.economic_events_dir())).await;
+   let chunks = scan_existing_chunks(&paths).await?;
    let mut events = load_existing_events(&chunks).await?;
-   println!("Loaded {} events from {} chunks", events.len(), chunks.len());
+   let msg = format!("Economic Update: Loaded {} events from {} chunks", events.len(), chunks.len());
+   println!("{}", msg);
+   logger::log_info(&msg).await;

    let start_date = if events.is_empty() {
+       logger::log_warn("Economic Update: No existing events found, starting from config date").await;
        config.economic_start_date.clone()
    } else if events.values().any(|e| e.date >= today_str) {
+       logger::log_info("Economic Update: Events exist for today, starting from today").await;
        today_str.clone()
    } else {
-       events.values()
+       let next = events.values()
            .filter_map(|e| chrono::NaiveDate::parse_from_str(&e.date, "%Y-%m-%d").ok())
            .max()
            .and_then(|d| d.succ_opt())
            .map(|d| d.format("%Y-%m-%d").to_string())
-           .unwrap_or(today_str.clone())
+           .unwrap_or(today_str.clone());
+       logger::log_info(&format!("Economic Update: Resuming from: {}", next)).await;
+       next
    };

-   println!("Scraping economic events: {} → {}", start_date, end_date);
+   let msg = format!("Economic Update: Scraping events from {} → {}", start_date, end_date);
+   println!("{}", msg);
+   logger::log_info(&msg).await;

    // Pass the pool to the scraping function
    let new_events_all = scrape_all_economic_events(&start_date, &end_date, pool).await?;

+   let msg = format!("Economic Update: Scraped {} new events", new_events_all.len());
+   println!("{}", msg);
+   logger::log_info(&msg).await;

    // Process all at once or in batches
    let result = process_batch(&new_events_all, &mut events, &today_str);
    let total_changes = result.changes.len();
-   save_changes(&result.changes).await?;
-   save_optimized_chunks(events).await?;
-   println!("Economic update complete — {} changes detected", total_changes);
+   let msg = format!("Economic Update: Detected {} changes", total_changes);
+   println!("{}", msg);
+   logger::log_info(&msg).await;

+   if total_changes > 0 {
+       logger::log_info(&format!("Economic Update: Saving {} changes to log", total_changes)).await;
+       save_changes(&paths, &result.changes).await?;
+       logger::log_info("Economic Update: Changes saved successfully").await;
+   }

+   logger::log_info(&format!("Economic Update: Saving {} total events to chunks", events.len())).await;
+   save_optimized_chunks(&paths, events).await?;

+   let msg = format!("✓ Economic update complete — {} changes detected", total_changes);
+   println!("{}", msg);
+   logger::log_info(&msg).await;
    Ok(())
}
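The start-date selection in the hunk above reduces to a simple rule: resume one day after the newest stored event, or fall back to today when nothing usable is stored. A minimal standalone sketch of that rule, with the event map simplified to a slice of date strings:

    use chrono::NaiveDate;

    // Pick the next scrape start date: the day after the newest stored event,
    // or `today` if no date parses.
    fn resume_date(stored_event_dates: &[&str], today: &str) -> String {
        stored_event_dates
            .iter()
            .filter_map(|d| NaiveDate::parse_from_str(d, "%Y-%m-%d").ok())
            .max()
            .and_then(|d| d.succ_opt())
            .map(|d| d.format("%Y-%m-%d").to_string())
            .unwrap_or_else(|| today.to_string())
    }

    // resume_date(&["2024-05-30", "2024-06-03"], "2024-06-10") == "2024-06-04"

The real function additionally short-circuits to the configured start date when the store is empty and to today when events at or past today already exist, as shown in the diff.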
8	src/lib.rs	Normal file
@@ -0,0 +1,8 @@

// src/lib.rs
//! Event Backtest Engine - Core Library
//!
//! Exposes all public modules for use in examples and tests

pub mod config;
pub mod scraper;
pub mod util;
46	src/main.rs
@@ -1,18 +1,21 @@

// src/main.rs
-mod economic;
-mod corporate;
mod config;
+mod corporate;
+mod economic;
mod util;
mod scraper;

use anyhow::Result;
use config::Config;
use scraper::webdriver::ChromeDriverPool;
+use util::directories::DataPaths;
+use util::{logger, opnv};
use std::sync::Arc;

/// The entry point of the application.
///
/// This function loads the configuration, initializes a shared ChromeDriver pool,
+/// fetches the latest VPNBook OpenVPN configurations if VPN rotation is enabled,
/// and sequentially runs the full updates for corporate and economic data.
/// Sequential execution helps prevent resource exhaustion from concurrent
/// chromedriver instances and avoids spamming the target websites with too many requests.

@@ -20,8 +23,8 @@ use std::sync::Arc;
/// # Errors
///
/// Returns an error if configuration loading fails, pool initialization fails,
-/// or if either update function encounters an issue (e.g., network errors,
-/// scraping failures, or chromedriver spawn failures like "program not found").
+/// VPN fetching fails (if enabled), or if either update function encounters an issue
+/// (e.g., network errors, scraping failures, or chromedriver spawn failures like "program not found").
#[tokio::main]
async fn main() -> Result<()> {
    let config = Config::load().map_err(|err| {

@@ -29,15 +32,48 @@ async fn main() -> Result<()> {
        err
    })?;

+   // Initialize paths
+   let paths = DataPaths::new(".")?;

+   // Initialize logger
+   logger::init_debug_logger(paths.logs_dir()).await.map_err(|e| {
+       anyhow::anyhow!("Logger initialization failed: {}", e)
+   })?;

+   logger::log_info("=== Application started ===").await;
+   logger::log_info(&format!("Config: economic_start_date={}, corporate_start_date={}, lookahead_months={}, max_parallel_instances={}, enable_vpn_rotation={}",
+       config.economic_start_date, config.corporate_start_date, config.economic_lookahead_months, config.max_parallel_instances, config.enable_vpn_rotation)).await;

    // Initialize the shared ChromeDriver pool once
-   let pool_size = config.max_parallel_tasks;
+   let pool_size = config.max_parallel_instances;
+   logger::log_info(&format!("Initializing ChromeDriver pool with size: {}", pool_size)).await;
    let pool = Arc::new(ChromeDriverPool::new(pool_size).await?);
+   logger::log_info("✓ ChromeDriver pool initialized successfully").await;

+   // Fetch VPNBook configs if VPN rotation is enabled
+   if config.enable_vpn_rotation {
+       logger::log_info("--- Fetching latest VPNBook OpenVPN configurations ---").await;
+       let (username, password, files) =
+           util::opnv::fetch_vpnbook_configs(&pool, paths.cache_dir()).await?;
+       logger::log_info(&format!("Fetched VPN username: {}, password: {}", username, password)).await;
+       for file in &files {
+           logger::log_info(&format!("Extracted OVPN: {:?}", file)).await;
+       }
+       // Optionally, store username/password for rotation use (e.g., in a file or global state)
+       // For now, just log them; extend as needed for rotation integration
+   }

    // Run economic update first, passing the shared pool
+   logger::log_info("--- Starting economic data update ---").await;
    economic::run_full_update(&config, &pool).await?;
+   logger::log_info("✓ Economic data update completed").await;

    // Then run corporate update, passing the shared pool
+   logger::log_info("--- Starting corporate data update ---").await;
    corporate::run_full_update(&config, &pool).await?;
+   logger::log_info("✓ Corporate data update completed").await;

+   logger::log_info("=== Application completed successfully ===").await;
    Ok(())
}
@@ -3,13 +3,13 @@

use anyhow::{anyhow, Context, Result};
use fantoccini::{Client, ClientBuilder};
use serde_json::{Map, Value};
+use std::pin::Pin;
use std::process::Stdio;
use std::sync::Arc;
use tokio::io::{AsyncBufReadExt, BufReader};
use tokio::process::{Child, Command};
use tokio::sync::{Mutex, Semaphore};
-use tokio::time::{Duration, sleep, timeout};
+use tokio::time::{sleep, timeout, Duration};
-use std::pin::Pin;

/// Manages a pool of ChromeDriver instances for parallel scraping.
///

@@ -19,6 +19,7 @@ use std::pin::Pin;
pub struct ChromeDriverPool {
    instances: Vec<Arc<Mutex<ChromeInstance>>>,
    semaphore: Arc<Semaphore>,
+   tasks_per_instance: usize,
}

impl ChromeDriverPool {

@@ -29,7 +30,10 @@ impl ChromeDriverPool {
    pub async fn new(pool_size: usize) -> Result<Self> {
        let mut instances = Vec::with_capacity(pool_size);

-       println!("Initializing ChromeDriver pool with {} instances...", pool_size);
+       println!(
+           "Initializing ChromeDriver pool with {} instances...",
+           pool_size
+       );

        for i in 0..pool_size {
            match ChromeInstance::new().await {

@@ -49,6 +53,7 @@ impl ChromeDriverPool {
        Ok(Self {
            instances,
            semaphore: Arc::new(Semaphore::new(pool_size)),
+           tasks_per_instance: 0,
        })
    }

@@ -60,7 +65,10 @@ impl ChromeDriverPool {
        Fut: std::future::Future<Output = Result<T>> + Send + 'static,
    {
        // Acquire semaphore permit
-       let _permit = self.semaphore.acquire().await
+       let _permit = self
+           .semaphore
+           .acquire()
+           .await
            .map_err(|_| anyhow!("Semaphore closed"))?;

        // Find an available instance (round-robin or first available)

@@ -82,7 +90,7 @@ impl ChromeDriverPool {
        Ok(result)
    }

-   pub fn get_number_of_instances (&self) -> usize {
+   pub fn get_number_of_instances(&self) -> usize {
        self.instances.len()
    }
}

@@ -94,7 +102,7 @@ pub struct ChromeInstance {
}

impl ChromeInstance {
    /// Creates a new ChromeInstance by spawning chromedriver with random port.
    ///
    /// This spawns `chromedriver --port=0` to avoid port conflicts, reads stdout to extract
    /// the listening address, and waits for the success message. If timeout occurs or

@@ -115,13 +123,11 @@ impl ChromeInstance {
            .spawn()
            .context("Failed to spawn chromedriver. Ensure it's installed and in PATH.")?;

-       let mut stdout = BufReader::new(
-           process.stdout.take().context("Failed to capture stdout")?
-       ).lines();
+       let mut stdout =
+           BufReader::new(process.stdout.take().context("Failed to capture stdout")?).lines();

-       let mut stderr = BufReader::new(
-           process.stderr.take().context("Failed to capture stderr")?
-       ).lines();
+       let mut stderr =
+           BufReader::new(process.stderr.take().context("Failed to capture stderr")?).lines();

        let start_time = std::time::Instant::now();
        let mut address: Option<String> = None;

@@ -136,9 +142,7 @@ impl ChromeInstance {

        // Wait for address and success (up to 30s)
        while start_time.elapsed() < Duration::from_secs(30) {
-           if let Ok(Ok(Some(line))) =
-               timeout(Duration::from_secs(1), stdout.next_line()).await
-           {
+           if let Ok(Ok(Some(line))) = timeout(Duration::from_secs(1), stdout.next_line()).await {
                if let Some(addr) = parse_chromedriver_address(&line) {
                    address = Some(addr.to_string());
                }

@@ -242,7 +246,9 @@ impl Drop for ChromeInstance {
/// For backwards compatibility with existing code.
pub struct ScrapeTask<T> {
    url: String,
-   parse: Box<dyn FnOnce(Client) -> Pin<Box<dyn std::future::Future<Output = Result<T>> + Send>> + Send>,
+   parse: Box<
+       dyn FnOnce(Client) -> Pin<Box<dyn std::future::Future<Output = Result<T>> + Send>> + Send,
+   >,
}

impl<T: Send + 'static> ScrapeTask<T> {

@@ -262,8 +268,7 @@ impl<T: Send + 'static> ScrapeTask<T> {
        let url = self.url;
        let parse = self.parse;

-       pool.execute(url, move |client| async move {
-           (parse)(client).await
-       }).await
+       pool.execute(url, move |client| async move { (parse)(client).await })
+           .await
    }
}
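ScrapeTask plus execute_with_pool is the compatibility path the opnv module below relies on: the pool navigates a pooled Client to the task URL and then hands it to the closure. A minimal usage sketch of that flow, assuming chromedriver is installed and on PATH, that ScrapeTask::new accepts the same closure shape used by fetch_vpnbook_configs, and with https://example.com as a placeholder URL:

    use anyhow::Result;
    use event_backtest_engine::scraper::webdriver::{ChromeDriverPool, ScrapeTask};

    #[tokio::main]
    async fn main() -> Result<()> {
        // One instance is enough here; the pool's semaphore caps concurrency anyway.
        let pool = ChromeDriverPool::new(1).await?;

        // The closure receives a Client that has already been navigated to the URL.
        let task = ScrapeTask::new("https://example.com".to_string(), |client: fantoccini::Client| async move {
            // Grab something trivial from the page; fetch_vpnbook_configs does real
            // element lookups at this point instead.
            let url = client.current_url().await?;
            Ok(url.to_string())
        });

        let landed_on = task.execute_with_pool(&pool).await?;
        println!("Scraped: {}", landed_on);
        Ok(())
    }

The semaphore in execute() is what keeps MAX_PARALLEL_INSTANCES meaningful: callers can submit as many tasks as they like, but at most pool_size of them hold a browser at once.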
22	src/util.rs
@@ -1,22 +0,0 @@

-// src/util.rs (or put it directly in main.rs if you prefer)
-use tokio::fs;
-use std::path::Path;
-
-/// Create the required data folders if they do not exist yet.
-pub async fn _ensure_data_dirs() -> anyhow::Result<()> {
-    let dirs = [
-        "economic_events",
-        "economic_event_changes",
-        "corporate_events",
-        "corporate_prices",
-        "data",
-    ];
-    for dir in dirs {
-        let path = Path::new(dir);
-        if !path.exists() {
-            tokio::fs::create_dir_all(path).await?;
-            println!("Created directory: {dir}");
-        }
-    }
-    Ok(())
-}
171	src/util/directories.rs	Normal file
@@ -0,0 +1,171 @@

use std::path::{Path, PathBuf};
use std::fs;

use crate::util::opnv;

/// Central configuration for all data paths
pub struct DataPaths {
    base_dir: PathBuf,
    data_dir: PathBuf,
    cache_dir: PathBuf,
    logs_dir: PathBuf,
    // Cache data subdirectories
    cache_gleif_dir: PathBuf,
    cache_openfigi_dir: PathBuf,
    cache_gleif_openfigi_map_dir: PathBuf,
    cache_openvpn_dir: PathBuf,
    // Economic data subdirectories
    economic_events_dir: PathBuf,
    economic_changes_dir: PathBuf,
    // Corporate data subdirectories
    corporate_events_dir: PathBuf,
    corporate_changes_dir: PathBuf,
    corporate_prices_dir: PathBuf,
}

impl DataPaths {
    /// Initialize paths from a base directory
    pub fn new(base_dir: impl AsRef<Path>) -> std::io::Result<Self> {
        let base_dir = base_dir.as_ref().to_path_buf();

        let data_dir = base_dir.join("data");
        let cache_dir = base_dir.join("cache");
        let logs_dir = base_dir.join("logs");

        // Cache subdirectories
        let cache_gleif_dir = cache_dir.join("gleif");
        let cache_openfigi_dir = cache_dir.join("openfigi");
        let cache_gleif_openfigi_map_dir = cache_dir.join("glei_openfigi");
        let cache_openvpn_dir = cache_dir.join("openvpn");

        // Economic subdirectories
        let economic_events_dir = data_dir.join("economic").join("events");
        let economic_changes_dir = economic_events_dir.join("changes");

        // Corporate subdirectories
        let corporate_dir = data_dir.join("corporate");
        let corporate_events_dir = corporate_dir.join("events");
        let corporate_changes_dir = corporate_events_dir.join("changes");
        let corporate_prices_dir = corporate_dir.join("prices");

        // Create all directories if they don't exist
        fs::create_dir_all(&data_dir)?;
        fs::create_dir_all(&cache_dir)?;
        fs::create_dir_all(&logs_dir)?;
        fs::create_dir_all(&cache_gleif_dir)?;
        fs::create_dir_all(&cache_openfigi_dir)?;
        fs::create_dir_all(&cache_gleif_openfigi_map_dir)?;
        fs::create_dir_all(&cache_openvpn_dir)?;
        fs::create_dir_all(&economic_events_dir)?;
        fs::create_dir_all(&economic_changes_dir)?;
        fs::create_dir_all(&corporate_events_dir)?;
        fs::create_dir_all(&corporate_changes_dir)?;
        fs::create_dir_all(&corporate_prices_dir)?;

        Ok(Self {
            base_dir,
            data_dir,
            cache_dir,
            logs_dir,
            cache_gleif_dir,
            cache_openfigi_dir,
            cache_gleif_openfigi_map_dir,
            cache_openvpn_dir,
            economic_events_dir,
            economic_changes_dir,
            corporate_events_dir,
            corporate_changes_dir,
            corporate_prices_dir,
        })
    }

    pub fn base_dir(&self) -> &Path {
        &self.base_dir
    }

    pub fn data_dir(&self) -> &Path {
        &self.data_dir
    }

    pub fn cache_dir(&self) -> &Path {
        &self.cache_dir
    }

    pub fn logs_dir(&self) -> &Path {
        &self.logs_dir
    }

    pub fn cache_gleif_dir(&self) -> &Path {
        &self.cache_gleif_dir
    }

    pub fn cache_openfigi_dir(&self) -> &Path {
        &self.cache_openfigi_dir
    }

    pub fn cache_gleif_openfigi_map_dir(&self) -> &Path {
        &self.cache_gleif_openfigi_map_dir
    }

    pub fn cache_openvpn_dir(&self) -> &Path {
        &self.cache_openvpn_dir
    }

    /// Get the economic events directory
    pub fn economic_events_dir(&self) -> &Path {
        &self.economic_events_dir
    }

    /// Get the economic changes directory
    pub fn economic_changes_dir(&self) -> &Path {
        &self.economic_changes_dir
    }

    /// Get the corporate events directory
    pub fn corporate_events_dir(&self) -> &Path {
        &self.corporate_events_dir
    }

    /// Get the corporate changes directory
    pub fn corporate_changes_dir(&self) -> &Path {
        &self.corporate_changes_dir
    }

    /// Get the corporate prices directory
    pub fn corporate_prices_dir(&self) -> &Path {
        &self.corporate_prices_dir
    }

    /// Get a specific file path within data directory
    pub fn data_file(&self, filename: &str) -> PathBuf {
        self.data_dir.join(filename)
    }

    /// Get a specific file path within cache directory
    pub fn cache_file(&self, filename: &str) -> PathBuf {
        self.cache_dir.join(filename)
    }

    /// Get a specific file path within logs directory
    pub fn log_file(&self, filename: &str) -> PathBuf {
        self.logs_dir.join(filename)
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_paths_creation() {
        let paths = DataPaths::new("./test_base").unwrap();
        assert!(paths.data_dir().exists());
        assert!(paths.cache_dir().exists());
        assert!(paths.logs_dir().exists());
        assert!(paths.economic_events_dir().exists());
        assert!(paths.economic_changes_dir().exists());
        assert!(paths.corporate_events_dir().exists());
        assert!(paths.corporate_changes_dir().exists());
        assert!(paths.corporate_prices_dir().exists());
    }
}
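DataPaths centralises every directory the hard-coded paths used to scatter across the modules: constructing it creates the whole tree as a side effect, and callers only ever read the getters afterwards. A minimal usage sketch (the file names passed to cache_file and log_file are illustrative, not fixed by the code above):

    use event_backtest_engine::util::directories::DataPaths;

    fn main() -> std::io::Result<()> {
        // Creates ./data, ./cache, ./logs and all subdirectories on first use.
        let paths = DataPaths::new(".")?;

        println!("event chunks live in  {:?}", paths.economic_events_dir());
        println!("change logs live in   {:?}", paths.economic_changes_dir());

        // Helpers for individual files inside the top-level directories.
        let cached = paths.cache_file("settings.json");
        let run_log = paths.log_file("latest.log");
        println!("{:?} / {:?}", cached, run_log);
        Ok(())
    }

Because new() is infallible apart from I/O errors, each caller (main.rs, the update modules, opnv.rs) simply constructs its own DataPaths from "." rather than threading a shared instance everywhere.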
78	src/util/logger.rs	Normal file
@@ -0,0 +1,78 @@

// src/util/logger.rs
use chrono::Local;
use once_cell::sync::Lazy;
use tokio::sync::Mutex;
use std::fs::{self, OpenOptions};
use std::io::Write;
use std::path::PathBuf;

static LOGGER: Lazy<Mutex<Option<DebugLogger>>> = Lazy::new(|| Mutex::new(None));

pub struct DebugLogger {
    file: std::fs::File,
    log_path: PathBuf,
}

impl DebugLogger {
    fn new(log_dir: &std::path::Path) -> std::io::Result<Self> {
        fs::create_dir_all(log_dir)?;
        let filename = format!("backtest_{}.log", Local::now().format("%Y%m%d_%H%M%S"));
        let log_path = log_dir.join(&filename);
        let file = OpenOptions::new()
            .create(true)
            .append(true)
            .open(&log_path)?;
        Ok(Self { file, log_path })
    }

    async fn log(&mut self, msg: &str) {
        let line = format!("[{}] {}\n", Local::now().format("%H:%M:%S"), msg);
        let _ = self.file.write_all(line.as_bytes());
        let _ = self.file.flush();
        println!("{}", line.trim_end());
    }
}

pub async fn init_debug_logger(log_dir: &std::path::Path) -> Result<(), String> {
    let mut logger = LOGGER.lock().await;
    match DebugLogger::new(log_dir) {
        Ok(l) => {
            let log_path = l.log_path.clone();
            *logger = Some(l);
            println!("✓ Logger initialized at: {:?}", log_path);
            Ok(())
        }
        Err(e) => {
            let err_msg = format!("Failed to initialize logger: {}", e);
            eprintln!("{}", err_msg);
            Err(err_msg)
        }
    }
}

pub async fn log_message(msg: &str) {
    let mut logger = LOGGER.lock().await;
    if let Some(l) = logger.as_mut() {
        l.log(msg).await;
    } else {
        println!("[LOG] {}", msg);
    }
}

pub async fn log_detailed(level: &str, msg: &str) {
    let formatted = format!("[{}] {}", level, msg);
    log_message(&formatted).await;
}

pub async fn log_info(msg: &str) {
    log_detailed("INFO", msg).await;
}

pub async fn log_warn(msg: &str) {
    log_detailed("WARN", msg).await;
}

pub async fn log_error(msg: &str) {
    log_detailed("ERROR", msg).await;
}
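The logger is a process-wide singleton: init_debug_logger opens logs/backtest_<timestamp>.log once, and every log_info / log_warn / log_error call both appends to that file and echoes to stdout; before initialization the calls fall back to a plain println. A minimal usage sketch mirroring the main.rs wiring shown earlier (the message strings are illustrative):

    use event_backtest_engine::util::{directories::DataPaths, logger};

    #[tokio::main]
    async fn main() -> anyhow::Result<()> {
        let paths = DataPaths::new(".")?;

        // Creates logs/backtest_<timestamp>.log; init errors come back as a String.
        logger::init_debug_logger(paths.logs_dir())
            .await
            .map_err(|e| anyhow::anyhow!(e))?;

        logger::log_info("pipeline started").await;
        logger::log_warn("no cached chunks found").await;
        logger::log_error("example error path").await;
        Ok(())
    }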
4	src/util/mod.rs	Normal file
@@ -0,0 +1,4 @@

// src/util/mod.rs
pub mod logger;
pub mod directories;
pub mod opnv;
281	src/util/opnv.rs	Normal file
@@ -0,0 +1,281 @@

// src/util/opnv.rs

//! Module for fetching, downloading, and extracting OpenVPN configurations from VPNBook.
//!
//! This module provides functionality to scrape the VPNBook free VPN page using
//! a headless browser, handle potential consent popups, extract current credentials,
//! collect download URLs for OpenVPN ZIP files, download them, and then extract
//! the .ovpn files into a structured directory: cache/openvpn/<hostname>/<ovpn_filename>.
//! It is designed to fetch the most recent data on every run, as credentials and
//! server configurations change periodically.

use anyhow::{anyhow, Context, Result};
use fantoccini::{Client, Locator};
use reqwest;
use std::io::{self, Read};
use std::path::{Path, PathBuf};
use tokio::fs::File;
use tokio::io::AsyncWriteExt;
use url::Url;
use zip::ZipArchive;
use crate::scraper::webdriver::{ChromeDriverPool, ScrapeTask};
use crate::util::{logger, directories::DataPaths};

/// Fetches, downloads, and extracts the latest OpenVPN configurations from VPNBook.
///
/// This asynchronous function uses the provided `ChromeDriverPool` to scrape the
/// VPNBook free VPN page. It dismisses any consent popup if present, extracts the
/// current username and password, collects all OpenVPN ZIP download URLs, downloads
/// the ZIP files temporarily, extracts the .ovpn files into the specified directory
/// structure under `cache_dir`/openvpn/<hostname>/, and cleans up the ZIP files.
///
/// The directory structure is: cache/openvpn/<hostname>/<ovpn_filename>, where
/// <hostname> is derived from the ZIP filename (e.g., "ca149.vpnbook.com").
///
/// The function ensures fresh data is fetched each time it runs, making it suitable
/// for periodic updates where credentials may change.
///
/// # Arguments
///
/// * `pool` - A reference to the `ChromeDriverPool` for managing browser instances.
/// * `cache_dir` - The path to the base cache directory. The OpenVPN files will be saved
///   under `cache_dir`/openvpn/<hostname>/.
///
/// # Returns
///
/// A `Result` containing a tuple with:
/// - `String`: The scraped username.
/// - `String`: The scraped password.
/// - `Vec<PathBuf>`: Paths to the extracted .ovpn files.
///
/// # Errors
///
/// Returns an `anyhow::Error` if:
/// - Navigation to the page fails.
/// - The consent popup cannot be dismissed (if present).
/// - Credentials cannot be parsed from the page.
/// - Download URLs cannot be found or are invalid.
/// - HTTP downloads fail or file writing errors occur.
/// - ZIP extraction fails (e.g., invalid ZIP or I/O errors).
///
/// # Dependencies
///
/// This function requires the following crates (add to Cargo.toml if not present):
/// - `anyhow` for error handling.
/// - `fantoccini` for browser automation.
/// - `reqwest` (with `tokio` features) for HTTP downloads.
/// - `tokio` for asynchronous file operations.
/// - `url` for URL manipulation.
/// - `zip` for ZIP extraction.
///
/// # Examples
///
/// ```no_run
/// use anyhow::Result;
/// use event_backtest_engine::util::opnv::fetch_vpnbook_configs;
/// use event_backtest_engine::scraper::webdriver::ChromeDriverPool;
/// use std::path::Path;
///
/// #[tokio::main]
/// async fn main() -> Result<()> {
///     let pool = ChromeDriverPool::new(1).await?;
///     let (username, password, files) =
///         fetch_vpnbook_configs(&pool, Path::new("./cache")).await?;
///     println!("Username: {}, Password: {}", username, password);
///     for file in files {
///         println!("Extracted: {:?}", file);
///     }
///     Ok(())
/// }
/// ```
pub async fn fetch_vpnbook_configs(
    pool: &ChromeDriverPool,
    cache_dir: &Path,
) -> Result<(String, String, Vec<PathBuf>)> {
    // Prepare the openvpn directory
    let dir = DataPaths::new(".")?;
    let vpn_dir = dir.cache_openvpn_dir();
    tokio::fs::create_dir_all(&vpn_dir)
        .await
        .context("Failed to create openvpn directory")?;

    // Temporary directory for ZIP downloads (under cache for consistency)
    let temp_dir = cache_dir.join("temp_vpn_zips");
    tokio::fs::create_dir_all(&temp_dir)
        .await
        .context("Failed to create temp directory")?;

    let url = "https://www.vpnbook.com/freevpn".to_string();

    // Define the scraping task
    let task = ScrapeTask::new(url, |client: Client| async move {
        // Attempt to dismiss consent popup if present
        let consent_selector = r#"body > div.fc-consent-root > div.fc-dialog-container > div.fc-dialog.fc-choice-dialog > div.fc-footer-buttons-container > div.fc-footer-buttons > button.fc-button.fc-cta-do-not-consent.fc-secondary-button > p"#;
        if let Ok(consent_elem) = client.find(Locator::Css(consent_selector)).await {
            consent_elem
                .click()
                .await
                .context("Failed to click consent dismissal button")?;
            // Brief delay to allow popup to close
            tokio::time::sleep(std::time::Duration::from_secs(1)).await;
        }

        // Find all <code> elements
        let codes = client
            .find_all(Locator::Css("code"))
            .await
            .context("Failed to find code elements")?;

        if codes.len() < 2 {
            return Err(anyhow!("Insufficient code elements found for credentials"));
        }

        // The first <code> is username, second is password
        let username = codes[0]
            .text()
            .await
            .context("Failed to get username text")?;

        let password = codes[1]
            .text()
            .await
            .context("Failed to get password text")?;

        // Locate all download links for OpenVPN ZIP files
        let links = client
            .find_all(Locator::Css(r#"a[href^="/free-openvpn-account/"][download=""]"#))
            .await
            .context("Failed to find download links")?;

        // Collect relative hrefs
        let mut rel_urls = Vec::new();
        for link in links {
            if let Some(href) = link.attr("href").await.context("Failed to get href attribute")? {
                rel_urls.push(href);
            }
        }

        Ok((username, password, rel_urls))
    });

    // Execute the scraping task using the pool
    let (username, password, rel_urls) = task.execute_with_pool(pool).await?;

    // Base URL for resolving relative paths
    let base_url = Url::parse("https://www.vpnbook.com/")?;

    // Download each ZIP file to temp_dir
    let mut zip_paths = Vec::new();
    for rel in &rel_urls {
        let full_url = base_url.join(rel).context("Failed to join URL")?;
        let filename = rel
            .split('/')
            .last()
            .ok_or_else(|| anyhow!("Invalid filename in URL"))?
            .to_string();
        let out_path = temp_dir.join(&filename);

        // Perform HTTP GET request
        let resp = reqwest::get(full_url.clone())
            .await
            .with_context(|| format!("Failed to send download request for {}", full_url))?;

        if resp.status().is_success() {
            let bytes = resp
                .bytes()
                .await
                .context("Failed to read response bytes")?;

            // Write to file asynchronously
            let mut file = File::create(&out_path)
                .await
                .context("Failed to create output file")?;
            file.write_all(&bytes)
                .await
                .context("Failed to write to file")?;

            zip_paths.push(out_path);
        } else {
            return Err(anyhow!(
                "Download failed with status: {} for URL: {}",
                resp.status(),
                full_url
            ));
        }
    }

    // Now extract .ovpn files from each ZIP
    let mut extracted_paths = Vec::new();
    for zip_path in zip_paths {
        let hostname = get_hostname_from_zip_filename(
            zip_path.file_name().unwrap().to_str().unwrap(),
        );
        let hostname_dir = vpn_dir.join(&hostname);
        tokio::fs::create_dir_all(&hostname_dir)
            .await
            .context("Failed to create hostname directory")?;

        // Use spawn_blocking for sync ZIP operations
        let zip_path_clone = zip_path.clone();
        let hostname_dir_clone = hostname_dir.clone();
        let extract_result = tokio::task::spawn_blocking(move || {
            let file = std::fs::File::open(&zip_path_clone)
                .with_context(|| format!("Failed to open ZIP file: {:?}", zip_path_clone))?;
            let mut archive = ZipArchive::new(file)
                .with_context(|| format!("Failed to read ZIP archive: {:?}", zip_path_clone))?;

            let mut paths = Vec::new();
            for i in 0..archive.len() {
                let mut zip_file = archive.by_index(i)?;
                if zip_file.name().ends_with(".ovpn") {
                    // Get just the filename, stripping any path
                    let file_name = Path::new(zip_file.name()).file_name()
                        .ok_or_else(|| anyhow!("Invalid file name in ZIP: {}", zip_file.name()))?
                        .to_str()
                        .ok_or_else(|| anyhow!("Invalid UTF-8 in file name: {}", zip_file.name()))?
                        .to_string();
                    let target_path = hostname_dir_clone.join(file_name);
                    let mut content = Vec::new();
                    zip_file.read_to_end(&mut content)?;

                    std::fs::write(&target_path, &content)
                        .with_context(|| format!("Failed to write .ovpn file: {:?}", target_path))?;
                    paths.push(target_path);
                }
            }
            Ok::<Vec<PathBuf>, anyhow::Error>(paths)
        })
        .await
        .context("Spawn blocking failed")??;

        extracted_paths.extend(extract_result);

        // Clean up the ZIP file after extraction
        tokio::fs::remove_file(&zip_path)
            .await
            .context("Failed to remove temp ZIP file")?;
    }

    // Optional: Clean up temp_dir if empty
    let _ = tokio::fs::remove_dir(&temp_dir).await;

    Ok((username, password, extracted_paths))
}

/// Derives the hostname from the ZIP filename.
///
/// For example, "vpnbook-openvpn-ca149.zip" -> "ca149.vpnbook.com"
///
/// If the format doesn't match, returns "unknown.vpnbook.com".
fn get_hostname_from_zip_filename(filename: &str) -> String {
    if filename.starts_with("vpnbook-openvpn-") && filename.ends_with(".zip") {
        let code = filename
            .strip_prefix("vpnbook-openvpn-")
            .unwrap()
            .strip_suffix(".zip")
            .unwrap();
        format!("{}.vpnbook.com", code)
    } else {
        "unknown.vpnbook.com".to_string()
    }
}
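The hostname derivation at the end of opnv.rs is pure string manipulation, so it is easy to pin down with a unit test. A sketch of a test module that could be appended to the file (not part of the diff); it only exercises the mapping documented above:

    #[cfg(test)]
    mod tests {
        use super::*;

        #[test]
        fn hostname_is_derived_from_zip_name() {
            // Matching VPNBook naming scheme: server code becomes the subdomain.
            assert_eq!(
                get_hostname_from_zip_filename("vpnbook-openvpn-ca149.zip"),
                "ca149.vpnbook.com"
            );
            // Anything else falls back to the documented placeholder hostname.
            assert_eq!(
                get_hostname_from_zip_filename("something-else.zip"),
                "unknown.vpnbook.com"
            );
        }
    }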