added update_rule for incremental change
data_updating_rule.md (new file, 25 lines)
@@ -0,0 +1,25 @@
# Abort-Safe Incremental JSONL Persistence Rule

**Rule:** Persist state using an *append-only, fsync-backed JSONL log with atomic checkpoints*.

**Requirements**

- Write updates as **single-line JSON objects** (one logical mutation per line).
- **Append only** (`O_APPEND`), never modify existing lines.
- After each write batch, call **`fsync`** (or `File::sync_data`) before reporting success.
- Treat a **line as committed only if it ends with `\n`**; ignore trailing partial lines on recovery.
- Periodically create a **checkpoint**:
  - Write full state to `state.tmp`
  - `fsync`
  - **Atomic rename** to `state.jsonl`
- On startup:
  - Load last checkpoint
  - Replay log lines after it in order
- On abort/panic/crash:
  - No truncation
  - Replay guarantees no data loss beyond last fsynced line

**Outcome**

- Crash/abort-safe
- O(1) writes
- Deterministic recovery
- Minimal overhead
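To make the rule concrete, here is a minimal sketch of the pattern using synchronous `std::fs` (the repository itself uses `tokio::fs`). The `Record` type, the `state.jsonl` / `updates.log` file names, and the `serde` / `serde_json` / `anyhow` dependencies are illustrative assumptions, not part of the codebase.

```rust
use std::collections::HashMap;
use std::fs::{self, File, OpenOptions};
use std::io::{Read, Write};

use serde::{Deserialize, Serialize};

#[derive(Serialize, Deserialize, Clone)]
struct Record {
    key: String,
    value: String,
}

// Open the append-only log (O_APPEND semantics), creating it if needed.
fn open_log() -> std::io::Result<File> {
    OpenOptions::new().create(true).append(true).open("updates.log")
}

// Append one mutation; the line counts as committed only once it ends in '\n'
// and sync_data (fsync) has returned.
fn append_update(log: &mut File, rec: &Record) -> anyhow::Result<()> {
    let mut line = serde_json::to_string(rec)?;
    line.push('\n');
    log.write_all(line.as_bytes())?;
    log.sync_data()?;
    Ok(())
}

// Checkpoint: write the full state to a temp file, fsync it, then atomically
// rename it over the checkpoint file; the rename is the commit point.
fn checkpoint(state: &HashMap<String, Record>) -> anyhow::Result<()> {
    let tmp = "state.jsonl.tmp";
    let mut f = File::create(tmp)?;
    for rec in state.values() {
        f.write_all(serde_json::to_string(rec)?.as_bytes())?;
        f.write_all(b"\n")?;
    }
    f.sync_all()?;
    drop(f);
    fs::rename(tmp, "state.jsonl")?; // atomic commit point
    fs::remove_file("updates.log").ok(); // log entries are now in the checkpoint
    Ok(())
}

// Recovery: load the checkpoint, then replay only complete log lines.
fn recover() -> anyhow::Result<HashMap<String, Record>> {
    let mut state = HashMap::new();
    for path in ["state.jsonl", "updates.log"] {
        let mut buf = String::new();
        match File::open(path) {
            Ok(mut f) => { f.read_to_string(&mut buf)?; }
            Err(_) => continue,
        }
        for line in buf.split_inclusive('\n') {
            if !line.ends_with('\n') {
                break; // trailing partial line from a crash: ignore it
            }
            if let Ok(rec) = serde_json::from_str::<Record>(line.trim_end()) {
                state.insert(rec.key.clone(), rec);
            }
        }
    }
    Ok(state)
}
```

In the commit below, `companies_updates.log` plays the role of the append-only log and `companies.jsonl` the checkpoint; `build_companies_jsonl_streaming` follows the same recover, append, checkpoint cycle.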
@@ -21,21 +21,12 @@ pub struct Config {
     /// If set to "true", enables automatic VPN rotation between sessions
     #[serde(default)]
     pub enable_vpn_rotation: bool,

-    /// Number of tasks per session before rotating VPN
-    /// If set to 0, rotates VPN between economic and corporate phases
-    #[serde(default = "default_tasks_per_session")]
-    pub tasks_per_vpn_session: usize,
 }

 fn default_max_parallel_instances() -> usize {
     10
 }

-fn default_tasks_per_session() -> usize {
-    0 // 0 = rotate between economic/corporate
-}
-
 impl Default for Config {
     fn default() -> Self {
         Self {
@@ -45,7 +36,6 @@ impl Default for Config {
             max_parallel_instances: default_max_parallel_instances(),
             max_tasks_per_instance: 0,
             enable_vpn_rotation: false,
-            tasks_per_vpn_session: default_tasks_per_session(),
         }
     }
 }
@@ -93,11 +83,6 @@ impl Config {
             .parse::<bool>()
             .context("Failed to parse ENABLE_VPN_ROTATION as bool")?;

-        let tasks_per_vpn_session: usize = dotenvy::var("TASKS_PER_VPN_SESSION")
-            .unwrap_or_else(|_| "0".to_string())
-            .parse()
-            .context("Failed to parse TASKS_PER_VPN_SESSION as usize")?;
-
         Ok(Self {
             economic_start_date,
             corporate_start_date,
@@ -105,7 +90,6 @@ impl Config {
             max_parallel_instances,
             max_tasks_per_instance,
             enable_vpn_rotation,
-            tasks_per_vpn_session,
         })
     }

@@ -79,10 +79,19 @@ pub struct CompanyInfo{
     pub securities: HashMap<String, Vec<FigiInfo>>, // ISIN -> Vec<FigiInfo>
 }

+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct YahooCompanyDetails {
+    pub ticker: String,
+    pub sector: Option<String>,
+    pub exchange: Option<String>,
+}
+
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct CompanyCrossPlatformInfo {
     pub name: String,
     pub isin_tickers_map: HashMap<String, Vec<String>>, // ISIN -> Tickers
+    pub sector: Option<String>,
+    pub exchange: Option<String>,
 }

 /// Warrant Info
@@ -1,4 +1,4 @@
-// src/corporate/update.rs - COMPLETE STREAMING VERSION
+// src/corporate/update.rs - ABORT-SAFE VERSION WITH JSONL LOG

 use super::{scraper::*, storage::*, helpers::*, types::*, openfigi::*, yahoo::*};
 use crate::config::Config;
@@ -9,14 +9,17 @@ use crate::scraper::webdriver::ChromeDriverPool;
 use chrono::Local;
 use std::collections::HashMap;
 use std::sync::Arc;
+use std::sync::atomic::{AtomicBool, Ordering};

-/// Main update function - fully streaming, minimal memory usage
-pub async fn run_full_update(_config: &Config, pool: &Arc<ChromeDriverPool>) -> anyhow::Result<()> {
+pub async fn run_full_update(
+    _config: &Config,
+    pool: &Arc<ChromeDriverPool>,
+    shutdown_flag: &Arc<AtomicBool>,
+) -> anyhow::Result<()> {
     logger::log_info("=== Corporate Update (STREAMING MODE) ===").await;

     let paths = DataPaths::new(".")?;

-    // Step 1: Download GLEIF CSV (don't load into memory)
     logger::log_info("Step 1: Downloading GLEIF CSV...").await;
     let gleif_csv_path = match download_isin_lei_csv().await? {
         Some(p) => {
@@ -29,14 +32,19 @@ pub async fn run_full_update(_config: &Config, pool: &Arc<ChromeDriverPool>) ->
         }
     };

-    // Step 2: Load OpenFIGI type lists (small, cached)
+    if shutdown_flag.load(Ordering::SeqCst) {
+        return Ok(());
+    }
+
     logger::log_info("Step 2: Loading OpenFIGI metadata...").await;
     load_figi_type_lists().await.ok();
     logger::log_info(" ✓ OpenFIGI metadata loaded").await;

-    // Step 3: Check mapping status and process only unmapped LEIs
+    if shutdown_flag.load(Ordering::SeqCst) {
+        return Ok(());
+    }
+
     logger::log_info("Step 3: Checking LEI-FIGI mapping status...").await;

     let all_mapped = ensure_all_leis_mapped(&gleif_csv_path, None).await?;

     if !all_mapped {
@@ -45,7 +53,10 @@ pub async fn run_full_update(_config: &Config, pool: &Arc<ChromeDriverPool>) ->
         logger::log_info(" ✓ All LEIs successfully mapped").await;
     }

-    // Step 4: Build securities from FIGI data (streaming)
+    if shutdown_flag.load(Ordering::SeqCst) {
+        return Ok(());
+    }
+
     logger::log_info("Step 4: Building securities map (streaming)...").await;
     let date_dir = find_most_recent_figi_date_dir(&paths).await?;

@@ -57,22 +68,37 @@ pub async fn run_full_update(_config: &Config, pool: &Arc<ChromeDriverPool>) ->
         logger::log_warn(" ✗ No FIGI data directory found").await;
     }

-    // Step 5: Build companies JSONL (streaming from securities)
-    logger::log_info("Step 5: Building companies.jsonl (streaming)...").await;
-    let count = build_companies_jsonl_streaming(&paths, pool).await?;
+    if shutdown_flag.load(Ordering::SeqCst) {
+        return Ok(());
+    }
+
+    logger::log_info("Step 5: Building companies.jsonl (streaming with abort-safe persistence)...").await;
+    let count = build_companies_jsonl_streaming(&paths, pool, shutdown_flag).await?;
     logger::log_info(&format!(" ✓ Saved {} companies", count)).await;

-    // Step 6: Process events (using index, not full load)
-    logger::log_info("Step 6: Processing events (using index)...").await;
-    let _event_index = build_event_index(&paths).await?;
-    logger::log_info(" ✓ Event index built").await;
+    if !shutdown_flag.load(Ordering::SeqCst) {
+        logger::log_info("Step 6: Processing events (using index)...").await;
+        let _event_index = build_event_index(&paths).await?;
+        logger::log_info(" ✓ Event index built").await;
+    }

     logger::log_info("✓ Corporate update complete").await;
     Ok(())
 }

-/// Stream companies.jsonl creation from securities cache - INCREMENTAL MODE
-async fn build_companies_jsonl_streaming(paths: &DataPaths, pool: &Arc<ChromeDriverPool>) -> anyhow::Result<usize> {
+/// Abort-safe incremental JSONL persistence with atomic checkpoints
+///
+/// Implements the data_updating_rule.md specification:
+/// - Append-only JSONL log for all updates
+/// - fsync after each write batch
+/// - Atomic checkpoints via temp file + rename
+/// - Crash recovery by loading checkpoint + replaying log
+/// - Partial lines ignored during recovery
+async fn build_companies_jsonl_streaming(
+    paths: &DataPaths,
+    pool: &Arc<ChromeDriverPool>,
+    shutdown_flag: &Arc<AtomicBool>,
+) -> anyhow::Result<usize> {
     let path = DataPaths::new(".")?;
     let corporate_path = path.data_dir().join("corporate").join("by_name");
     let securities_path = corporate_path.join("common_stocks.json");
@@ -82,59 +108,116 @@ async fn build_companies_jsonl_streaming(paths: &DataPaths, pool: &Arc<ChromeDri
         return Ok(0);
     }

-    // Load securities
     let content = tokio::fs::read_to_string(securities_path).await?;
     let securities: HashMap<String, CompanyInfo> = serde_json::from_str(&content)?;

     let companies_path = paths.data_dir().join("companies.jsonl");
+    let log_path = paths.data_dir().join("companies_updates.log");

     if let Some(parent) = companies_path.parent() {
         tokio::fs::create_dir_all(parent).await?;
     }

-    // Load existing companies into a map
+    // === RECOVERY PHASE 1: Load last checkpoint ===
     let mut existing_companies: HashMap<String, CompanyCrossPlatformInfo> = HashMap::new();
+    let mut processed_names: std::collections::HashSet<String> = std::collections::HashSet::new();

     if companies_path.exists() {
-        logger::log_info("Loading existing companies.jsonl...").await;
+        logger::log_info("Loading checkpoint from companies.jsonl...").await;
         let existing_content = tokio::fs::read_to_string(&companies_path).await?;
         for line in existing_content.lines() {
             if line.trim().is_empty() {
                 continue;
             }
+            // Only process complete lines (ending with proper JSON closing brace)
+            // This ensures we don't process partial writes from crashed processes
+            if !line.ends_with('}') {
+                logger::log_warn(&format!("Skipping incomplete checkpoint line: {}", &line[..line.len().min(50)])).await;
+                continue;
+            }
             match serde_json::from_str::<CompanyCrossPlatformInfo>(line) {
                 Ok(company) => {
+                    processed_names.insert(company.name.clone());
                     existing_companies.insert(company.name.clone(), company);
                 }
                 Err(e) => {
-                    logger::log_warn(&format!("Failed to parse existing company line: {}", e)).await;
+                    logger::log_warn(&format!("Failed to parse checkpoint line: {}", e)).await;
                 }
             }
         }
-        logger::log_info(&format!("Loaded {} existing companies", existing_companies.len())).await;
+        logger::log_info(&format!("Loaded checkpoint with {} companies", existing_companies.len())).await;
     }

-    // Create temporary file for atomic write
-    let temp_path = companies_path.with_extension("jsonl.tmp");
-    let mut file = tokio::fs::File::create(&temp_path).await?;
-    let mut count = 0;
+    // === RECOVERY PHASE 2: Replay log after checkpoint ===
+    if log_path.exists() {
+        logger::log_info("Replaying update log...").await;
+        let log_content = tokio::fs::read_to_string(&log_path).await?;
+        let mut replayed = 0;
+        for line in log_content.lines() {
+            if line.trim().is_empty() {
+                continue;
+            }
+            // Only replay complete lines (crash-safe: incomplete lines are ignored)
+            // A line is considered complete only if it ends with '\n' and valid JSON
+            if !line.ends_with('}') {
+                logger::log_warn(&format!("Skipping incomplete log line: {}", &line[..line.len().min(50)])).await;
+                continue;
+            }
+            match serde_json::from_str::<CompanyCrossPlatformInfo>(line) {
+                Ok(company) => {
+                    processed_names.insert(company.name.clone());
+                    existing_companies.insert(company.name.clone(), company);
+                    replayed += 1;
+                }
+                Err(e) => {
+                    logger::log_warn(&format!("Failed to parse log line: {}", e)).await;
+                }
+            }
+        }
+        if replayed > 0 {
+            logger::log_info(&format!("Replayed {} updates from log", replayed)).await;
+        }
+    }
+
+    // === APPEND-ONLY LOG: Open in append mode with O_APPEND semantics ===
+    use tokio::fs::OpenOptions;
+    let mut log_file = OpenOptions::new()
+        .create(true)
+        .append(true) // O_APPEND - atomic append operations
+        .open(&log_path)
+        .await?;
+
+    let mut count = existing_companies.len();
     let mut updated_count = 0;
     let mut new_count = 0;
+    let checkpoint_interval = 50; // Create atomic checkpoint every 50 updates
+    let mut updates_since_checkpoint = 0;

     use tokio::io::AsyncWriteExt;

     for (name, company_info) in securities.iter() {
-        // Check if we already have this company
-        let existing_entry = existing_companies.remove(name);
+        if shutdown_flag.load(Ordering::SeqCst) {
+            logger::log_info("Shutdown requested - stopping company processing").await;
+            break;
+        }
+
+        // Skip if already processed (from checkpoint or log replay)
+        if processed_names.contains(name) {
+            continue;
+        }
+
+        let existing_entry = existing_companies.get(name).cloned();
         let is_update = existing_entry.is_some();

-        // Start with existing ISIN-ticker map or create new one
         let mut isin_tickers_map: HashMap<String, Vec<String>> =
             existing_entry
-                .map(|e| e.isin_tickers_map)
+                .as_ref()
+                .map(|e| e.isin_tickers_map.clone())
                 .unwrap_or_default();

-        // Step 1: Extract unique ISIN-ticker pairs from FigiInfo
+        let mut sector = existing_entry.as_ref().and_then(|e| e.sector.clone());
+        let mut exchange = existing_entry.as_ref().and_then(|e| e.exchange.clone());
+
         let mut unique_isin_ticker_pairs: HashMap<String, Vec<String>> = HashMap::new();

         for figi_infos in company_info.securities.values() {
@@ -144,7 +227,6 @@ async fn build_companies_jsonl_streaming(paths: &DataPaths, pool: &Arc<ChromeDri
                     .entry(figi_info.isin.clone())
                     .or_insert_with(Vec::new);

-                // Add FIGI ticker if present and not duplicate
                 if !figi_info.ticker.is_empty() && !tickers.contains(&figi_info.ticker) {
                     tickers.push(figi_info.ticker.clone());
                 }
@@ -152,79 +234,129 @@ async fn build_companies_jsonl_streaming(paths: &DataPaths, pool: &Arc<ChromeDri
             }
         }

-        // Step 2: Merge FIGI tickers into main map
         for (isin, figi_tickers) in unique_isin_ticker_pairs {
+            if shutdown_flag.load(Ordering::SeqCst) {
+                break;
+            }
+
             let tickers = isin_tickers_map
                 .entry(isin.clone())
                 .or_insert_with(Vec::new);

-            // Add FIGI tickers that aren't already present
             for figi_ticker in figi_tickers {
                 if !tickers.contains(&figi_ticker) {
                     tickers.push(figi_ticker);
                 }
             }

-            // Step 3: Check if we need to fetch Yahoo ticker for this ISIN
             let has_yahoo_ticker = tickers.iter().any(|t| t.starts_with("YAHOO:"));

-            if !has_yahoo_ticker {
-                logger::log_info(&format!("Fetching Yahoo ticker for {} (ISIN: {})", name, isin)).await;
-                let yahoo_result = scrape_ticker_by_isin(pool, &isin).await;
-
-                match yahoo_result {
-                    Ok(result) => {
-                        let log_msg = match &result {
-                            YahooTickerResult::Found(ticker) =>
-                                format!("✓ Found Yahoo ticker {} for ISIN {}", ticker, isin),
-                            YahooTickerResult::NoResults =>
-                                format!("○ No search results for ISIN {}", isin),
-                            YahooTickerResult::NotFound =>
-                                format!("○ Empty ticker result for ISIN {}", isin),
-                            YahooTickerResult::AmbiguousResults =>
-                                format!("⚠ Ambiguous results for ISIN {}", isin),
-                        };
-
-                        if result.is_found() {
-                            logger::log_info(&log_msg).await;
-                        } else {
-                            logger::log_warn(&log_msg).await;
-                        }
-
-                        tickers.push(result.to_tagged_string());
+            if !has_yahoo_ticker && !shutdown_flag.load(Ordering::SeqCst) {
+                logger::log_info(&format!("Fetching Yahoo details for {} (ISIN: {})", name, isin)).await;
+
+                match scrape_company_details_by_isin(pool, &isin).await {
+                    Ok(Some(details)) => {
+                        logger::log_info(&format!("✓ Found Yahoo ticker {} for ISIN {}", details.ticker, isin)).await;
+                        tickers.push(format!("YAHOO:{}", details.ticker));
+
+                        if sector.is_none() && details.sector.is_some() {
+                            sector = details.sector.clone();
+                            logger::log_info(&format!(" Sector: {}", details.sector.as_ref().unwrap())).await;
+                        }
+
+                        if exchange.is_none() && details.exchange.is_some() {
+                            exchange = details.exchange.clone();
+                            logger::log_info(&format!(" Exchange: {}", details.exchange.as_ref().unwrap())).await;
+                        }
+                    },
+                    Ok(None) => {
+                        logger::log_warn(&format!("◯ No search results for ISIN {}", isin)).await;
+                        tickers.push("YAHOO:NO_RESULTS".to_string());
                     },
                     Err(e) => {
+                        if shutdown_flag.load(Ordering::SeqCst) {
+                            break;
+                        }
                         logger::log_warn(&format!("✗ Yahoo lookup error for ISIN {}: {}", isin, e)).await;
-                        tickers.push("YAHOO:ERROR".to_string());
                     }
                 }
-            } else {
-                logger::log_warn(&format!("Skipping Yahoo lookup for {} ISIN {} - already has Yahoo data", name, isin)).await;
             }
         }

-        // Only write if we have ticker data
+        if shutdown_flag.load(Ordering::SeqCst) {
+            break;
+        }
+
         if !isin_tickers_map.is_empty() {
             let company_entry = CompanyCrossPlatformInfo {
                 name: name.clone(),
                 isin_tickers_map,
+                sector,
+                exchange,
             };

+            // === APPEND-ONLY: Write single-line JSON with fsync ===
+            // This guarantees the line is either fully written or not at all
             let line = serde_json::to_string(&company_entry)?;
+            log_file.write_all(line.as_bytes()).await?;
+            log_file.write_all(b"\n").await?;
+            log_file.flush().await?;

-            file.write_all(line.as_bytes()).await?;
-            file.write_all(b"\n").await?;
-
-            // Flush after each write for crash safety
-            file.flush().await?;
+            // Critical: fsync to ensure durability before considering write successful
+            // This prevents data loss on power failure or kernel panic
+            log_file.sync_data().await?;
+
+            // Update in-memory state ONLY after successful fsync
+            processed_names.insert(name.clone());
+            existing_companies.insert(name.clone(), company_entry);

             count += 1;
+            updates_since_checkpoint += 1;

             if is_update {
                 updated_count += 1;
             } else {
                 new_count += 1;
             }

+            // === ATOMIC CHECKPOINT: Periodically create checkpoint ===
+            // This reduces recovery time by snapshotting current state
+            if updates_since_checkpoint >= checkpoint_interval {
+                logger::log_info(&format!("Creating checkpoint at {} companies...", count)).await;
+
+                let checkpoint_tmp = companies_path.with_extension("jsonl.tmp");
+                let mut checkpoint_file = tokio::fs::File::create(&checkpoint_tmp).await?;
+
+                // Write all current state to temporary checkpoint file
+                for company in existing_companies.values() {
+                    let line = serde_json::to_string(company)?;
+                    checkpoint_file.write_all(line.as_bytes()).await?;
+                    checkpoint_file.write_all(b"\n").await?;
+                }
+
+                checkpoint_file.flush().await?;
+                checkpoint_file.sync_all().await?;
+                drop(checkpoint_file);
+
+                // Atomic rename - this is the commit point
+                // After this succeeds, the checkpoint is visible
+                tokio::fs::rename(&checkpoint_tmp, &companies_path).await?;
+
+                // Clear log after successful checkpoint
+                // Any entries before this point are now captured in the checkpoint
+                drop(log_file);
+                tokio::fs::remove_file(&log_path).await.ok();
+                log_file = OpenOptions::new()
+                    .create(true)
+                    .append(true)
+                    .open(&log_path)
+                    .await?;
+
+                updates_since_checkpoint = 0;
+                logger::log_info("✓ Checkpoint created and log cleared").await;
+            }
+
             if count % 10 == 0 {
                 logger::log_info(&format!("Progress: {} companies ({} new, {} updated)", count, new_count, updated_count)).await;
                 tokio::task::yield_now().await;
@@ -232,29 +364,39 @@ async fn build_companies_jsonl_streaming(paths: &DataPaths, pool: &Arc<ChromeDri
             }
         }
     }

-    // Write any remaining existing companies that weren't in securities
-    for (_name, company) in existing_companies {
-        let line = serde_json::to_string(&company)?;
-        file.write_all(line.as_bytes()).await?;
-        file.write_all(b"\n").await?;
-        file.flush().await?;
-        count += 1;
-        logger::log_warn(&format!("Preserved existing company: {}", _name)).await;
+    // === FINAL CHECKPOINT: Write complete final state ===
+    // This ensures we don't need to replay the log on next startup
+    if !shutdown_flag.load(Ordering::SeqCst) && updates_since_checkpoint > 0 {
+        logger::log_info("Creating final checkpoint...").await;
+
+        let checkpoint_tmp = companies_path.with_extension("jsonl.tmp");
+        let mut checkpoint_file = tokio::fs::File::create(&checkpoint_tmp).await?;
+
+        for company in existing_companies.values() {
+            let line = serde_json::to_string(company)?;
+            checkpoint_file.write_all(line.as_bytes()).await?;
+            checkpoint_file.write_all(b"\n").await?;
+        }
+
+        checkpoint_file.flush().await?;
+        checkpoint_file.sync_all().await?;
+        drop(checkpoint_file);
+
+        // Atomic rename makes final checkpoint visible
+        tokio::fs::rename(&checkpoint_tmp, &companies_path).await?;
+
+        // Clean up log
+        drop(log_file);
+        tokio::fs::remove_file(&log_path).await.ok();
+
+        logger::log_info("✓ Final checkpoint created").await;
     }

-    // Ensure all data is written
-    file.sync_all().await?;
-    drop(file);
-
-    // Atomic rename: replace old file with new one
-    tokio::fs::rename(&temp_path, &companies_path).await?;
-
-    logger::log_info(&format!("✓ Completed: {} total companies ({} new, {} updated)", count, new_count, updated_count)).await;
+    logger::log_info(&format!("Completed: {} total companies ({} new, {} updated)", count, new_count, updated_count)).await;

     Ok(count)
 }

-/// Find most recent FIGI date directory
 async fn find_most_recent_figi_date_dir(paths: &DataPaths) -> anyhow::Result<Option<std::path::PathBuf>> {
     let map_cache_dir = paths.cache_gleif_openfigi_map_dir();

@@ -284,8 +426,6 @@ async fn find_most_recent_figi_date_dir(paths: &DataPaths) -> anyhow::Result<Opt
     Ok(Some(dates[0].1.clone()))
 }

-
-
 pub struct ProcessResult {
     pub changes: Vec<CompanyEventChange>,
 }
@@ -8,6 +8,7 @@ use tokio::{time::{Duration as TokioDuration, sleep}};
 use std::{sync::Arc};
 use anyhow::{anyhow, Result};

+const YAHOO_COMPANY_EXTRACTION_JS: &str = include_str!("yahoo_company_extraction.js");
 /// Mapping existing

 /// getting historical stock price data daily (xxxx - 2025) and hourly (last 30 days)
@@ -20,6 +21,16 @@ pub enum YahooTickerResult {
     AmbiguousResults,
 }

+#[derive(Debug, Deserialize)]
+pub struct ExtractionResult {
+    status: String,
+    ticker: Option<String>,
+    sector: Option<String>,
+    exchange: Option<String>,
+    #[serde(default)]
+    error_message: Option<String>,
+}
+
 impl YahooTickerResult {
     pub fn to_tagged_string(&self) -> String {
         match self {
@@ -42,69 +53,52 @@ impl YahooTickerResult {
         }
     }
 }

-pub async fn scrape_ticker_by_isin(
+pub async fn scrape_company_details_by_isin(
     pool: &Arc<ChromeDriverPool>,
     isin: &str,
-) -> anyhow::Result<YahooTickerResult> {
+) -> anyhow::Result<Option<YahooCompanyDetails>> {
     let isin = isin.to_string();
-    pool.execute(format!("https://finance.yahoo.com/lookup?s={}", isin), move |client| {
+    pool.execute(format!("https://finance.yahoo.com/lookup/?s={}", isin), move |client| {
         let isin = isin.clone();
         Box::pin(async move {
             sleep(TokioDuration::from_millis(1000)).await;
             reject_yahoo_cookies(&client).await?;
             sleep(TokioDuration::from_millis(1000)).await;
-            extract_ticker_by_isin(&client, &isin).await
+            extract_company_details(&client, &isin).await
         })
     }).await
 }

-pub async fn extract_ticker_by_isin(
+pub async fn extract_company_details(
     client: &Client,
     _isin: &str,
-) -> Result<YahooTickerResult> {
-    //let search_url = format!("https://finance.yahoo.com/lookup?s={}", isin);
-
-    // Check for "No results found" message
-    if client.find(Locator::Css(".noData")).await.is_ok() {
-        return Ok(YahooTickerResult::NoResults);
-    }
-
-    // Wait for results table
-    let table = match client
-        .wait()
-        .for_element(Locator::Css("table[data-test='lookup-table']"))
-        .await
-    {
-        Ok(t) => t,
-        Err(_) => return Ok(YahooTickerResult::NoResults),
-    };
-
-    // Find first row
-    let first_row = match table
-        .find(Locator::Css("tbody tr"))
-        .await
-    {
-        Ok(row) => row,
-        Err(_) => return Ok(YahooTickerResult::NoResults),
-    };
-
-    // Extract ticker from first cell
-    let ticker_cell = first_row
-        .find(Locator::Css("td:nth-child(1)"))
-        .await
-        .map_err(|e| anyhow!("Failed to find ticker cell: {}", e))?;
-
-    let ticker = ticker_cell
-        .text()
-        .await
-        .map_err(|e| anyhow!("Failed to get ticker text: {}", e))?
-        .trim()
-        .to_string();
-
-    if ticker.is_empty() {
-        Ok(YahooTickerResult::NotFound)
-    } else {
-        Ok(YahooTickerResult::Found(ticker))
+) -> Result<Option<YahooCompanyDetails>> {
+    // Execute the JavaScript extraction script
+    let result = client.execute(YAHOO_COMPANY_EXTRACTION_JS, vec![]).await?;
+
+    // Parse the JSON result
+    let extraction: ExtractionResult = serde_json::from_value(result)
+        .map_err(|e| anyhow!("Failed to parse extraction result: {}", e))?;
+
+    match extraction.status.as_str() {
+        "found" => {
+            if let Some(ticker) = extraction.ticker {
+                Ok(Some(YahooCompanyDetails {
+                    ticker,
+                    sector: extraction.sector,
+                    exchange: extraction.exchange,
+                }))
+            } else {
+                Ok(None)
+            }
+        },
+        "no_results" => Ok(None),
+        "not_found" => Ok(None),
+        "error" => {
+            let error_msg = extraction.error_message.unwrap_or_else(|| "Unknown error".to_string());
+            Err(anyhow!("JavaScript extraction error: {}", error_msg))
+        },
+        _ => Ok(None),
     }
 }

@@ -276,9 +270,9 @@ pub async fn extract_earnings_events(client: &Client, ticker: &str) -> Result<Ve
     }

     if events.is_empty() {
-        eprintln!("Warning: No earnings events extracted for ticker {}", ticker);
+        logger::log_warn(&format!("Warning: No earnings events extracted for ticker {}", ticker)).await;
     } else {
-        println!("Extracted {} earnings events for {}", events.len(), ticker);
+        logger::log_info(&format!("Extracted {} earnings events for {}", events.len(), ticker)).await;
     }

     Ok(events)
src/corporate/yahoo_company_extraction.js (new file, 67 lines)
@@ -0,0 +1,67 @@
// yahoo_company_extraction.js
// JavaScript extraction script for Yahoo Finance company details
// Used to extract ticker, sector, and exchange from Yahoo Finance search results

(function() {
    try {
        // Check for "No results found" message
        const noDataElement = document.querySelector('.noData');
        if (noDataElement) {
            return { status: 'no_results', ticker: null, sector: null, exchange: null };
        }

        // Find the results table
        const table = document.querySelector('table.markets-table');
        if (!table) {
            return { status: 'no_results', ticker: null, sector: null, exchange: null };
        }

        // Find the first row in tbody
        const firstRow = table.querySelector('tbody tr');
        if (!firstRow) {
            return { status: 'no_results', ticker: null, sector: null, exchange: null };
        }

        // Extract ticker from first column (td:nth-child(1))
        const tickerCell = firstRow.querySelector('td:nth-child(1)');
        const ticker = tickerCell ? tickerCell.textContent.trim() : '';

        if (!ticker) {
            return { status: 'not_found', ticker: null, sector: null, exchange: null };
        }

        // Extract sector from column 4 (td:nth-child(4) > span > div > a)
        const sectorCell = firstRow.querySelector('td:nth-child(4) span div a');
        let sector = sectorCell ? sectorCell.textContent.trim() : '';

        // Normalize empty/invalid values to null
        if (!sector || sector === '-' || sector === 'N/A') {
            sector = null;
        }

        // Extract exchange from column 6 (td:nth-child(6) > span)
        const exchangeCell = firstRow.querySelector('td:nth-child(6) span');
        let exchange = exchangeCell ? exchangeCell.textContent.trim() : '';

        // Normalize empty/invalid values to null
        if (!exchange || exchange === '-' || exchange === 'N/A') {
            exchange = null;
        }

        return {
            status: 'found',
            ticker: ticker,
            sector: sector,
            exchange: exchange
        };

    } catch (error) {
        return {
            status: 'error',
            error_message: error.toString(),
            ticker: null,
            sector: null,
            exchange: null
        };
    }
})();
@@ -1,5 +1,6 @@
 // src/economic/scraper.rs
 use super::types::{EconomicEvent};
+use event_backtest_engine::logger;
 use fantoccini::Client;
 use tokio::time::{sleep, Duration};

@@ -49,6 +50,6 @@ pub async fn extract_events(client: &Client) -> anyhow::Result<Vec<EconomicEvent
             });
         }
     }
-    println!("Extracted {} high-impact events", events.len());
+    logger::log_info(&format!("Extracted {} high-impact events", events.len())).await;
     Ok(events)
 }
src/main.rs
@@ -13,18 +13,17 @@ use scraper::webdriver::ChromeDriverPool;
 use util::directories::DataPaths;
 use util::{logger, opnv};
 use std::sync::Arc;
+use std::sync::atomic::{AtomicBool, Ordering};

 #[tokio::main]
 async fn main() -> Result<()> {
     cleanup_all_proxy_containers().await.ok();

-    // Load configuration from .env
     let config = Config::load().map_err(|err| {
         eprintln!("Failed to load config: {}", err);
         err
     })?;

-    // Initialize paths and logger
     let paths = DataPaths::new(".")?;
     logger::init_debug_logger(paths.logs_dir()).await.ok();
     logger::log_info("=== Event Backtest Engine Started ===").await;
@@ -35,61 +34,36 @@ async fn main() -> Result<()> {
         config.enable_vpn_rotation
     )).await;

-    // === Step 1: Fetch fresh VPNBook credentials and .ovpn files (if rotation enabled) ===
+    // Simple shutdown flag
+    let shutdown_flag = Arc::new(AtomicBool::new(false));
+
+    // === Step 1: Fetch VPNBook configs ===
     let proxy_pool: Option<Arc<DockerVpnProxyPool>> = if config.enable_vpn_rotation {
-        logger::log_info("VPN Rotation Enabled — Fetching latest VPNBook configs").await;
-        // We only need 1 Chrome instance to scrape vpnbook.com (no proxy yet)
-        let temp_pool = Arc::new(ChromeDriverPool::new(1).await?);
+        logger::log_info("VPN Rotation Enabled – Fetching latest VPNBook configs").await;
+        let temp_pool = Arc::new(ChromeDriverPool::new_with_proxy_and_task_limit(config.max_parallel_instances, None, config.max_tasks_per_instance).await?);
         let (username, password, _files) = opnv::fetch_vpnbook_configs(&temp_pool, paths.cache_dir()).await?;

         logger::log_info(&format!("VPNBook credentials → User: {}", username)).await;

-        // Count how many distinct servers (subfolders) we have in cache/openvpn/
         let server_count = std::fs::read_dir(paths.cache_openvpn_dir())?
             .filter(|e| e.as_ref().unwrap().path().is_dir())
             .count();

         if server_count == 0 {
-            logger::log_warn("No VPN servers found — continuing without VPN").await;
+            logger::log_warn("No VPN servers found – continuing without VPN").await;
             None
         } else {
-            logger::log_info(&format!("Found {} VPN servers — starting Docker proxy containers", server_count)).await;
-            let pp = Arc::new(
-                DockerVpnProxyPool::new(paths.cache_openvpn_dir(), username, password).await?
-            );
-
-            // Verify all proxies are working before proceeding
-            logger::log_info("Verifying all proxy connections...").await;
-            let mut all_working = true;
-            for i in 0..pp.num_proxies() {
-                match pp.test_proxy_connection(i).await {
-                    Ok(ip) => {
-                        logger::log_info(&format!(" Proxy {}: working with IP: {}", i + 1, ip)).await;
-                    }
-                    Err(e) => {
-                        logger::log_error(&format!(" Proxy {}: FAILED - {}", i + 1, e)).await;
-                        all_working = false;
-                    }
-                }
-            }
-
-            if !all_working {
-                logger::log_warn("Some proxies failed, but continuing with working ones...").await;
-            } else {
-                logger::log_info("All proxies verified and ready!").await;
-            }
-
+            logger::log_info(&format!("Found {} VPN servers – starting Docker proxy containers", server_count)).await;
+            let pp = Arc::new(DockerVpnProxyPool::new(paths.cache_openvpn_dir(), username, password).await?);
             logger::log_info(&format!("All {} Docker proxy containers started and ready", pp.num_proxies())).await;
             Some(pp)
         }
     } else {
-        logger::log_info("VPN rotation disabled — using direct connection").await;
+        logger::log_info("VPN rotation disabled – using direct connection").await;
         None
     };

-    // === Step 2: Initialize the main ChromeDriver pool (with proxy if enabled) ===
+    // === Step 2: Initialize ChromeDriver pool ===
     let pool_size = config.max_parallel_instances;
     let task_limit = config.max_tasks_per_instance;

@@ -105,17 +79,23 @@ async fn main() -> Result<()> {

     logger::log_info(&format!("ChromeDriver pool ready with {} instances", pool_size)).await;

-    // === Step 3: Graceful Ctrl+C handler ===
+    // === Step 3: Ctrl+C handler ===
     {
+        let shutdown_flag_clone = Arc::clone(&shutdown_flag);
         let pool_clone = Arc::clone(&pool);
         let proxy_clone = proxy_pool.clone();

         tokio::spawn(async move {
             tokio::signal::ctrl_c().await.ok();
+            logger::log_info("Ctrl+C received – shutting down gracefully...").await;
+
+            // Set flag first
+            shutdown_flag_clone.store(true, Ordering::SeqCst);
+
+            // Wait a bit for tasks to notice
+            tokio::time::sleep(tokio::time::Duration::from_secs(2)).await;

-            logger::log_info("Ctrl+C received — shutting down gracefully...").await;
-
-            // Now works: &*pool_clone derefs Arc → &ChromeDriverPool
+            // Cleanup
             if let Err(e) = (&*pool_clone).shutdown().await {
                 logger::log_error(&format!("Error during pool shutdown: {}", e)).await;
             }
@@ -129,31 +109,34 @@ async fn main() -> Result<()> {
             }

             let _ = cleanup_all_proxy_containers().await;

             std::process::exit(0);
         });
     }

-    // === Step 4: Run the actual scraping jobs ===
+    // === Step 4: Run scraping jobs ===
     logger::log_info("--- Starting ECONOMIC data update ---").await;
     economic::run_full_update(&config, &pool).await?;
     logger::log_info("Economic update completed").await;

-    logger::log_info("--- Starting CORPORATE data update ---").await;
-    corporate::run_full_update(&config, &pool).await?;
-    logger::log_info("Corporate update completed").await;
+    if !shutdown_flag.load(Ordering::SeqCst) {
+        logger::log_info("--- Starting CORPORATE data update ---").await;
+        corporate::run_full_update(&config, &pool, &shutdown_flag).await?;
+        logger::log_info("Corporate update completed").await;
+    }

-    // === Step 5: Final cleanup ===
-    logger::log_info("Shutting down ChromeDriver pool...").await;
-    pool.shutdown().await?;
-
-    if let Some(pp) = proxy_pool {
-        logger::log_info("Stopping Docker VPN proxy containers...").await;
-        pp.shutdown().await?;
-        // CLEANUP ANY LEFTOVER CONTAINERS FROM PREVIOUS RUNS
-        cleanup_all_proxy_containers().await.ok();
-    }
-
-    logger::log_info("=== Application finished successfully ===").await;
+    // === Step 5: Final cleanup ===
+    if !shutdown_flag.load(Ordering::SeqCst) {
+        logger::log_info("Shutting down ChromeDriver pool...").await;
+        pool.shutdown().await?;
+
+        if let Some(pp) = proxy_pool {
+            logger::log_info("Stopping Docker VPN proxy containers...").await;
+            pp.shutdown().await?;
+            cleanup_all_proxy_containers().await.ok();
+        }
+
+        logger::log_info("=== Application finished successfully ===").await;
+    }

     Ok(())
 }
@@ -106,10 +106,18 @@ impl DockerVpnProxyPool {
                     working_ports.push(port);
                 }
                 Ok(None) => {
-                    crate::util::logger::log_warn(&format!("✓ Container {} on port {} ready but IP detection failed",
-                        container_name, port)).await;
-                    working_containers.push(container_name);
-                    working_ports.push(port);
+                    let logs = Command::new("docker")
+                        .args(["logs", "--tail", "20", &container_name])
+                        .output()
+                        .await
+                        .ok()
+                        .and_then(|output| String::from_utf8_lossy(&output.stdout).to_string().into());
+
+                    crate::util::logger::log_error(&format!("✗ Container {} on port {} ready but IP detection failed. Logs: {:?}",
+                        container_name, port, logs)).await;
+                    failed_count += 1;
+                    // Clean up failed container
+                    let _ = Self::cleanup_container(&container_name).await;
                 }
                 Err(e) => {
                     // Get container logs to debug
@@ -309,25 +317,6 @@ impl DockerVpnProxyPool {
         true
     }

-    /// Test if a specific proxy is working
-    pub async fn test_proxy_connection(&self, index: usize) -> Result<String> {
-        let port = self.proxy_ports[index];
-        let proxy_url = format!("socks5://localhost:{}", port);
-
-        let client = reqwest::Client::builder()
-            .proxy(reqwest::Proxy::all(&proxy_url)?)
-            .timeout(Duration::from_secs(10))
-            .build()?;
-
-        let response = client.get("http://checkip.amazonaws.com")
-            .send()
-            .await?
-            .text()
-            .await?;
-
-        Ok(response.trim().to_string())
-    }
-
     pub fn get_proxy_url(&self, index: usize) -> String {
         let port = self.proxy_ports[index % self.proxy_ports.len()];
         format!("socks5://localhost:{}", port)
@@ -19,11 +19,15 @@ pub struct ChromeDriverPool {
     semaphore: Arc<Semaphore>,
     /// Optional Docker-based proxy pool (one proxy per Chrome instance)
     proxy_pool: Option<Arc<DockerVpnProxyPool>>,
+    /// Whether rotation is enabled (uses half of instances at a time)
+    rotation_enabled: bool,
+    /// Index for round-robin instance selection (when rotation is enabled)
+    next_instance: Arc<Mutex<usize>>,
 }

 impl ChromeDriverPool {
     /// Creates a new pool without any proxy (direct connection).
-    pub async fn new(pool_size: usize) -> Result<Self> {
+    pub async fn _new(pool_size: usize) -> Result<Self> {
         Self::new_with_proxy_and_task_limit(pool_size, None, 0).await
     }

@@ -40,22 +44,53 @@ impl ChromeDriverPool {
         Self::new_with_proxy_and_task_limit(pool_size, proxy_pool, 0).await
     }

-    /// Full constructor: supports proxy + task limiting.
+    /// Full constructor: supports proxy + task limiting + rotation.
+    ///
+    /// When rotation is enabled, only half of the instances are used at once,
+    /// rotating to the other half when task limits are reached.
+    ///
+    /// The actual pool_size is constrained by:
+    /// - max_parallel_instances from config (pool_size_limit parameter)
+    /// - Available proxies from proxy_pool (if provided)
+    ///
+    /// Uses the minimum of these constraints to determine actual pool size.
     pub async fn new_with_proxy_and_task_limit(
-        pool_size: usize,
+        pool_size_limit: usize,
         proxy_pool: Option<Arc<DockerVpnProxyPool>>,
         max_tasks_per_instance: usize,
     ) -> Result<Self> {
-        let mut instances = Vec::with_capacity(pool_size);
+        // Determine actual pool size based on available resources
+        let actual_pool_size = if let Some(ref pp) = proxy_pool {
+            let available_proxies = pp.num_proxies();
+            pool_size_limit.min(available_proxies)
+        } else {
+            pool_size_limit
+        };
+
+        if actual_pool_size == 0 {
+            return Err(anyhow!("Pool size must be at least 1"));
+        }
+
+        // Rotation is enabled when task limiting is active
+        let rotation_enabled = max_tasks_per_instance > 0;
+
+        let mut instances = Vec::with_capacity(actual_pool_size);
+
         crate::util::logger::log_info(&format!(
-            "Initializing ChromeDriver pool with {} instances{}...",
-            pool_size,
-            if proxy_pool.is_some() { " (each using a unique Docker SOCKS5 proxy)" } else { "" }
+            "Initializing ChromeDriver pool with {} instances{}{}...",
+            actual_pool_size,
+            if proxy_pool.is_some() { " (each using a unique Docker SOCKS5 proxy)" } else { "" },
+            if rotation_enabled { " with rotation enabled" } else { "" }
         ))
         .await;

-        for i in 0..pool_size {
+        if rotation_enabled && actual_pool_size < 2 {
+            crate::util::logger::log_warn(
+                "Rotation enabled but pool has < 2 instances - rotation will be limited"
+            ).await;
+        }
+
+        for i in 0..actual_pool_size {
             let proxy_url = proxy_pool
                 .as_ref()
                 .map(|pp| pp.get_proxy_url(i));
@@ -68,12 +103,22 @@ impl ChromeDriverPool {

         Ok(Self {
             instances,
-            semaphore: Arc::new(Semaphore::new(pool_size)),
+            semaphore: Arc::new(Semaphore::new(actual_pool_size)),
             proxy_pool,
+            rotation_enabled,
+            next_instance: Arc::new(Mutex::new(0)),
         })
     }

     /// Execute a scraping task using an available instance from the pool.
+    ///
+    /// When rotation is enabled:
+    /// - Uses only half of the instances at a time
+    /// - Rotates to the other half when an instance reaches its task limit
+    /// - Cycles through instances in round-robin fashion within the active half
+    ///
+    /// When rotation is disabled:
+    /// - Uses all instances with random selection
     pub async fn execute<T, F, Fut>(&self, url: String, parse: F) -> Result<T>
     where
         T: Send + 'static,
@@ -82,8 +127,81 @@ impl ChromeDriverPool {
     {
         let _permit = self.semaphore.acquire().await.map_err(|_| anyhow!("Pool closed"))?;

-        // Round-robin selection
-        let index = rand::random_range(..self.instances.len());
+        let index = if self.rotation_enabled {
+            // Rotation mode: use only half of instances at a time
+            let total_instances = self.instances.len();
+            let half_size = (total_instances + 1) / 2; // Round up for odd numbers
+
+            let mut next_idx = self.next_instance.lock().await;
+            let base_idx = *next_idx;
+            let mut selected_idx = base_idx;
+            let mut found_in_current_half = false;
+
+            // Try to find an available instance in the current half
+            for offset in 0..half_size {
+                let candidate_idx = (base_idx + offset) % half_size;
+
+                // Check if this instance has reached its task limit
+                let instance = &self.instances[candidate_idx];
+                let guard = instance.lock().await;
+
+                if guard.max_tasks_per_instance == 0 ||
+                   guard.task_count < guard.max_tasks_per_instance {
+                    // This instance is available
+                    *next_idx = (candidate_idx + 1) % half_size;
+                    selected_idx = candidate_idx;
+                    found_in_current_half = true;
+                    drop(guard);
+                    break;
+                } else {
+                    drop(guard);
+                }
+            }
+
+            if !found_in_current_half {
+                // All instances in current half are at limit, switch to other half
+                crate::util::logger::log_info(
+                    "Current half saturated, rotating to other half of instances"
+                ).await;
+
+                let other_half_start = half_size;
+                let other_half_size = total_instances - half_size;
+
+                // Find available instance in other half
+                let mut found_in_other_half = false;
+                for offset in 0..other_half_size {
+                    let candidate_idx = other_half_start + offset;
+
+                    let instance = &self.instances[candidate_idx];
+                    let guard = instance.lock().await;
+
+                    if guard.max_tasks_per_instance == 0 ||
+                       guard.task_count < guard.max_tasks_per_instance {
+                        // Switch to this half for future requests
+                        *next_idx = offset;
+                        selected_idx = candidate_idx;
+                        found_in_other_half = true;
+                        drop(guard);
+                        break;
+                    } else {
+                        drop(guard);
+                    }
+                }
+
+                if !found_in_other_half {
+                    // All instances saturated - use round-robin anyway
+                    selected_idx = *next_idx % total_instances;
+                    *next_idx = (*next_idx + 1) % total_instances;
+                }
+            }
+
+            drop(next_idx);
+            selected_idx
+        } else {
+            // Non-rotation mode: random selection as before
+            rand::random_range(..self.instances.len())
+        };

         let instance = self.instances[index].clone();
         let mut guard = instance.lock().await;

@@ -91,7 +209,8 @@ impl ChromeDriverPool {

         if guard.max_tasks_per_instance > 0 {
             crate::util::logger::log_info(&format!(
-                "Instance task count: {}/{}",
+                "Instance {} task count: {}/{}",
+                index,
                 guard.get_task_count(),
                 guard.max_tasks_per_instance
             ))
@@ -130,6 +249,20 @@ impl ChromeDriverPool {
     pub fn get_number_of_instances(&self) -> usize {
         self.instances.len()
     }
+
+    /// Returns whether rotation is enabled
+    pub fn is_rotation_enabled(&self) -> bool {
+        self.rotation_enabled
+    }
+
+    /// Returns the size of each half when rotation is enabled
+    pub fn get_rotation_half_size(&self) -> usize {
+        if self.rotation_enabled {
+            (self.instances.len() + 1) / 2
+        } else {
+            self.instances.len()
+        }
+    }
 }

 /// Represents a single instance of chromedriver process, optionally bound to a VPN.