Clean up update.rs for the economic (eco) and corporate (corp) updaters

2026-01-09 19:52:26 +01:00
parent 8dd75f7bdf
commit ba841248f0
11 changed files with 55 additions and 296 deletions

View File

@@ -48,7 +48,7 @@ pub async fn aggregate_best_price_data(paths: &DataPaths, lei: &str) -> anyhow::
         if !prices_path.exists() { continue; }
         let content = tokio::fs::read_to_string(&prices_path).await?;
-        let mut prices: Vec<CompanyPrice> = serde_json::from_str(&content)?;
+        let prices: Vec<CompanyPrice> = serde_json::from_str(&content)?;
         if !prices.is_empty() {
             sources_used.insert(source_ticker.clone());
@@ -80,8 +80,8 @@ pub async fn aggregate_best_price_data(paths: &DataPaths, lei: &str) -> anyhow::
             p.date.clone()
         };
-        // Convert to USD immediately
-        let usd_rate = super::fx::get_usd_rate(&p.currency).await.unwrap_or(1.0);
+        // Convert to USD immediately DUMMY -------------------------------------------------------------------------------------------
+        let usd_rate = 0.1;
         let mut p_usd = p.clone();
         p_usd.open *= usd_rate;
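Note: with fx.rs removed (see the deleted file below), the conversion above now multiplies each price by a hardcoded placeholder rate of 0.1 instead of a real FX quote. A minimal sketch of what a cached-rate version of this step could look like, assuming rates are preloaded into a plain map; the struct here is trimmed to the fields used and the function name is hypothetical:

use std::collections::HashMap;

// Trimmed stand-in for CompanyPrice; only the fields this sketch touches.
#[derive(Clone)]
struct CompanyPrice { date: String, currency: String, open: f64 }

// Hypothetical replacement for the removed fx lookup: rates keyed by currency code.
fn to_usd(p: &CompanyPrice, rates: &HashMap<String, f64>) -> CompanyPrice {
    // Unknown currencies fall back to 1.0, mirroring the old unwrap_or(1.0).
    let usd_rate = rates.get(&p.currency).copied().unwrap_or(1.0);
    let mut p_usd = p.clone();
    p_usd.open *= usd_rate; // the remaining OHLC fields would be scaled the same way
    p_usd
}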

View File

@@ -1,51 +0,0 @@
// src/corporate/fx.rs
use std::collections::HashMap;
use reqwest;
use serde_json::Value;
use tokio::fs;
use std::path::Path;
static FX_CACHE_PATH: &str = "fx_rates.json";
pub async fn get_usd_rate(currency: &str) -> anyhow::Result<f64> {
if currency == "USD" {
return Ok(1.0);
}
let mut cache: HashMap<String, (f64, String)> = if Path::new(FX_CACHE_PATH).exists() {
let content = fs::read_to_string(FX_CACHE_PATH).await?;
serde_json::from_str(&content).unwrap_or_default()
} else {
HashMap::new()
};
let today = chrono::Local::now().format("%Y-%m-%d").to_string();
if let Some((rate, date)) = cache.get(currency) {
if date == &today {
return Ok(*rate);
}
}
let symbol = format!("{}USD=X", currency);
let url = format!("https://query1.finance.yahoo.com/v8/finance/chart/{}?range=1d&interval=1d", symbol);
let json: Value = reqwest::Client::new()
.get(&url)
.header("User-Agent", "Mozilla/5.0")
.send()
.await?
.json()
.await?;
let close = json["chart"]["result"][0]["meta"]["regularMarketPrice"]
.as_f64()
.or_else(|| json["chart"]["result"][0]["indicators"]["quote"][0]["close"][0].as_f64())
.unwrap_or(1.0);
let rate = if currency == "JPY" || currency == "KRW" { close } else { 1.0 / close }; // inverse pairs
cache.insert(currency.to_string(), (rate, today.clone()));
let _ = fs::write(FX_CACHE_PATH, serde_json::to_string_pretty(&cache)?).await;
Ok(rate)
}

View File

@@ -1,7 +1,6 @@
 // src/corporate/helpers.rs
 use super::types::*;
 use chrono::{Local, NaiveDate};
-use std::collections::{HashMap, HashSet};
 use rand::rngs::StdRng;
 use rand::prelude::{Rng, SeedableRng, IndexedRandom};
@@ -74,7 +73,7 @@ pub fn parse_yahoo_date(s: &str) -> anyhow::Result<NaiveDate> {
 /// Send-safe random range
 pub fn random_range(min: u64, max: u64) -> u64 {
     let mut rng = StdRng::from_rng(&mut rand::rng());
-    rng.gen_range(min..max)
+    rng.random_range(min..max)
 }
 /// Send-safe random choice
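The change above tracks the rand 0.9 API, where Rng::gen_range was renamed to Rng::random_range (and thread_rng() to rand::rng()). Seeding a local StdRng keeps the helper Send-safe, so it can be used inside futures that move across threads. A small usage sketch; the jitter call site is hypothetical and assumes tokio:

// Usage sketch: jitter a polling delay by a random number of milliseconds.
async fn polite_pause() {
    let wait_ms = random_range(250, 750);
    tokio::time::sleep(std::time::Duration::from_millis(wait_ms)).await;
}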

View File

@@ -4,9 +4,8 @@ pub mod scraper;
 pub mod storage;
 pub mod helpers;
 pub mod aggregation;
-pub mod fx;
 pub mod openfigi;
-pub mod yahoo;
+pub mod yahoo_company_extraction;
 pub mod page_validation;
 pub mod atomic_writer;
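Read together, the hunk leaves the corporate module root (mod.rs) declaring the following modules; this is a reconstruction limited to the lines shown in the hunk, and anything outside its context is unchanged:

pub mod scraper;
pub mod storage;
pub mod helpers;
pub mod aggregation;
pub mod openfigi;
pub mod yahoo_company_extraction;
pub mod page_validation;
pub mod atomic_writer;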

View File

@@ -5,11 +5,9 @@ use crate::util::logger;
 use tokio::fs;
 use tokio::io::AsyncWriteExt;
-use chrono::{Datelike, NaiveDate};
 use std::collections::HashMap;
 use std::path::{PathBuf, Path};
-const BATCH_SIZE: usize = 500;
 /// Lightweight index entry - only metadata, no full event data
 #[derive(Debug, Clone)]
@@ -55,162 +53,6 @@ pub async fn build_event_index(paths: &DataPaths) -> anyhow::Result<Vec<EventInd
     Ok(index)
 }
/// Load specific event by key (only loads its file)
pub async fn lookup_event_by_key(
key: &str,
index: &[EventIndex]
) -> anyhow::Result<Option<CompanyEvent>> {
let entry = index.iter().find(|e| e.key == key);
if let Some(entry) = entry {
let content = fs::read_to_string(&entry.file_path).await?;
let events: Vec<CompanyEvent> = serde_json::from_str(&content)?;
Ok(events.into_iter().find(|e| event_key(e) == key))
} else {
Ok(None)
}
}
/// Stream events file by file with callback
pub async fn stream_events_with_callback<F>(
paths: &DataPaths,
mut callback: F
) -> anyhow::Result<usize>
where
F: FnMut(CompanyEvent) -> anyhow::Result<()>,
{
let dir = paths.corporate_events_dir();
if !dir.exists() {
return Ok(0);
}
let mut total = 0;
let mut entries = fs::read_dir(dir).await?;
while let Some(entry) = entries.next_entry().await? {
let path = entry.path();
if path.extension().and_then(|s| s.to_str()) == Some("json") {
let name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
if name.starts_with("events_") {
let content = fs::read_to_string(&path).await?;
let events: Vec<CompanyEvent> = serde_json::from_str(&content)?;
for event in events {
callback(event)?;
total += 1;
}
tokio::task::yield_now().await;
}
}
}
logger::log_info(&format!("Corporate Storage: Streamed {} events", total)).await;
Ok(total)
}
/// Save events organized by month (accepts Vec, not HashMap)
pub async fn save_optimized_events(
paths: &DataPaths,
events: Vec<CompanyEvent>
) -> anyhow::Result<()> {
let dir = paths.corporate_events_dir();
fs::create_dir_all(dir).await?;
logger::log_info("Corporate Storage: Removing old event files...").await;
let mut removed_count = 0;
let mut entries = fs::read_dir(dir).await?;
while let Some(entry) = entries.next_entry().await? {
let path = entry.path();
let name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
if name.starts_with("events_") && path.extension().map(|e| e == "json").unwrap_or(false) {
fs::remove_file(&path).await?;
removed_count += 1;
}
}
logger::log_info(&format!("Corporate Storage: Removed {} old files", removed_count)).await;
let total_events = events.len();
let mut sorted = events;
sorted.sort_by(|a, b| {
a.ticker.cmp(&b.ticker).then(a.date.cmp(&b.date))
});
let mut by_month: HashMap<String, Vec<CompanyEvent>> = HashMap::new();
for chunk in sorted.chunks(BATCH_SIZE) {
for e in chunk {
if let Ok(d) = NaiveDate::parse_from_str(&e.date, "%Y-%m-%d") {
let key = format!("{}-{:02}", d.year(), d.month());
by_month.entry(key).or_default().push(e.clone());
}
}
tokio::task::yield_now().await;
}
for (month, list) in by_month {
let path = dir.join(format!("events_{}.json", month));
fs::write(&path, serde_json::to_string_pretty(&list)?).await?;
logger::log_info(&format!("Saved {} events for month {}", list.len(), month)).await;
}
logger::log_info(&format!("Saved {} total events", total_events)).await;
Ok(())
}
pub async fn save_changes(
paths: &DataPaths,
changes: &[CompanyEventChange]
) -> anyhow::Result<()> {
if changes.is_empty() {
logger::log_info("Corporate Storage: No changes to save").await;
return Ok(());
}
let dir = paths.corporate_changes_dir();
fs::create_dir_all(dir).await?;
let mut by_month: HashMap<String, Vec<CompanyEventChange>> = HashMap::new();
for c in changes {
if let Ok(d) = NaiveDate::parse_from_str(&c.date, "%Y-%m-%d") {
let key = format!("{}-{:02}", d.year(), d.month());
by_month.entry(key).or_default().push(c.clone());
}
}
for (month, list) in by_month {
let path = dir.join(format!("changes_{}.json", month));
let mut all = if path.exists() {
let s = fs::read_to_string(&path).await?;
serde_json::from_str(&s).unwrap_or_default()
} else {
vec![]
};
all.extend(list.clone());
fs::write(&path, serde_json::to_string_pretty(&all)?).await?;
}
Ok(())
}
pub async fn save_prices_for_ticker(
paths: &DataPaths,
ticker: &str,
timeframe: &str,
mut prices: Vec<CompanyPrice>
) -> anyhow::Result<()> {
let base_dir = paths.corporate_prices_dir();
let company_dir = base_dir.join(ticker.replace(".", "_"));
let timeframe_dir = company_dir.join(timeframe);
fs::create_dir_all(&timeframe_dir).await?;
let path = timeframe_dir.join("prices.json");
prices.sort_by_key(|p| (p.date.clone(), p.time.clone()));
fs::write(&path, serde_json::to_string_pretty(&prices)?).await?;
Ok(())
}
 pub fn get_company_dir(paths: &DataPaths, lei: &str) -> PathBuf {
     paths.corporate_prices_dir().join(lei)
 }
@@ -230,48 +72,6 @@ pub async fn ensure_company_dirs(paths: &DataPaths, isin: &str) -> anyhow::Resul
     Ok(())
 }
pub async fn save_available_exchanges(
paths: &DataPaths,
isin: &str,
exchanges: Vec<AvailableExchange>
) -> anyhow::Result<()> {
let dir = get_company_dir(paths, isin);
fs::create_dir_all(&dir).await?;
let path = dir.join("available_exchanges.json");
fs::write(&path, serde_json::to_string_pretty(&exchanges)?).await?;
Ok(())
}
pub async fn load_available_exchanges(
paths: &DataPaths,
lei: &str
) -> anyhow::Result<Vec<AvailableExchange>> {
let path = get_company_dir(paths, lei).join("available_exchanges.json");
if path.exists() {
let content = fs::read_to_string(&path).await?;
Ok(serde_json::from_str(&content)?)
} else {
Ok(vec![])
}
}
pub async fn save_prices_by_source(
paths: &DataPaths,
lei: &str,
source_ticker: &str,
timeframe: &str,
prices: Vec<CompanyPrice>,
) -> anyhow::Result<()> {
let source_safe = source_ticker.replace(".", "_").replace("/", "_");
let dir = get_company_dir(paths, lei).join(timeframe).join(&source_safe);
fs::create_dir_all(&dir).await?;
let path = dir.join("prices.json");
let mut prices = prices;
prices.sort_by_key(|p| (p.date.clone(), p.time.clone()));
fs::write(&path, serde_json::to_string_pretty(&prices)?).await?;
Ok(())
}
 /// Stream companies to JSONL incrementally
 pub async fn save_companies_to_jsonl_streaming(
     paths: &DataPaths,

View File

@@ -1,5 +1,5 @@
 // src/corporate/update.rs
-use super::{scraper::*, storage::*, openfigi::*};
+use super::{scraper::*, openfigi::*};
 use crate::config::Config;
 use crate::corporate::update_companies::build_companies_jsonl_streaming_parallel;
 use crate::corporate::update_companies_cleanse::{companies_yahoo_cleansed_low_profile, companies_yahoo_cleansed_no_data};
@@ -22,7 +22,7 @@ pub async fn run_full_update(
     pool: &Arc<ChromeDriverPool>,
     shutdown_flag: &Arc<AtomicBool>,
 ) -> anyhow::Result<()> {
-    logger::log_info("=== Corporate Update (STREAMING MODE WITH DATA INTEGRITY) ===").await;
+    logger::log_info("=== Corporate Update ===").await;
     let paths = DataPaths::new(".")?;
@@ -42,7 +42,7 @@ pub async fn run_full_update(
         logger::log_warn("Shutdown detected after GLEIF download").await;
         return Ok(());
     }
     if !shutdown_flag.load(Ordering::SeqCst) {
         logger::log_info("Step 2: Loading OpenFIGI metadata...").await;
         load_figi_type_lists().await.ok();
@@ -144,15 +144,7 @@ pub async fn run_full_update(
     }
     if !shutdown_flag.load(Ordering::SeqCst) {
-        logger::log_info("Step 11: Processing events (using index)...").await;
-        let _event_index = build_event_index(&paths).await?;
-        logger::log_info("   ✓ Event index built").await;
-    } else {
-        logger::log_warn("Shutdown detected, skipping event index build").await;
-    }
-    if !shutdown_flag.load(Ordering::SeqCst) {
-        logger::log_info("Step 12: Collecting FX rates...").await;
+        logger::log_info("Step 11: Collecting FX rates...").await;
         let proxy_pool = pool.get_proxy_pool()
             .ok_or_else(|| anyhow::anyhow!("ChromeDriverPool must have proxy rotation"))?;
@@ -166,14 +158,14 @@ pub async fn run_full_update(
     }
     if !shutdown_flag.load(Ordering::SeqCst) {
-        logger::log_info("Step 13: Collecting exchange information...").await;
+        logger::log_info("Step 12: Collecting exchange information...").await;
         let exchange_count = collect_and_save_exchanges(&paths).await?;
         logger::log_info(&format!(" ✓ Collected {} exchanges", exchange_count)).await;
     } else {
         logger::log_warn("Shutdown detected, skipping exchange collection").await;
     }
-    logger::log_info(" Corporate update complete").await;
+    logger::log_info("=== Corporate update complete === ").await;
     Ok(())
 }

View File

@@ -1,5 +1,5 @@
 // src/corporate/update_companies.rs
-use super::{types::*, yahoo::*, helpers::*};
+use super::{types::*, yahoo_company_extraction::*, helpers::*};
 use crate::util::directories::DataPaths;
 use crate::util::logger;
 use crate::scraper::webdriver::ChromeDriverPool;

View File

@@ -2,11 +2,11 @@
 use super::{scraper::*, storage::*, helpers::*, types::*};
 use crate::{config::Config, scraper::webdriver::{ScrapeTask, ChromeDriverPool}, util::directories::DataPaths, util::logger};
 use chrono::{Local};
-use std::sync::Arc;
+use std::sync::{Arc, atomic::{AtomicBool, Ordering}};
 use std::collections::HashMap;
 /// Runs the full update for economic data using streaming to minimize memory usage
-pub async fn run_full_update(config: &Config, pool: &Arc<ChromeDriverPool>) -> anyhow::Result<()> {
+pub async fn run_full_update(config: &Config, pool: &Arc<ChromeDriverPool>, shutdown_flag: &Arc<AtomicBool>) -> anyhow::Result<()> {
     let paths = DataPaths::new(".")?;
     logger::log_info("Economic Update: Initializing...").await;
@@ -14,17 +14,23 @@ pub async fn run_full_update(config: &Config, pool: &Arc<ChromeDriverPool>) -> a
     let today_str = chrono::Local::now().date_naive().format("%Y-%m-%d").to_string();
     let end_date = config.target_end_date();
+    logger::log_info("=== Economic Update ===").await;
     // Step 1: Build lightweight index instead of loading all events
-    logger::log_info("Economic Update: Building event index...").await;
+    logger::log_info("Step 1: Building event index...").await;
     let chunks = scan_existing_chunks(&paths).await?;
     let event_index = build_event_index(&chunks).await?;
-    logger::log_info(&format!("Economic Update: Indexed {} events from {} chunks",
+    logger::log_info(&format!("   Economic Update: Indexed {} events from {} chunks",
         event_index.len(), chunks.len())).await;
+    if shutdown_flag.load(Ordering::SeqCst) {
+        logger::log_warn("Shutdown detected after GLEIF download").await;
+        return Ok(());
+    }
     // Step 2: Determine start date
     let start_date = if event_index.is_empty() {
-        logger::log_warn("Economic Update: No existing events found, starting from config date").await;
+        logger::log_warn("Step 2: No existing events found, starting from config date").await;
         config.economic_start_date.clone()
     } else {
         // Find the latest date in the index
@@ -35,7 +41,7 @@ pub async fn run_full_update(config: &Config, pool: &Arc<ChromeDriverPool>) -> a
             .unwrap_or(today_str.clone());
         if max_date >= today_str {
-            logger::log_info("Economic Update: Events exist for today, starting from today").await;
+            logger::log_info("   Events exist for today, starting from today").await;
             today_str.clone()
         } else {
             let next = chrono::NaiveDate::parse_from_str(&max_date, "%Y-%m-%d")
@@ -43,34 +49,46 @@ pub async fn run_full_update(config: &Config, pool: &Arc<ChromeDriverPool>) -> a
                 .and_then(|d| d.succ_opt())
                 .map(|d| d.format("%Y-%m-%d").to_string())
                 .unwrap_or(today_str.clone());
-            logger::log_info(&format!("Economic Update: Resuming from: {}", next)).await;
+            logger::log_info(&format!("   Resuming from: {}", next)).await;
             next
         }
     };
-    logger::log_info(&format!("Economic Update: Scraping events from {}{}", start_date, end_date)).await;
+    if shutdown_flag.load(Ordering::SeqCst) {
+        logger::log_warn("Shutdown detected after GLEIF download").await;
+        return Ok(());
+    }
     // Step 3: Scrape new events in batches
+    logger::log_info(&format!("Step 3: Scraping events from {}{}", start_date, end_date)).await;
     let new_events = scrape_all_economic_events(&start_date, &end_date, pool).await?;
-    logger::log_info(&format!("Economic Update: Scraped {} new events", new_events.len())).await;
+    logger::log_info(&format!("   Scraped {} new events", new_events.len())).await;
+    if shutdown_flag.load(Ordering::SeqCst) {
+        logger::log_warn("Shutdown detected after GLEIF download").await;
+        return Ok(());
+    }
     // Step 4: Process events in streaming fashion
+    logger::log_info(&format!("Step 4: Detecting changes")).await;
     let (changes, updated_events) = process_events_streaming(&chunks, &new_events, &today_str).await?;
-    logger::log_info(&format!("Economic Update: Detected {} changes", changes.len())).await;
+    logger::log_info(&format!("   Detected {} changes", changes.len())).await;
     if !changes.is_empty() {
-        logger::log_info(&format!("Economic Update: Saving {} changes to log", changes.len())).await;
+        logger::log_info(&format!("   Saving {} changes to log", changes.len())).await;
         save_changes(&paths, &changes).await?;
-        logger::log_info("Economic Update: Changes saved successfully").await;
+        logger::log_info("   Changes saved successfully").await;
     }
+    if shutdown_flag.load(Ordering::SeqCst) {
+        logger::log_warn("Shutdown detected after GLEIF download").await;
+        return Ok(());
+    }
     // Step 5: Save consolidated events
-    logger::log_info(&format!("Economic Update: Saving {} total events to chunks", updated_events.len())).await;
+    logger::log_info(&format!("Step 5: Saving {} total events to chunks", updated_events.len())).await;
     save_optimized_chunks(&paths, updated_events).await?;
-    logger::log_info(&format!("✓ Economic update complete — {} changes detected", changes.len())).await;
+    logger::log_info(&format!("   ✓ Economic update complete — {} changes detected", changes.len())).await;
     Ok(())
 }
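With this hunk the economic updater takes the same shutdown_flag the corporate updater already used and polls it between steps. A minimal, self-contained sketch of that cooperative-shutdown pattern, assuming tokio with the signal and macros features and the anyhow crate; the step names and messages below are placeholders, not the project's own:

use std::sync::{Arc, atomic::{AtomicBool, Ordering}};

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    let shutdown_flag = Arc::new(AtomicBool::new(false));

    // Flip the flag on Ctrl-C; long-running steps notice it at their next checkpoint.
    let flag = shutdown_flag.clone();
    tokio::spawn(async move {
        let _ = tokio::signal::ctrl_c().await;
        flag.store(true, Ordering::SeqCst);
    });

    for step in ["scrape", "detect changes", "save"] {
        if shutdown_flag.load(Ordering::SeqCst) {
            eprintln!("Shutdown detected, stopping before step: {step}");
            return Ok(());
        }
        // ... run the step ...
    }
    Ok(())
}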

View File

@@ -1,4 +1,4 @@
-// src/forex/update_rates.rs
+// src/forex/update_forex.rs
 use crate::config::Config;
 use crate::util::directories::DataPaths;
 use crate::util::logger;

View File

@@ -229,9 +229,11 @@ async fn main() -> Result<()> {
 }
 // === Step 4: Run scraping jobs ===
-    logger::log_info("--- Starting ECONOMIC data update ---").await;
-    economic::run_full_update(&config, &pool).await?;
-    logger::log_info("Economic update completed").await;
+    if !shutdown_flag.load(Ordering::SeqCst) {
+        logger::log_info("--- Starting ECONOMIC data update ---").await;
+        economic::run_full_update(&config, &pool, &shutdown_flag).await?;
+        logger::log_info("Economic update completed").await;
+    }
     if !shutdown_flag.load(Ordering::SeqCst) {
         logger::log_info("--- Starting CORPORATE data update ---").await;