Files
WebScraper/src/corporate/storage.rs

339 lines
11 KiB
Rust

// src/corporate/storage.rs
use super::{types::*, helpers::*};
use crate::util::directories::DataPaths;
use crate::util::logger;
use tokio::fs;
use tokio::io::AsyncWriteExt;
use chrono::{Datelike, NaiveDate};
use std::collections::{HashMap};
use std::path::{PathBuf, Path};
/// Number of events processed between cooperative yields while
/// `save_optimized_events` groups events by month.
const BATCH_SIZE: usize = 500; // Process 500 events at a time
/// Streams every stored corporate event through `callback`, one monthly file at a time.
///
/// Only files named `events_YYYY-MM.json` (the names `save_optimized_events` writes)
/// inside `paths.corporate_events_dir()` are read. Each file is parsed as a JSON array
/// of `CompanyEvent` and its events are passed to `callback` in file order. Files are
/// visited in whatever order the directory listing yields them.
///
/// # Returns
/// The number of events passed to `callback`, or `0` when the directory does not exist.
///
/// # Errors
/// Stops at the first directory/file read error, JSON parse error, or error returned
/// by `callback`. Events already delivered before the failure are not undone.
pub async fn load_existing_events_streaming(
    paths: &DataPaths,
    mut callback: impl FnMut(CompanyEvent) -> anyhow::Result<()>,
) -> anyhow::Result<usize> {
    /// Returns true for names of the form `events_YYYY-MM.json`.
    fn is_monthly_events_file(name: &str) -> bool {
        name.strip_prefix("events_")
            .and_then(|rest| rest.strip_suffix(".json"))
            .map_or(false, |month| {
                let bytes = month.as_bytes();
                bytes.len() == 7
                    && bytes[4] == b'-'
                    && bytes[..4].iter().all(u8::is_ascii_digit)
                    && bytes[5..].iter().all(u8::is_ascii_digit)
            })
    }

    let dir = paths.corporate_events_dir();
    if !dir.exists() {
        logger::log_info("Corporate Storage: No existing events directory found").await;
        return Ok(0);
    }

    let mut total = 0;
    let mut entries = fs::read_dir(dir).await?;
    while let Some(entry) = entries.next_entry().await? {
        let path = entry.path();
        let is_event_file = path
            .file_name()
            .and_then(|n| n.to_str())
            .map_or(false, is_monthly_events_file);
        if !is_event_file {
            continue;
        }

        let content = fs::read_to_string(&path).await?;
        let events: Vec<CompanyEvent> = serde_json::from_str(&content)?;
        for event in events {
            callback(event)?;
            total += 1;
        }

        // Let other tasks run between files.
        tokio::task::yield_now().await;
    }

    logger::log_info(&format!("Corporate Storage: Streamed {} events", total)).await;
    Ok(total)
}
/// Lightweight index entry for one stored event: enough to find the file that
/// holds the event without keeping the full `CompanyEvent` in memory.
#[derive(Debug, Clone)]
pub struct EventIndex {
    /// Identifier produced by `event_key` for the indexed event.
    pub key: String,
    /// Copy of the event's `ticker` field.
    pub ticker: String,
    /// Copy of the event's `date` field.
    pub date: String,
    /// Path of the JSON file the event was read from.
    pub file_path: PathBuf,
}
/// Builds an [`EventIndex`] entry for every event stored in the monthly event files.
///
/// Only files named `events_YYYY-MM.json` (the names `save_optimized_events` writes)
/// inside `paths.corporate_events_dir()` are read. Each file is parsed in full, but
/// only the key, ticker, date and source path of each event are retained.
///
/// # Returns
/// The index entries, or an empty vector when the directory does not exist.
///
/// # Errors
/// Fails on the first directory/file read error or JSON parse error.
pub async fn build_event_index(paths: &DataPaths) -> anyhow::Result<Vec<EventIndex>> {
    /// Returns true for names of the form `events_YYYY-MM.json`.
    fn is_monthly_events_file(name: &str) -> bool {
        name.strip_prefix("events_")
            .and_then(|rest| rest.strip_suffix(".json"))
            .map_or(false, |month| {
                let bytes = month.as_bytes();
                bytes.len() == 7
                    && bytes[4] == b'-'
                    && bytes[..4].iter().all(u8::is_ascii_digit)
                    && bytes[5..].iter().all(u8::is_ascii_digit)
            })
    }

    let dir = paths.corporate_events_dir();
    if !dir.exists() {
        return Ok(Vec::new());
    }

    let mut index = Vec::new();
    let mut entries = fs::read_dir(dir).await?;
    while let Some(entry) = entries.next_entry().await? {
        let path = entry.path();
        let is_event_file = path
            .file_name()
            .and_then(|n| n.to_str())
            .map_or(false, is_monthly_events_file);
        if !is_event_file {
            continue;
        }

        let content = fs::read_to_string(&path).await?;
        let events: Vec<CompanyEvent> = serde_json::from_str(&content)?;
        // The events are owned here, so their fields are moved rather than cloned.
        index.extend(events.into_iter().map(|event| EventIndex {
            key: event_key(&event),
            ticker: event.ticker,
            date: event.date,
            file_path: path.clone(),
        }));
    }

    logger::log_info(&format!("Corporate Storage: Built index with {} entries", index.len())).await;
    Ok(index)
}
/// Resolves `key` through `index` and loads the matching event from disk.
///
/// The first index entry whose `key` equals `key` decides which file is read. That
/// file is parsed as a JSON array of `CompanyEvent`, and the first event whose
/// `event_key` equals `key` is returned.
///
/// Returns `Ok(None)` when the key is not in the index, or when the indexed file no
/// longer contains an event with that key.
///
/// # Errors
/// Fails when the indexed file cannot be read or parsed.
pub async fn lookup_event_by_key(
    key: &str,
    index: &[EventIndex],
) -> anyhow::Result<Option<CompanyEvent>> {
    let hit = match index.iter().find(|candidate| candidate.key == key) {
        Some(hit) => hit,
        None => return Ok(None),
    };

    let raw = fs::read_to_string(&hit.file_path).await?;
    let stored: Vec<CompanyEvent> = serde_json::from_str(&raw)?;
    let found = stored.into_iter().find(|event| event_key(event) == key);
    Ok(found)
}
/// Replaces the stored event files with `events`, grouped into one file per month.
///
/// Steps:
/// 1. Ensure `paths.corporate_events_dir()` exists and delete every existing
///    `events_*.json` file in it.
/// 2. Sort `events` by ticker, then date.
/// 3. Group events by the `YYYY-MM` of their `date` (parsed as `%Y-%m-%d`).
/// 4. Write each group as pretty-printed JSON to `events_YYYY-MM.json`.
///
/// Events whose `date` does not parse are not written. Their count is logged, and the
/// final summary reports only the events actually saved.
///
/// NOTE: old files are removed before the new ones are written, so a failure while
/// writing leaves the directory holding only the months written so far.
///
/// # Errors
/// Fails on any directory, delete, serialization or write error.
pub async fn save_optimized_events(
    paths: &DataPaths,
    events: Vec<CompanyEvent>, // Changed from HashMap to Vec
) -> anyhow::Result<()> {
    let dir = paths.corporate_events_dir();
    fs::create_dir_all(dir).await?;

    logger::log_info("Corporate Storage: Removing old event files...").await;
    let mut removed_count = 0;
    let mut entries = fs::read_dir(dir).await?;
    while let Some(entry) = entries.next_entry().await? {
        let path = entry.path();
        let name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
        if name.starts_with("events_") && path.extension().map(|e| e == "json").unwrap_or(false) {
            fs::remove_file(&path).await?;
            removed_count += 1;
        }
    }
    logger::log_info(&format!("Corporate Storage: Removed {} old event files", removed_count)).await;

    let total_events = events.len();
    let mut sorted = events;
    sorted.sort_by(|a, b| a.ticker.cmp(&b.ticker).then_with(|| a.date.cmp(&b.date)));

    // Move each event into its month bucket (no clone), yielding every BATCH_SIZE
    // events so a large input does not monopolize the executor.
    let mut by_month: HashMap<String, Vec<CompanyEvent>> = HashMap::new();
    let mut skipped: usize = 0;
    for (i, event) in sorted.into_iter().enumerate() {
        match NaiveDate::parse_from_str(&event.date, "%Y-%m-%d") {
            Ok(d) => {
                let key = format!("{}-{:02}", d.year(), d.month());
                by_month.entry(key).or_default().push(event);
            }
            Err(_) => skipped += 1,
        }
        if (i + 1) % BATCH_SIZE == 0 {
            tokio::task::yield_now().await;
        }
    }

    if skipped > 0 {
        logger::log_info(&format!(
            "Corporate Storage: Skipped {} events with unparseable dates",
            skipped
        ))
        .await;
    }

    // Write months in sorted order so the log output is deterministic.
    let mut months: Vec<(String, Vec<CompanyEvent>)> = by_month.into_iter().collect();
    months.sort_by(|a, b| a.0.cmp(&b.0));
    let total_months = months.len();

    for (month, list) in months {
        let path = dir.join(format!("events_{}.json", month));
        fs::write(&path, serde_json::to_string_pretty(&list)?).await?;
        logger::log_info(&format!("Corporate Storage: Saved {} events for month {}", list.len(), month)).await;
    }

    logger::log_info(&format!(
        "Corporate Storage: Saved {} total events in {} month files",
        total_events - skipped,
        total_months
    ))
    .await;
    Ok(())
}
/// Appends `changes` to the per-month change history files.
///
/// Changes are grouped by the `YYYY-MM` of their `date` (parsed as `%Y-%m-%d`). For
/// each month the existing `changes_YYYY-MM.json` in `paths.corporate_changes_dir()`
/// is read (if present), the new changes are appended, and the file is rewritten as
/// pretty-printed JSON.
///
/// Changes whose `date` does not parse are not written; their count is logged. If an
/// existing month file cannot be parsed, that is logged and the file is rewritten
/// with only the new changes (the previous behaviour, no longer silent).
///
/// # Errors
/// Fails on any directory, read, serialization or write error.
pub async fn save_changes(paths: &DataPaths, changes: &[CompanyEventChange]) -> anyhow::Result<()> {
    if changes.is_empty() {
        logger::log_info("Corporate Storage: No changes to save").await;
        return Ok(());
    }

    let dir = paths.corporate_changes_dir();
    fs::create_dir_all(dir).await?;
    logger::log_info(&format!("Corporate Storage: Saving {} changes", changes.len())).await;

    let mut by_month: HashMap<String, Vec<CompanyEventChange>> = HashMap::new();
    let mut skipped: usize = 0;
    for c in changes {
        match NaiveDate::parse_from_str(&c.date, "%Y-%m-%d") {
            Ok(d) => {
                let key = format!("{}-{:02}", d.year(), d.month());
                by_month.entry(key).or_default().push(c.clone());
            }
            Err(_) => skipped += 1,
        }
    }

    if skipped > 0 {
        logger::log_info(&format!(
            "Corporate Storage: Skipped {} changes with unparseable dates",
            skipped
        ))
        .await;
    }

    for (month, list) in by_month {
        let path = dir.join(format!("changes_{}.json", month));
        let mut all: Vec<CompanyEventChange> = if path.exists() {
            let s = fs::read_to_string(&path).await?;
            match serde_json::from_str(&s) {
                Ok(existing) => existing,
                Err(err) => {
                    // Same fallback as before (start from empty), but reported.
                    logger::log_info(&format!(
                        "Corporate Storage: Could not parse existing {:?} ({}); its previous contents will be replaced",
                        path, err
                    ))
                    .await;
                    Vec::new()
                }
            }
        } else {
            Vec::new()
        };

        let added = list.len();
        all.extend(list);
        fs::write(&path, serde_json::to_string_pretty(&all)?).await?;
        logger::log_info(&format!("Corporate Storage: Saved {} changes for month {}", added, month)).await;
    }

    logger::log_info("Corporate Storage: All changes saved successfully").await;
    Ok(())
}
/// Writes `prices` for one ticker and timeframe to
/// `<corporate_prices_dir>/<ticker with '.' replaced by '_'>/<timeframe>/prices.json`.
///
/// The prices are sorted by `(date, time)` and written as pretty-printed JSON,
/// replacing any existing file. The directory is created if needed.
///
/// NOTE(review): only `.` is replaced in `ticker`; `save_prices_by_source` also
/// replaces `/`. A ticker containing `/` would create nested directories here.
/// Confirm whether readers depend on the current layout before aligning the two.
///
/// # Errors
/// Fails on directory creation, serialization or write errors.
pub async fn save_prices_for_ticker(
    paths: &DataPaths,
    ticker: &str,
    timeframe: &str,
    mut prices: Vec<CompanyPrice>,
) -> anyhow::Result<()> {
    let timeframe_dir = paths
        .corporate_prices_dir()
        .join(ticker.replace(".", "_"))
        .join(timeframe);
    fs::create_dir_all(&timeframe_dir).await?;

    // Compare fields by reference; the previous `sort_by_key` cloned both fields on
    // every key evaluation. Same ordering (date, then time), and still a stable sort.
    prices.sort_by(|a, b| a.date.cmp(&b.date).then_with(|| a.time.cmp(&b.time)));

    let json = serde_json::to_string_pretty(&prices)?;
    fs::write(timeframe_dir.join("prices.json"), json).await?;
    Ok(())
}
/// Returns the directory under the corporate prices root that is named after the
/// identifier passed as `lei`. Nothing is created on disk.
pub fn get_company_dir(paths: &DataPaths, lei: &str) -> PathBuf {
    let prices_root = paths.corporate_prices_dir();
    prices_root.join(lei)
}
/// Creates the directory layout for one company under its `get_company_dir` root:
/// the root itself, `5min`, `daily`, `aggregated/5min` and `aggregated/daily`.
///
/// Directories that already exist are left alone.
///
/// # Errors
/// Fails on the first directory that cannot be created.
pub async fn ensure_company_dirs(paths: &DataPaths, isin: &str) -> anyhow::Result<()> {
    let root = get_company_dir(paths, isin);
    fs::create_dir_all(&root).await?;

    let aggregated = root.join("aggregated");
    for parent in [&root, &aggregated] {
        for timeframe in ["5min", "daily"] {
            fs::create_dir_all(parent.join(timeframe)).await?;
        }
    }
    Ok(())
}
/// Writes `exchanges` as pretty-printed JSON to `available_exchanges.json` inside the
/// company's directory (see `get_company_dir`), creating the directory if needed and
/// replacing any existing file.
///
/// # Errors
/// Fails on directory creation, serialization or write errors.
pub async fn save_available_exchanges(
    paths: &DataPaths,
    isin: &str,
    exchanges: Vec<AvailableExchange>,
) -> anyhow::Result<()> {
    let company_dir = get_company_dir(paths, isin);
    fs::create_dir_all(&company_dir).await?;

    let target = company_dir.join("available_exchanges.json");
    let json = serde_json::to_string_pretty(&exchanges)?;
    fs::write(&target, json).await?;
    Ok(())
}
/// Reads `available_exchanges.json` from the company's directory (see
/// `get_company_dir`).
///
/// Returns an empty vector when the file does not exist.
///
/// # Errors
/// Fails when the file exists but cannot be read or parsed.
pub async fn load_available_exchanges(paths: &DataPaths, lei: &str) -> anyhow::Result<Vec<AvailableExchange>> {
    let path = get_company_dir(paths, lei).join("available_exchanges.json");
    if !path.exists() {
        return Ok(vec![]);
    }

    let raw = fs::read_to_string(&path).await?;
    let exchanges: Vec<AvailableExchange> = serde_json::from_str(&raw)?;
    Ok(exchanges)
}
/// Writes `prices` for one company, timeframe and source ticker to
/// `<company dir>/<timeframe>/<source_ticker with '.' and '/' replaced by '_'>/prices.json`.
///
/// The prices are sorted by `(date, time)` and written as pretty-printed JSON,
/// replacing any existing file. The directory is created if needed.
///
/// # Errors
/// Fails on directory creation, serialization or write errors.
pub async fn save_prices_by_source(
    paths: &DataPaths,
    lei: &str,
    source_ticker: &str,
    timeframe: &str,
    mut prices: Vec<CompanyPrice>,
) -> anyhow::Result<()> {
    let source_safe = source_ticker.replace(".", "_").replace("/", "_");
    let dir = get_company_dir(paths, lei).join(timeframe).join(&source_safe);
    fs::create_dir_all(&dir).await?;

    // Compare fields by reference; the previous `sort_by_key` cloned both fields on
    // every key evaluation. Same ordering (date, then time), and still a stable sort.
    prices.sort_by(|a, b| a.date.cmp(&b.date).then_with(|| a.time.cmp(&b.time)));

    let path = dir.join("prices.json");
    fs::write(&path, serde_json::to_string_pretty(&prices)?).await?;
    Ok(())
}
/// Writes `companies` to `<data_dir>/companies.jsonl`, one JSON object per line.
///
/// Each line has the shape `{"name": <company name>, "securities": <map>}`. The file
/// is created or truncated, and its parent directory is created if needed. Lines are
/// written in the map's iteration order.
///
/// Output goes through a buffered writer and is flushed before returning, so all data
/// has been handed to the OS and any late write error is reported to the caller.
///
/// # Errors
/// Fails on directory creation, file creation, write or flush errors.
pub async fn save_companies_to_jsonl_streaming(
    paths: &DataPaths,
    companies: &HashMap<String, HashMap<String, String>>,
) -> anyhow::Result<()> {
    let file_path = paths.data_dir().join("companies.jsonl");
    logger::log_info(&format!("Corporate Storage: Saving {} companies to JSONL", companies.len())).await;

    if let Some(parent) = file_path.parent() {
        tokio::fs::create_dir_all(parent).await?;
    }

    let file = tokio::fs::File::create(&file_path).await?;
    let mut writer = tokio::io::BufWriter::new(file);

    for (written, (name, securities)) in companies.iter().enumerate() {
        let mut line = serde_json::json!({
            "name": name,
            "securities": securities
        })
        .to_string();
        line.push('\n');
        writer.write_all(line.as_bytes()).await?;

        // Yield periodically so a large map does not monopolize the executor.
        if (written + 1) % 100 == 0 {
            tokio::task::yield_now().await;
        }
    }

    // tokio files complete writes in the background; without an explicit flush the
    // tail of the file may be missing (and its errors lost) when this returns.
    writer.flush().await?;

    let msg = format!("✓ Saved {} companies to {:?}", companies.len(), file_path);
    println!("{}", msg);
    logger::log_info(&msg).await;
    Ok(())
}
/// Reads a companies JSONL file line by line and passes each record to `callback`.
///
/// Each non-blank line must be a JSON value with a `securities` field that
/// deserializes into `HashMap<String, String>`. A missing or non-string `name`
/// becomes an empty string. The file is read through a buffered reader, so only one
/// line is held in memory at a time.
///
/// # Returns
/// The number of records passed to `callback`, or `0` when `path` does not exist.
///
/// # Errors
/// Stops at the first read error, JSON parse error, or error returned by `callback`.
/// Records already delivered before the failure are not undone.
pub async fn load_companies_from_jsonl_streaming(
    path: &Path,
    mut callback: impl FnMut(String, HashMap<String, String>) -> anyhow::Result<()>,
) -> anyhow::Result<usize> {
    if !path.exists() {
        return Ok(0);
    }

    let file = tokio::fs::File::open(path).await?;
    // Fully qualified call so no extra trait import is required.
    let mut lines = tokio::io::AsyncBufReadExt::lines(tokio::io::BufReader::new(file));

    let mut count = 0;
    while let Some(line) = lines.next_line().await? {
        if line.trim().is_empty() {
            continue;
        }

        let mut entry: serde_json::Value = serde_json::from_str(&line)?;
        let name = entry["name"].as_str().unwrap_or("").to_string();
        // Take the value out instead of cloning it. `get_mut` returns None for a
        // missing key or non-object line, which maps to Null and fails to
        // deserialize exactly as the indexed lookup did.
        let securities_value = entry
            .get_mut("securities")
            .map(serde_json::Value::take)
            .unwrap_or(serde_json::Value::Null);
        let securities: HashMap<String, String> = serde_json::from_value(securities_value)?;

        callback(name, securities)?;
        count += 1;

        if count % 100 == 0 {
            tokio::task::yield_now().await;
        }
    }

    Ok(count)
}