WebScraper/src/corporate/storage.rs

// src/corporate/storage.rs
use super::{types::*, helpers::*};
use crate::util::directories::DataPaths;
use crate::util::logger;
use tokio::fs;
use tokio::io::AsyncWriteExt;
use std::collections::HashMap;
use std::path::{PathBuf, Path};

/// Lightweight index entry - only metadata, no full event data
#[derive(Debug, Clone)]
pub struct EventIndex {
    pub key: String,
    pub ticker: String,
    pub date: String,
    pub file_path: PathBuf,
}

/// Build index of all events without loading them into memory
pub async fn build_event_index(paths: &DataPaths) -> anyhow::Result<Vec<EventIndex>> {
    let dir = paths.corporate_events_dir();
    if !dir.exists() {
        logger::log_info("Corporate Storage: No events directory found").await;
        return Ok(Vec::new());
    }

    let mut index = Vec::new();
    let mut entries = fs::read_dir(dir).await?;
    while let Some(entry) = entries.next_entry().await? {
        let path = entry.path();
        if path.extension().and_then(|s| s.to_str()) == Some("json") {
            let name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
            // Only index batch files matching the expected fixed-length
            // `events_<suffix>.json` naming convention.
            if name.starts_with("events_") && name.len() == 17 {
                let content = fs::read_to_string(&path).await?;
                let events: Vec<CompanyEvent> = serde_json::from_str(&content)?;
                for event in events {
                    index.push(EventIndex {
                        key: event_key(&event),
                        ticker: event.ticker.clone(),
                        date: event.date.clone(),
                        file_path: path.clone(),
                    });
                }
            }
        }
    }

    logger::log_info(&format!("Corporate Storage: Built index with {} entries", index.len())).await;
    Ok(index)
}

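/// Resolve the per-company price data directory (keyed by `lei`) under the corporate prices root.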
pub fn get_company_dir(paths: &DataPaths, lei: &str) -> PathBuf {
    paths.corporate_prices_dir().join(lei)
}

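/// Create the full per-company directory layout: base, `5min`, `daily`, and `aggregated/{5min,daily}`.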
pub async fn ensure_company_dirs(paths: &DataPaths, isin: &str) -> anyhow::Result<()> {
    let base = get_company_dir(paths, isin);
    let paths_to_create = [
        base.clone(),
        base.join("5min"),
        base.join("daily"),
        base.join("aggregated").join("5min"),
        base.join("aggregated").join("daily"),
    ];
    for p in paths_to_create {
        fs::create_dir_all(&p).await?;
    }
    Ok(())
}

/// Stream companies to JSONL incrementally
pub async fn save_companies_to_jsonl_streaming(
    paths: &DataPaths,
    companies_iter: impl Iterator<Item = (String, HashMap<String, String>)>,
) -> anyhow::Result<usize> {
    let file_path = paths.data_dir().join("companies.jsonl");
    if let Some(parent) = file_path.parent() {
        fs::create_dir_all(parent).await?;
    }

    let mut file = fs::File::create(&file_path).await?;
    let mut count = 0;
    for (name, securities) in companies_iter {
        let line = serde_json::json!({
            "name": name,
            "securities": securities
        });
        file.write_all(line.to_string().as_bytes()).await?;
        file.write_all(b"\n").await?;
        count += 1;

        // Yield periodically so a long import does not starve other tasks.
        if count % 100 == 0 {
            tokio::task::yield_now().await;
        }
    }

    // Flush so buffered writes reach the file before the handle is dropped.
    file.flush().await?;

    logger::log_info(&format!("Saved {} companies to JSONL", count)).await;
    Ok(count)
}

/// Read companies from a JSONL file line by line, invoking `callback` for each entry
pub async fn stream_companies_from_jsonl<F>(
    path: &Path,
    mut callback: F,
) -> anyhow::Result<usize>
where
    F: FnMut(String, HashMap<String, String>) -> anyhow::Result<()>,
{
    if !path.exists() {
        return Ok(0);
    }

    let content = fs::read_to_string(path).await?;
    let mut count = 0;
    for line in content.lines() {
        if line.trim().is_empty() {
            continue;
        }
        let entry: serde_json::Value = serde_json::from_str(line)?;
        let name = entry["name"].as_str().unwrap_or("").to_string();
        let securities: HashMap<String, String> = serde_json::from_value(
            entry["securities"].clone()
        )?;
        callback(name, securities)?;
        count += 1;

        if count % 100 == 0 {
            tokio::task::yield_now().await;
        }
    }
    Ok(count)
}
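
// Usage sketch (illustrative, not part of the original module): drives the JSONL
// reader with a counting closure. The "data/companies.jsonl" literal below is an
// assumption; callers would normally build the path from DataPaths::data_dir(),
// which is where save_companies_to_jsonl_streaming writes the file.
//
// async fn example_count_securities() -> anyhow::Result<()> {
//     let mut total_securities = 0usize;
//     let companies = stream_companies_from_jsonl(
//         Path::new("data/companies.jsonl"),
//         |_name, securities| {
//             total_securities += securities.len();
//             Ok(())
//         },
//     )
//     .await?;
//     logger::log_info(&format!(
//         "{} companies, {} securities",
//         companies, total_securities
//     ))
//     .await;
//     Ok(())
// }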