added helper functions to reduce bloat

This commit is contained in:
2026-01-09 21:24:18 +01:00
parent ba841248f0
commit c6d301d434
14 changed files with 410 additions and 832 deletions

View File

@@ -1,6 +1,7 @@
// src/corporate/update_companies_enrich_events.rs
use super::{types::*};
use super::{types::*, helpers::*};
use crate::config::Config;
use crate::corporate::checkpoint_helpers;
use crate::util::directories::DataPaths;
use crate::util::logger;
use crate::scraper::yahoo::{YahooClientPool, QuoteSummaryModule};
@@ -71,7 +72,7 @@ pub async fn enrich_companies_with_events(
logger::log_info(" Yahoo events enrichment already completed").await;
// Count enriched companies
let count = count_enriched_companies(paths).await?;
let count = checkpoint_helpers::count_enriched_companies(paths, "events").await?;
logger::log_info(&format!(" ✓ Found {} companies with event data", count)).await;
return Ok(count);
}
@@ -80,32 +81,7 @@ pub async fn enrich_companies_with_events(
}
// === RECOVERY PHASE: Track enriched companies ===
let mut enriched_companies: HashSet<String> = HashSet::new();
if log_path.exists() {
logger::log_info("Loading enrichment progress from log...").await;
let log_content = tokio::fs::read_to_string(&log_path).await?;
for line in log_content.lines() {
if line.trim().is_empty() || !line.ends_with('}') {
continue; // Skip incomplete lines
}
match serde_json::from_str::<serde_json::Value>(line) {
Ok(entry) => {
if let Some(name) = entry.get("company_name").and_then(|v| v.as_str()) {
if entry.get("status").and_then(|v| v.as_str()) == Some("enriched") {
enriched_companies.insert(name.to_string());
}
}
}
Err(e) => {
logger::log_warn(&format!("Skipping invalid log line: {}", e)).await;
}
}
}
logger::log_info(&format!("Loaded {} enriched companies from log", enriched_companies.len())).await;
}
let enriched_companies: HashSet<String> = checkpoint_helpers::load_enrichment_progress(&log_path).await?;
// Load all companies from input
logger::log_info("Loading companies from companies_yahoo_cleaned.jsonl...").await;
@@ -128,7 +104,7 @@ pub async fn enrich_companies_with_events(
if pending_count == 0 {
logger::log_info(" ✓ All companies already enriched").await;
mark_enrichment_complete(&state_path).await?;
checkpoint_helpers::mark_step_complete(&state_path, "yahoo_events_enrichment_complete").await?;
return Ok(enriched_companies.len());
}
@@ -287,7 +263,7 @@ pub async fn enrich_companies_with_events(
// Mark as complete if all companies processed
if final_processed >= total_companies && !shutdown_flag.load(Ordering::SeqCst) {
mark_enrichment_complete(&state_path).await?;
checkpoint_helpers::mark_step_complete(&state_path, "yahoo_events_enrichment_complete").await?;
logger::log_info(" ✓ Event enrichment marked as complete").await;
}
@@ -441,99 +417,6 @@ async fn save_company_event_data(
Ok(())
}
/// Extract the first usable Yahoo ticker symbol from a company.
///
/// Walks every ticker list in `company.isin_tickers_map` and returns the
/// symbol of the first entry that carries the `YAHOO:` prefix, with that
/// prefix removed.
///
/// Entries are skipped when they:
/// - do not start with `YAHOO:`,
/// - are one of the sentinels `YAHOO:NO_RESULTS` / `YAHOO:ERROR`,
/// - consist of the bare prefix only (an empty symbol is not a ticker).
///
/// Returns `None` when no entry qualifies.
fn extract_first_yahoo_ticker(company: &CompanyCrossPlatformInfo) -> Option<String> {
    company
        .isin_tickers_map
        .values()
        .flatten()
        // `strip_prefix` removes the prefix exactly once and doubles as the
        // "is this a Yahoo entry" check.
        .filter_map(|ticker| ticker.strip_prefix("YAHOO:"))
        .find(|symbol| !symbol.is_empty() && !matches!(*symbol, "NO_RESULTS" | "ERROR"))
        .map(str::to_string)
}
/// Sanitize a company name so it can be used as a file-system path component.
///
/// Every occurrence of `/ \ : * ? " < > |` is replaced by an underscore; all
/// other characters are kept unchanged. The replacement happens in a single
/// pass over `name` with a single allocation for the result.
fn sanitize_company_name(name: &str) -> String {
    // Characters replaced by '_' in the returned name.
    const FORBIDDEN: [char; 9] = ['/', '\\', ':', '*', '?', '"', '<', '>', '|'];

    name.replace(FORBIDDEN, "_")
}
/// Read a JSONL file and deserialize each line into a `CompanyCrossPlatformInfo`.
///
/// The whole file is read into memory first. Lines that are blank (after
/// trimming) or that fail to deserialize are skipped without any error or
/// log output, so the returned vector may hold fewer entries than the file
/// has lines.
///
/// # Errors
/// Returns an error only when the file itself cannot be read.
async fn load_companies_from_jsonl(path: &std::path::Path) -> anyhow::Result<Vec<CompanyCrossPlatformInfo>> {
    let raw = tokio::fs::read_to_string(path).await?;

    let parsed: Vec<CompanyCrossPlatformInfo> = raw
        .lines()
        .filter(|row| !row.trim().is_empty())
        // Best-effort load: a row that does not deserialize is dropped.
        .filter_map(|row| serde_json::from_str::<CompanyCrossPlatformInfo>(row).ok())
        .collect();

    Ok(parsed)
}
/// Count the companies that already have event data on disk.
///
/// A company counts as enriched when its directory (an immediate child of
/// `paths.corporate_dir()`) contains the file `events/data.jsonl`. Entries
/// that are not directories are ignored. Returns `0` when the corporate
/// directory does not exist.
///
/// # Errors
/// Returns an error when the corporate directory cannot be opened or when
/// reading one of its entries fails.
async fn count_enriched_companies(paths: &DataPaths) -> anyhow::Result<usize> {
    let root = paths.corporate_dir();
    if !root.exists() {
        return Ok(0);
    }

    let mut enriched = 0;
    let mut listing = tokio::fs::read_dir(&root).await?;

    while let Some(item) = listing.next_entry().await? {
        let company_dir = item.path();
        if !company_dir.is_dir() {
            continue;
        }

        // <company>/events/data.jsonl marks a company with event data.
        if company_dir.join("events").join("data.jsonl").exists() {
            enriched += 1;
        }
    }

    Ok(enriched)
}
/// Mark enrichment as complete by appending a record to the state file.
///
/// Appends one JSON line of the form
/// `{"yahoo_events_enrichment_complete": true, "completed_at": <RFC 3339 now>}`
/// to `state_path`, creating the file if it does not exist, then flushes and
/// syncs it to disk before returning.
///
/// # Errors
/// Returns an error when the record cannot be serialized or when opening,
/// writing, flushing or syncing the state file fails.
async fn mark_enrichment_complete(state_path: &std::path::Path) -> anyhow::Result<()> {
    let enrichment_complete = json!({
        "yahoo_events_enrichment_complete": true,
        "completed_at": Utc::now().to_rfc3339(),
    });

    // Serialize before touching the file so a serialization failure cannot
    // leave behind a freshly created, empty state file.
    let mut state_line = serde_json::to_string(&enrichment_complete)?;
    // Keep the record and its terminating newline in one buffer: writing them
    // separately could leave an unterminated line that the next append would
    // be fused onto.
    state_line.push('\n');

    let mut state_file = OpenOptions::new()
        .create(true)
        .append(true)
        .open(state_path)
        .await?;

    state_file.write_all(state_line.as_bytes()).await?;
    state_file.flush().await?;
    state_file.sync_all().await?;

    Ok(())
}
/// Log command enum
enum LogCommand {
Write(serde_json::Value),