Added helper functions to reduce bloat
This commit is contained in:
@@ -1,6 +1,7 @@
|
||||
// src/corporate/update_companies_enrich_events.rs
|
||||
use super::{types::*};
|
||||
use super::{types::*, helpers::*};
|
||||
use crate::config::Config;
|
||||
use crate::corporate::checkpoint_helpers;
|
||||
use crate::util::directories::DataPaths;
|
||||
use crate::util::logger;
|
||||
use crate::scraper::yahoo::{YahooClientPool, QuoteSummaryModule};
|
||||
@@ -71,7 +72,7 @@ pub async fn enrich_companies_with_events(
|
||||
logger::log_info(" Yahoo events enrichment already completed").await;
|
||||
|
||||
// Count enriched companies
|
||||
let count = count_enriched_companies(paths).await?;
|
||||
let count = checkpoint_helpers::count_enriched_companies(paths, "events").await?;
|
||||
logger::log_info(&format!(" ✓ Found {} companies with event data", count)).await;
|
||||
return Ok(count);
|
||||
}
|
||||
@@ -80,32 +81,7 @@ pub async fn enrich_companies_with_events(
|
||||
}
|
||||
|
||||
// === RECOVERY PHASE: Track enriched companies ===
|
||||
let mut enriched_companies: HashSet<String> = HashSet::new();
|
||||
|
||||
if log_path.exists() {
|
||||
logger::log_info("Loading enrichment progress from log...").await;
|
||||
let log_content = tokio::fs::read_to_string(&log_path).await?;
|
||||
|
||||
for line in log_content.lines() {
|
||||
if line.trim().is_empty() || !line.ends_with('}') {
|
||||
continue; // Skip incomplete lines
|
||||
}
|
||||
|
||||
match serde_json::from_str::<serde_json::Value>(line) {
|
||||
Ok(entry) => {
|
||||
if let Some(name) = entry.get("company_name").and_then(|v| v.as_str()) {
|
||||
if entry.get("status").and_then(|v| v.as_str()) == Some("enriched") {
|
||||
enriched_companies.insert(name.to_string());
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
logger::log_warn(&format!("Skipping invalid log line: {}", e)).await;
|
||||
}
|
||||
}
|
||||
}
|
||||
logger::log_info(&format!("Loaded {} enriched companies from log", enriched_companies.len())).await;
|
||||
}
|
||||
let enriched_companies: HashSet<String> = checkpoint_helpers::load_enrichment_progress(&log_path).await?;
|
||||
|
||||
// Load all companies from input
|
||||
logger::log_info("Loading companies from companies_yahoo_cleaned.jsonl...").await;
|
||||
@@ -128,7 +104,7 @@ pub async fn enrich_companies_with_events(
|
||||
|
||||
if pending_count == 0 {
|
||||
logger::log_info(" ✓ All companies already enriched").await;
|
||||
mark_enrichment_complete(&state_path).await?;
|
||||
checkpoint_helpers::mark_step_complete(&state_path, "yahoo_events_enrichment_complete").await?;
|
||||
return Ok(enriched_companies.len());
|
||||
}
|
||||
|
||||
@@ -287,7 +263,7 @@ pub async fn enrich_companies_with_events(
|
||||
|
||||
// Mark as complete if all companies processed
|
||||
if final_processed >= total_companies && !shutdown_flag.load(Ordering::SeqCst) {
|
||||
mark_enrichment_complete(&state_path).await?;
|
||||
checkpoint_helpers::mark_step_complete(&state_path, "yahoo_events_enrichment_complete").await?;
|
||||
logger::log_info(" ✓ Event enrichment marked as complete").await;
|
||||
}
|
||||
|
||||
@@ -441,99 +417,6 @@ async fn save_company_event_data(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Extract first valid Yahoo ticker from company
|
||||
fn extract_first_yahoo_ticker(company: &CompanyCrossPlatformInfo) -> Option<String> {
|
||||
for tickers in company.isin_tickers_map.values() {
|
||||
for ticker in tickers {
|
||||
if ticker.starts_with("YAHOO:")
|
||||
&& ticker != "YAHOO:NO_RESULTS"
|
||||
&& ticker != "YAHOO:ERROR"
|
||||
{
|
||||
return Some(ticker.trim_start_matches("YAHOO:").to_string());
|
||||
}
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
/// Sanitize company name for file system.
///
/// Replaces every character that is reserved in file names on common
/// platforms (the Windows-reserved set plus both path separators) with
/// an underscore, so the name can safely be used as a directory name.
fn sanitize_company_name(name: &str) -> String {
    // Single pass over the input instead of nine chained `replace` calls,
    // each of which allocated a fresh intermediate String. Output is
    // identical: every reserved character maps to '_'.
    name.chars()
        .map(|c| match c {
            '/' | '\\' | ':' | '*' | '?' | '"' | '<' | '>' | '|' => '_',
            other => other,
        })
        .collect()
}
|
||||
|
||||
/// Load companies from JSONL file
|
||||
async fn load_companies_from_jsonl(path: &std::path::Path) -> anyhow::Result<Vec<CompanyCrossPlatformInfo>> {
|
||||
let content = tokio::fs::read_to_string(path).await?;
|
||||
let mut companies = Vec::new();
|
||||
|
||||
for line in content.lines() {
|
||||
if line.trim().is_empty() {
|
||||
continue;
|
||||
}
|
||||
if let Ok(company) = serde_json::from_str::<CompanyCrossPlatformInfo>(line) {
|
||||
companies.push(company);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(companies)
|
||||
}
|
||||
|
||||
/// Count enriched companies (companies with event data)
|
||||
async fn count_enriched_companies(paths: &DataPaths) -> anyhow::Result<usize> {
|
||||
let corporate_dir = paths.corporate_dir();
|
||||
|
||||
if !corporate_dir.exists() {
|
||||
return Ok(0);
|
||||
}
|
||||
|
||||
let mut count = 0;
|
||||
let mut entries = tokio::fs::read_dir(&corporate_dir).await?;
|
||||
|
||||
while let Some(entry) = entries.next_entry().await? {
|
||||
let path = entry.path();
|
||||
if path.is_dir() {
|
||||
let events_dir = path.join("events");
|
||||
let events_file = events_dir.join("data.jsonl");
|
||||
|
||||
if events_file.exists() {
|
||||
count += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(count)
|
||||
}
|
||||
|
||||
/// Mark enrichment as complete in state file
|
||||
async fn mark_enrichment_complete(state_path: &std::path::Path) -> anyhow::Result<()> {
|
||||
let enrichment_complete = json!({
|
||||
"yahoo_events_enrichment_complete": true,
|
||||
"completed_at": Utc::now().to_rfc3339(),
|
||||
});
|
||||
|
||||
let mut state_file = OpenOptions::new()
|
||||
.create(true)
|
||||
.append(true)
|
||||
.open(state_path)
|
||||
.await?;
|
||||
|
||||
let state_line = serde_json::to_string(&enrichment_complete)?;
|
||||
state_file.write_all(state_line.as_bytes()).await?;
|
||||
state_file.write_all(b"\n").await?;
|
||||
state_file.flush().await?;
|
||||
state_file.sync_all().await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Log command enum
|
||||
enum LogCommand {
|
||||
Write(serde_json::Value),
|
||||
|
||||
Reference in New Issue
Block a user