Removed CompanyCrossPlatformData from types

This commit is contained in:
2026-01-14 14:49:00 +01:00
parent 93fbefc9d4
commit f4b20f824d
8 changed files with 83 additions and 87 deletions

View File

@@ -4,7 +4,7 @@
//! This module extracts common patterns used across multiple update modules //! This module extracts common patterns used across multiple update modules
//! to reduce code duplication and improve maintainability. //! to reduce code duplication and improve maintainability.
use super::types::CompanyCrossPlatformData; use super::types::CompanyData;
use crate::util::logger; use crate::util::logger;
use std::collections::HashMap; use std::collections::HashMap;
use std::path::{Path}; use std::path::{Path};
@@ -22,7 +22,7 @@ pub async fn load_checkpoint_with_log<P1, P2>(
checkpoint_path: P1, checkpoint_path: P1,
log_path: P2, log_path: P2,
checkpoint_desc: &str, checkpoint_desc: &str,
) -> Result<HashMap<String, CompanyCrossPlatformData>> ) -> Result<HashMap<String, CompanyData>>
where where
P1: AsRef<Path>, P1: AsRef<Path>,
P2: AsRef<Path>, P2: AsRef<Path>,
@@ -30,7 +30,7 @@ where
let checkpoint_path = checkpoint_path.as_ref(); let checkpoint_path = checkpoint_path.as_ref();
let log_path = log_path.as_ref(); let log_path = log_path.as_ref();
let mut companies: HashMap<String, CompanyCrossPlatformData> = HashMap::new(); let mut companies: HashMap<String, CompanyData> = HashMap::new();
// Load checkpoint if it exists // Load checkpoint if it exists
if checkpoint_path.exists() { if checkpoint_path.exists() {
@@ -42,7 +42,7 @@ where
continue; // Skip incomplete lines continue; // Skip incomplete lines
} }
match serde_json::from_str::<CompanyCrossPlatformData>(line) { match serde_json::from_str::<CompanyData>(line) {
Ok(company) => { Ok(company) => {
companies.insert(company.name.clone(), company); companies.insert(company.name.clone(), company);
} }
@@ -65,7 +65,7 @@ where
continue; // Skip incomplete lines continue; // Skip incomplete lines
} }
match serde_json::from_str::<CompanyCrossPlatformData>(line) { match serde_json::from_str::<CompanyData>(line) {
Ok(company) => { Ok(company) => {
companies.insert(company.name.clone(), company); companies.insert(company.name.clone(), company);
replayed += 1; replayed += 1;
@@ -91,7 +91,7 @@ where
pub async fn consolidate_checkpoint<P1, P2>( pub async fn consolidate_checkpoint<P1, P2>(
checkpoint_path: P1, checkpoint_path: P1,
log_path: P2, log_path: P2,
companies: &HashMap<String, CompanyCrossPlatformData>, companies: &HashMap<String, CompanyData>,
) -> Result<()> ) -> Result<()>
where where
P1: AsRef<Path>, P1: AsRef<Path>,

View File

@@ -79,14 +79,16 @@ pub fn choose_random<T: Clone>(items: &[T]) -> T {
} }
/// Extract first valid Yahoo ticker from company /// Extract first valid Yahoo ticker from company
pub fn extract_first_yahoo_ticker(company: &CompanyCrossPlatformData) -> Option<String> { pub fn extract_first_yahoo_ticker(company: &CompanyData) -> Option<String> {
for tickers in company.isin_tickers_map.values() { if let Some(isin_tickers_map) = &company.isin_tickers_map {
for ticker in tickers { for tickers in isin_tickers_map.values() {
if ticker.starts_with("YAHOO:") for ticker in tickers {
&& ticker != "YAHOO:NO_RESULTS" if ticker.starts_with("YAHOO:")
&& ticker != "YAHOO:ERROR" && ticker != "YAHOO:NO_RESULTS"
{ && ticker != "YAHOO:ERROR"
return Some(ticker.trim_start_matches("YAHOO:").to_string()); {
return Some(ticker.trim_start_matches("YAHOO:").to_string());
}
} }
} }
} }
@@ -109,7 +111,7 @@ pub fn sanitize_company_name(name: &str) -> String {
/// Load companies from JSONL file /// Load companies from JSONL file
pub async fn load_companies_from_jsonl( pub async fn load_companies_from_jsonl(
path: &std::path::Path path: &std::path::Path
) -> anyhow::Result<Vec<CompanyCrossPlatformData>> { ) -> anyhow::Result<Vec<CompanyData>> {
let content = tokio::fs::read_to_string(path).await?; let content = tokio::fs::read_to_string(path).await?;
let mut companies = Vec::new(); let mut companies = Vec::new();
@@ -117,7 +119,7 @@ pub async fn load_companies_from_jsonl(
if line.trim().is_empty() { if line.trim().is_empty() {
continue; continue;
} }
if let Ok(company) = serde_json::from_str::<CompanyCrossPlatformData>(line) { if let Ok(company) = serde_json::from_str::<CompanyData>(line) {
companies.push(company); companies.push(company);
} }
} }

View File

@@ -52,12 +52,14 @@ pub struct FigiData {
/// * Name as primary key (for one institution) -> might have to be changed when the first FigiInfo comes in /// * Name as primary key (for one institution) -> might have to be changed when the first FigiInfo comes in
/// * ISIN as the most liquid / preferred traded security (used for fallback) /// * ISIN as the most liquid / preferred traded security (used for fallback)
/// * securities: Grouped by ISIN, filtered for Common Stock only /// * securities: Grouped by ISIN, filtered for Common Stock only
/// * isin_tickers_map: Map of ISINs to their associated tickers across platforms
#[derive(Debug, Clone, Serialize, Deserialize)] #[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CompanyData{ pub struct CompanyData{
pub name: String, pub name: String,
pub primary_isin: String, pub primary_isin: String,
pub securities: HashMap<String, Vec<FigiData>>, // ISIN -> Vec<FigiInfo> pub securities: HashMap<String, Vec<FigiData>>, // ISIN -> Vec<FigiInfo>
pub yahoo_company_data: Option<Vec<YahooCompanyData>>, pub yahoo_company_data: Option<Vec<YahooCompanyData>>,
pub isin_tickers_map: Option<HashMap<String, Vec<String>>>, // ISIN -> Tickers
} }
#[derive(Debug, Clone, Serialize, Deserialize)] #[derive(Debug, Clone, Serialize, Deserialize)]
@@ -67,14 +69,6 @@ pub struct YahooCompanyData {
pub exchange: Option<String>, pub exchange: Option<String>,
} }
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CompanyCrossPlatformData {
pub name: String,
pub isin_tickers_map: HashMap<String, Vec<String>>, // ISIN -> Tickers
pub sector: Option<String>,
pub exchange: Option<String>,
}
#[derive(Debug, Clone, Serialize, Deserialize)] #[derive(Debug, Clone, Serialize, Deserialize)]
pub struct WarrantData { pub struct WarrantData {
pub company_name: String, // key in CompanyData pub company_name: String, // key in CompanyData

View File

@@ -20,14 +20,14 @@ use anyhow::{anyhow, Result};
/// Represents a write command to be serialized through the log writer /// Represents a write command to be serialized through the log writer
enum LogCommand { enum LogCommand {
Write(CompanyCrossPlatformData), Write(CompanyData),
Checkpoint, Checkpoint,
Shutdown, Shutdown,
} }
/// Result from processing a single company /// Result from processing a single company
struct CompanyProcessResult { struct CompanyProcessResult {
company: CompanyCrossPlatformData, company: CompanyData,
is_update: bool, is_update: bool,
} }
@@ -36,7 +36,7 @@ struct CompanyProcessResult {
fn company_needs_processing( fn company_needs_processing(
company_name: &str, company_name: &str,
company_info: &CompanyData, company_info: &CompanyData,
existing_companies: &HashMap<String, CompanyCrossPlatformData>, existing_companies: &HashMap<String, CompanyData>,
) -> bool { ) -> bool {
// If company not in existing data at all, definitely needs processing // If company not in existing data at all, definitely needs processing
let Some(existing_entry) = existing_companies.get(company_name) else { let Some(existing_entry) = existing_companies.get(company_name) else {
@@ -56,20 +56,25 @@ fn company_needs_processing(
// Check each required ISIN // Check each required ISIN
for isin in required_isins { for isin in required_isins {
// Check if this ISIN exists in the company's ticker map // Check if this ISIN exists in the company's ticker map
if let Some(tickers) = existing_entry.isin_tickers_map.get(&isin) { if let Some(map) = &existing_entry.isin_tickers_map {
// Check if this ISIN has valid Yahoo data if let Some(tickers) = map.get(&isin) {
let has_valid_yahoo = tickers.iter().any(|t| { // Check if this ISIN has valid Yahoo data
t.starts_with("YAHOO:") && let has_valid_yahoo = tickers.iter().any(|t| {
t != "YAHOO:ERROR" //&& // Error marker means needs retry t.starts_with("YAHOO:") &&
//t != "YAHOO:NO_RESULTS" // This is actually valid (legitimately not found) t != "YAHOO:ERROR" //&& // Error marker means needs retry
}); //t != "YAHOO:NO_RESULTS" // This is actually valid (legitimately not found)
});
// If no valid Yahoo data for this ISIN, company needs processing // If no valid Yahoo data for this ISIN, company needs processing
if !has_valid_yahoo { if !has_valid_yahoo {
return true;
}
} else {
// ISIN not in map at all, needs processing
return true; return true;
} }
} else { } else {
// ISIN not in map at all, needs processing // No isin_tickers_map at all, needs processing
return true; return true;
} }
} }
@@ -731,7 +736,7 @@ async fn scrape_with_retry(
async fn process_single_company_validated( async fn process_single_company_validated(
name: String, name: String,
company_info: CompanyData, company_info: CompanyData,
existing_entry: Option<CompanyCrossPlatformData>, existing_entry: Option<CompanyData>,
pool: &Arc<ChromeDriverPool>, pool: &Arc<ChromeDriverPool>,
shutdown_flag: &Arc<AtomicBool>, shutdown_flag: &Arc<AtomicBool>,
) -> anyhow::Result<Option<CompanyProcessResult>> { ) -> anyhow::Result<Option<CompanyProcessResult>> {
@@ -746,12 +751,9 @@ async fn process_single_company_validated(
let mut isin_tickers_map: HashMap<String, Vec<String>> = let mut isin_tickers_map: HashMap<String, Vec<String>> =
existing_entry existing_entry
.as_ref() .as_ref()
.map(|e| e.isin_tickers_map.clone()) .and_then(|e| e.isin_tickers_map.clone())
.unwrap_or_default(); .unwrap_or_default();
let mut sector = existing_entry.as_ref().and_then(|e| e.sector.clone());
let mut exchange = existing_entry.as_ref().and_then(|e| e.exchange.clone());
// Collect unique ISIN-ticker pairs // Collect unique ISIN-ticker pairs
let mut unique_isin_ticker_pairs: HashMap<String, Vec<String>> = HashMap::new(); let mut unique_isin_ticker_pairs: HashMap<String, Vec<String>> = HashMap::new();
@@ -808,16 +810,6 @@ async fn process_single_company_validated(
)).await; )).await;
tickers.push(format!("YAHOO:{}", details.ticker)); tickers.push(format!("YAHOO:{}", details.ticker));
if sector.is_none() && details.sector.is_some() {
sector = details.sector.clone();
logger::log_info(&format!(" Sector: {}", details.sector.as_ref().unwrap())).await;
}
if exchange.is_none() && details.exchange.is_some() {
exchange = details.exchange.clone();
logger::log_info(&format!(" Exchange: {}", details.exchange.as_ref().unwrap())).await;
}
}, },
Ok(None) => { Ok(None) => {
logger::log_warn(&format!("◯ No search results for ISIN {} (company: {})", isin, name)).await; logger::log_warn(&format!("◯ No search results for ISIN {} (company: {})", isin, name)).await;
@@ -866,11 +858,12 @@ async fn process_single_company_validated(
} }
if !isin_tickers_map.is_empty() { if !isin_tickers_map.is_empty() {
let company_entry = CompanyCrossPlatformData { let company_entry = CompanyData {
name: name.clone(), name: name.clone(),
isin_tickers_map, primary_isin: company_info.primary_isin.clone(),
sector, securities: company_info.securities.clone(),
exchange, yahoo_company_data: company_info.yahoo_company_data.clone(),
isin_tickers_map: Some(isin_tickers_map),
}; };
Ok(Some(CompanyProcessResult { Ok(Some(CompanyProcessResult {

View File

@@ -20,15 +20,15 @@ use tokio::sync::mpsc;
/// Result of processing a single company /// Result of processing a single company
#[derive(Debug, Clone)] #[derive(Debug, Clone)]
pub enum CompanyProcessResult { pub enum CompanyProcessResult {
Valid(CompanyCrossPlatformData), Valid(CompanyData),
FilteredLowCap { name: String, market_cap: f64 }, FilteredLowCap { name: String, market_cap: f64 },
FilteredNoPrice { name: String }, FilteredNoPrice { name: String },
Failed { company: CompanyCrossPlatformData, error: String, is_transient: bool }, Failed { company: CompanyData, error: String, is_transient: bool },
} }
/// Represents a write command to be serialized through the log writer /// Represents a write command to be serialized through the log writer
enum LogCommand { enum LogCommand {
Write(CompanyCrossPlatformData), Write(CompanyData),
Checkpoint, Checkpoint,
Shutdown, Shutdown,
} }
@@ -81,7 +81,7 @@ pub async fn companies_yahoo_cleansed_no_data(paths: &DataPaths) -> Result<usize
total_count += 1; total_count += 1;
let company: CompanyCrossPlatformData = match serde_json::from_str(&line) { let company: CompanyData = match serde_json::from_str(&line) {
Ok(c) => c, Ok(c) => c,
Err(e) => { Err(e) => {
logger::log_warn(&format!(" Failed to parse company on line {}: {}", total_count, e)).await; logger::log_warn(&format!(" Failed to parse company on line {}: {}", total_count, e)).await;
@@ -90,13 +90,17 @@ pub async fn companies_yahoo_cleansed_no_data(paths: &DataPaths) -> Result<usize
}; };
let has_valid_yahoo = company.isin_tickers_map let has_valid_yahoo = company.isin_tickers_map
.values() .as_ref()
.flatten() .map(|map| {
.any(|ticker| { map.values()
ticker.starts_with("YAHOO:") .flatten()
&& ticker != "YAHOO:NO_RESULTS" .any(|ticker| {
&& ticker != "YAHOO:ERROR" ticker.starts_with("YAHOO:")
}); && ticker != "YAHOO:NO_RESULTS"
&& ticker != "YAHOO:ERROR"
})
})
.unwrap_or(false);
if has_valid_yahoo { if has_valid_yahoo {
let json_line = serde_json::to_string(&company)?; let json_line = serde_json::to_string(&company)?;
@@ -194,7 +198,7 @@ pub async fn companies_yahoo_cleansed_low_profile(
logger::log_info(" Cleansing companies with low Yahoo profile...").await; logger::log_info(" Cleansing companies with low Yahoo profile...").await;
// === RECOVERY PHASE: Load checkpoint + replay log === // === RECOVERY PHASE: Load checkpoint + replay log ===
let mut existing_companies: HashMap<String, CompanyCrossPlatformData> = HashMap::new(); let mut existing_companies: HashMap<String, CompanyData> = HashMap::new();
let mut processed_names: std::collections::HashSet<String> = std::collections::HashSet::new(); let mut processed_names: std::collections::HashSet<String> = std::collections::HashSet::new();
if checkpoint_path.exists() { if checkpoint_path.exists() {
@@ -206,7 +210,7 @@ pub async fn companies_yahoo_cleansed_low_profile(
continue; // Skip incomplete lines continue; // Skip incomplete lines
} }
match serde_json::from_str::<CompanyCrossPlatformData>(line) { match serde_json::from_str::<CompanyData>(line) {
Ok(company) => { Ok(company) => {
processed_names.insert(company.name.clone()); processed_names.insert(company.name.clone());
existing_companies.insert(company.name.clone(), company); existing_companies.insert(company.name.clone(), company);
@@ -229,7 +233,7 @@ pub async fn companies_yahoo_cleansed_low_profile(
continue; // Skip incomplete lines continue; // Skip incomplete lines
} }
match serde_json::from_str::<CompanyCrossPlatformData>(line) { match serde_json::from_str::<CompanyData>(line) {
Ok(company) => { Ok(company) => {
processed_names.insert(company.name.clone()); processed_names.insert(company.name.clone());
existing_companies.insert(company.name.clone(), company); existing_companies.insert(company.name.clone(), company);
@@ -251,7 +255,7 @@ pub async fn companies_yahoo_cleansed_low_profile(
logger::log_info(&format!("Loaded {} companies from input", input_companies.len())).await; logger::log_info(&format!("Loaded {} companies from input", input_companies.len())).await;
// === BUILD PENDING LIST (smart skip logic) === // === BUILD PENDING LIST (smart skip logic) ===
let mut pending: Vec<CompanyCrossPlatformData> = input_companies let mut pending: Vec<CompanyData> = input_companies
.into_iter() .into_iter()
.filter(|company| company_needs_processing(company, &existing_companies)) .filter(|company| company_needs_processing(company, &existing_companies))
.collect(); .collect();
@@ -608,7 +612,7 @@ pub async fn companies_yahoo_cleansed_low_profile(
/// Helper function to spawn a validation task (reduces code duplication) /// Helper function to spawn a validation task (reduces code duplication)
fn spawn_validation_task( fn spawn_validation_task(
company: CompanyCrossPlatformData, company: CompanyData,
yahoo_pool: &Arc<YahooClientPool>, yahoo_pool: &Arc<YahooClientPool>,
paths: &Arc<DataPaths>, paths: &Arc<DataPaths>,
write_tx: &mpsc::Sender<LogCommand>, write_tx: &mpsc::Sender<LogCommand>,
@@ -688,7 +692,7 @@ fn spawn_validation_task(
/// Process a single company with full error categorization /// Process a single company with full error categorization
async fn process_company_with_validation( async fn process_company_with_validation(
company: &CompanyCrossPlatformData, company: &CompanyData,
yahoo_pool: &Arc<YahooClientPool>, yahoo_pool: &Arc<YahooClientPool>,
paths: &DataPaths, paths: &DataPaths,
) -> CompanyProcessResult { ) -> CompanyProcessResult {
@@ -897,8 +901,8 @@ async fn save_company_core_data(
/// Check if a company needs processing (validation check) /// Check if a company needs processing (validation check)
fn company_needs_processing( fn company_needs_processing(
company: &CompanyCrossPlatformData, company: &CompanyData,
existing_companies: &HashMap<String, CompanyCrossPlatformData>, existing_companies: &HashMap<String, CompanyData>,
) -> bool { ) -> bool {
// If company exists in cleaned output, skip it // If company exists in cleaned output, skip it
!existing_companies.contains_key(&company.name) !existing_companies.contains_key(&company.name)

View File

@@ -29,7 +29,7 @@ enum LogCommand {
/// Type alias for enrichment function /// Type alias for enrichment function
type EnrichmentFn = Arc< type EnrichmentFn = Arc<
dyn Fn(CompanyCrossPlatformData, Arc<YahooClientPool>, DataPaths) dyn Fn(CompanyData, Arc<YahooClientPool>, DataPaths)
-> Pin<Box<dyn Future<Output = anyhow::Result<()>> + Send>> -> Pin<Box<dyn Future<Output = anyhow::Result<()>> + Send>>
+ Send + Send
+ Sync + Sync
@@ -104,7 +104,7 @@ pub async fn enrich_companies_with_events(
logger::log_info(&format!("Found {} companies to process", total_companies)).await; logger::log_info(&format!("Found {} companies to process", total_companies)).await;
// Filter companies that need enrichment // Filter companies that need enrichment
let pending_companies: Vec<CompanyCrossPlatformData> = companies let pending_companies: Vec<CompanyData> = companies
.into_iter() .into_iter()
.filter(|company| !enriched_companies.contains(&company.name)) .filter(|company| !enriched_companies.contains(&company.name))
.collect(); .collect();
@@ -283,7 +283,7 @@ async fn track_events_completion(
/// Enrich a single company with event data /// Enrich a single company with event data
async fn enrich_company_with_events( async fn enrich_company_with_events(
company: &CompanyCrossPlatformData, company: &CompanyData,
yahoo_pool: &Arc<YahooClientPool>, yahoo_pool: &Arc<YahooClientPool>,
paths: &DataPaths, paths: &DataPaths,
) -> anyhow::Result<()> { ) -> anyhow::Result<()> {
@@ -438,7 +438,7 @@ pub async fn enrich_companies_with_option(
logger::log_info(&format!("Found {} companies to process", total_companies)).await; logger::log_info(&format!("Found {} companies to process", total_companies)).await;
// Filter companies that need enrichment // Filter companies that need enrichment
let pending_companies: Vec<CompanyCrossPlatformData> = companies let pending_companies: Vec<CompanyData> = companies
.into_iter() .into_iter()
.filter(|company| !enriched_companies.contains(&company.name)) .filter(|company| !enriched_companies.contains(&company.name))
.collect(); .collect();
@@ -605,7 +605,7 @@ async fn track_option_completion(
/// Enrich a single company with option data /// Enrich a single company with option data
async fn enrich_company_with_option( async fn enrich_company_with_option(
company: &CompanyCrossPlatformData, company: &CompanyData,
yahoo_pool: &Arc<YahooClientPool>, yahoo_pool: &Arc<YahooClientPool>,
paths: &DataPaths, paths: &DataPaths,
) -> anyhow::Result<()> { ) -> anyhow::Result<()> {
@@ -697,7 +697,7 @@ pub async fn enrich_companies_with_chart(
logger::log_info(&format!("Found {} companies to process", total_companies)).await; logger::log_info(&format!("Found {} companies to process", total_companies)).await;
// Filter companies that need enrichment // Filter companies that need enrichment
let pending_companies: Vec<CompanyCrossPlatformData> = companies let pending_companies: Vec<CompanyData> = companies
.into_iter() .into_iter()
.filter(|company| !enriched_companies.contains(&company.name)) .filter(|company| !enriched_companies.contains(&company.name))
.collect(); .collect();
@@ -864,7 +864,7 @@ async fn track_chart_completion(
/// Enrich a single company with chart data /// Enrich a single company with chart data
async fn enrich_company_with_chart( async fn enrich_company_with_chart(
company: &CompanyCrossPlatformData, company: &CompanyData,
yahoo_pool: &Arc<YahooClientPool>, yahoo_pool: &Arc<YahooClientPool>,
paths: &DataPaths, paths: &DataPaths,
) -> anyhow::Result<()> { ) -> anyhow::Result<()> {
@@ -1005,7 +1005,7 @@ fn spawn_log_writer(
/// - `shutdown_flag`: Flag to signal shutdown /// - `shutdown_flag`: Flag to signal shutdown
/// - `enrichment_fn`: The specific enrichment function to call (events, option, chart, etc.) /// - `enrichment_fn`: The specific enrichment function to call (events, option, chart, etc.)
fn spawn_enrichment_task( fn spawn_enrichment_task(
company: CompanyCrossPlatformData, company: CompanyData,
yahoo_pool: Arc<YahooClientPool>, yahoo_pool: Arc<YahooClientPool>,
paths: DataPaths, paths: DataPaths,
processed_count: Arc<AtomicUsize>, processed_count: Arc<AtomicUsize>,

View File

@@ -655,6 +655,7 @@ fn prepare_common_stock_entry(
primary_isin, primary_isin,
securities: grouped_by_isin, securities: grouped_by_isin,
yahoo_company_data: None, yahoo_company_data: None,
isin_tickers_map: None,
}) })
} }

View File

@@ -303,9 +303,11 @@ pub async fn get_all_tickers_from_companies_jsonl(paths: &DataPaths) -> anyhow::
let content = tokio::fs::read_to_string(companies_file).await?; let content = tokio::fs::read_to_string(companies_file).await?;
let mut tickers = Vec::new(); let mut tickers = Vec::new();
for line in content.lines() { for line in content.lines() {
let company: CompanyCrossPlatformData = serde_json::from_str(line)?; let company: CompanyData = serde_json::from_str(line)?;
for (_isin, ticker_vec) in company.isin_tickers_map { if let Some(isin_tickers_map) = company.isin_tickers_map {
tickers.extend(ticker_vec); for (_isin, ticker_vec) in isin_tickers_map {
tickers.extend(ticker_vec);
}
} }
} }
Ok(tickers) Ok(tickers)