removed crossplatformcompany from types

This commit is contained in:
2026-01-14 14:49:00 +01:00
parent 93fbefc9d4
commit f4b20f824d
8 changed files with 83 additions and 87 deletions

View File

@@ -4,7 +4,7 @@
//! This module extracts common patterns used across multiple update modules
//! to reduce code duplication and improve maintainability.
use super::types::CompanyCrossPlatformData;
use super::types::CompanyData;
use crate::util::logger;
use std::collections::HashMap;
use std::path::{Path};
@@ -22,7 +22,7 @@ pub async fn load_checkpoint_with_log<P1, P2>(
checkpoint_path: P1,
log_path: P2,
checkpoint_desc: &str,
) -> Result<HashMap<String, CompanyCrossPlatformData>>
) -> Result<HashMap<String, CompanyData>>
where
P1: AsRef<Path>,
P2: AsRef<Path>,
@@ -30,7 +30,7 @@ where
let checkpoint_path = checkpoint_path.as_ref();
let log_path = log_path.as_ref();
let mut companies: HashMap<String, CompanyCrossPlatformData> = HashMap::new();
let mut companies: HashMap<String, CompanyData> = HashMap::new();
// Load checkpoint if it exists
if checkpoint_path.exists() {
@@ -42,7 +42,7 @@ where
continue; // Skip incomplete lines
}
match serde_json::from_str::<CompanyCrossPlatformData>(line) {
match serde_json::from_str::<CompanyData>(line) {
Ok(company) => {
companies.insert(company.name.clone(), company);
}
@@ -65,7 +65,7 @@ where
continue; // Skip incomplete lines
}
match serde_json::from_str::<CompanyCrossPlatformData>(line) {
match serde_json::from_str::<CompanyData>(line) {
Ok(company) => {
companies.insert(company.name.clone(), company);
replayed += 1;
@@ -91,7 +91,7 @@ where
pub async fn consolidate_checkpoint<P1, P2>(
checkpoint_path: P1,
log_path: P2,
companies: &HashMap<String, CompanyCrossPlatformData>,
companies: &HashMap<String, CompanyData>,
) -> Result<()>
where
P1: AsRef<Path>,

View File

@@ -79,14 +79,16 @@ pub fn choose_random<T: Clone>(items: &[T]) -> T {
}
/// Extract first valid Yahoo ticker from company
pub fn extract_first_yahoo_ticker(company: &CompanyCrossPlatformData) -> Option<String> {
for tickers in company.isin_tickers_map.values() {
for ticker in tickers {
if ticker.starts_with("YAHOO:")
&& ticker != "YAHOO:NO_RESULTS"
&& ticker != "YAHOO:ERROR"
{
return Some(ticker.trim_start_matches("YAHOO:").to_string());
pub fn extract_first_yahoo_ticker(company: &CompanyData) -> Option<String> {
if let Some(isin_tickers_map) = &company.isin_tickers_map {
for tickers in isin_tickers_map.values() {
for ticker in tickers {
if ticker.starts_with("YAHOO:")
&& ticker != "YAHOO:NO_RESULTS"
&& ticker != "YAHOO:ERROR"
{
return Some(ticker.trim_start_matches("YAHOO:").to_string());
}
}
}
}
@@ -109,7 +111,7 @@ pub fn sanitize_company_name(name: &str) -> String {
/// Load companies from JSONL file
pub async fn load_companies_from_jsonl(
path: &std::path::Path
) -> anyhow::Result<Vec<CompanyCrossPlatformData>> {
) -> anyhow::Result<Vec<CompanyData>> {
let content = tokio::fs::read_to_string(path).await?;
let mut companies = Vec::new();
@@ -117,7 +119,7 @@ pub async fn load_companies_from_jsonl(
if line.trim().is_empty() {
continue;
}
if let Ok(company) = serde_json::from_str::<CompanyCrossPlatformData>(line) {
if let Ok(company) = serde_json::from_str::<CompanyData>(line) {
companies.push(company);
}
}

View File

@@ -52,12 +52,14 @@ pub struct FigiData {
/// * Name as primary key (for one institution) -> might have to be changed when the first FigiInfo comes in
/// * ISIN as the most liquid / preferred traded security (used for fallback)
/// * securities: Grouped by ISIN, filtered for Common Stock only
/// * isin_tickers_map: Map of ISINs to their associated tickers across platforms
// NOTE(review): `primary_isin` and `securities` carry no serde default, so JSONL
// records written in the former `CompanyCrossPlatformData` shape (name,
// isin_tickers_map, sector, exchange) would presumably fail to deserialize into
// this struct -- confirm whether existing checkpoint/log files need migrating.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CompanyData{
/// Company name; used as the map key when checkpoint and log lines are loaded.
pub name: String,
/// Primary ISIN of the company.
pub primary_isin: String,
/// Securities grouped by ISIN.
pub securities: HashMap<String, Vec<FigiData>>, // ISIN -> Vec<FigiData>
/// Yahoo company records, if any have been attached; `None` otherwise.
pub yahoo_company_data: Option<Vec<YahooCompanyData>>,
/// Tickers per ISIN. Entries are platform-prefixed strings such as `YAHOO:<ticker>`;
/// `YAHOO:NO_RESULTS` and `YAHOO:ERROR` are marker values rather than real tickers.
/// `None` when no ticker lookup has been recorded for the company.
pub isin_tickers_map: Option<HashMap<String, Vec<String>>>, // ISIN -> Tickers
}
#[derive(Debug, Clone, Serialize, Deserialize)]
@@ -67,14 +69,6 @@ pub struct YahooCompanyData {
pub exchange: Option<String>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CompanyCrossPlatformData {
pub name: String,
pub isin_tickers_map: HashMap<String, Vec<String>>, // ISIN -> Tickers
pub sector: Option<String>,
pub exchange: Option<String>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct WarrantData {
pub company_name: String, // key in CompanyData

View File

@@ -20,14 +20,14 @@ use anyhow::{anyhow, Result};
/// Represents a write command to be serialized through the log writer
enum LogCommand {
Write(CompanyCrossPlatformData),
Write(CompanyData),
Checkpoint,
Shutdown,
}
/// Result from processing a single company
struct CompanyProcessResult {
company: CompanyCrossPlatformData,
company: CompanyData,
is_update: bool,
}
@@ -36,7 +36,7 @@ struct CompanyProcessResult {
fn company_needs_processing(
company_name: &str,
company_info: &CompanyData,
existing_companies: &HashMap<String, CompanyCrossPlatformData>,
existing_companies: &HashMap<String, CompanyData>,
) -> bool {
// If company not in existing data at all, definitely needs processing
let Some(existing_entry) = existing_companies.get(company_name) else {
@@ -56,20 +56,25 @@ fn company_needs_processing(
// Check each required ISIN
for isin in required_isins {
// Check if this ISIN exists in the company's ticker map
if let Some(tickers) = existing_entry.isin_tickers_map.get(&isin) {
// Check if this ISIN has valid Yahoo data
let has_valid_yahoo = tickers.iter().any(|t| {
t.starts_with("YAHOO:") &&
t != "YAHOO:ERROR" //&& // Error marker means needs retry
//t != "YAHOO:NO_RESULTS" // This is actually valid (legitimately not found)
});
// If no valid Yahoo data for this ISIN, company needs processing
if !has_valid_yahoo {
if let Some(map) = &existing_entry.isin_tickers_map {
if let Some(tickers) = map.get(&isin) {
// Check if this ISIN has valid Yahoo data
let has_valid_yahoo = tickers.iter().any(|t| {
t.starts_with("YAHOO:") &&
t != "YAHOO:ERROR" //&& // Error marker means needs retry
//t != "YAHOO:NO_RESULTS" // This is actually valid (legitimately not found)
});
// If no valid Yahoo data for this ISIN, company needs processing
if !has_valid_yahoo {
return true;
}
} else {
// ISIN not in map at all, needs processing
return true;
}
} else {
// ISIN not in map at all, needs processing
// No isin_tickers_map at all, needs processing
return true;
}
}
@@ -731,7 +736,7 @@ async fn scrape_with_retry(
async fn process_single_company_validated(
name: String,
company_info: CompanyData,
existing_entry: Option<CompanyCrossPlatformData>,
existing_entry: Option<CompanyData>,
pool: &Arc<ChromeDriverPool>,
shutdown_flag: &Arc<AtomicBool>,
) -> anyhow::Result<Option<CompanyProcessResult>> {
@@ -746,12 +751,9 @@ async fn process_single_company_validated(
let mut isin_tickers_map: HashMap<String, Vec<String>> =
existing_entry
.as_ref()
.map(|e| e.isin_tickers_map.clone())
.and_then(|e| e.isin_tickers_map.clone())
.unwrap_or_default();
let mut sector = existing_entry.as_ref().and_then(|e| e.sector.clone());
let mut exchange = existing_entry.as_ref().and_then(|e| e.exchange.clone());
// Collect unique ISIN-ticker pairs
let mut unique_isin_ticker_pairs: HashMap<String, Vec<String>> = HashMap::new();
@@ -808,16 +810,6 @@ async fn process_single_company_validated(
)).await;
tickers.push(format!("YAHOO:{}", details.ticker));
if sector.is_none() && details.sector.is_some() {
sector = details.sector.clone();
logger::log_info(&format!(" Sector: {}", details.sector.as_ref().unwrap())).await;
}
if exchange.is_none() && details.exchange.is_some() {
exchange = details.exchange.clone();
logger::log_info(&format!(" Exchange: {}", details.exchange.as_ref().unwrap())).await;
}
},
Ok(None) => {
logger::log_warn(&format!("◯ No search results for ISIN {} (company: {})", isin, name)).await;
@@ -866,11 +858,12 @@ async fn process_single_company_validated(
}
if !isin_tickers_map.is_empty() {
let company_entry = CompanyCrossPlatformData {
let company_entry = CompanyData {
name: name.clone(),
isin_tickers_map,
sector,
exchange,
primary_isin: company_info.primary_isin.clone(),
securities: company_info.securities.clone(),
yahoo_company_data: company_info.yahoo_company_data.clone(),
isin_tickers_map: Some(isin_tickers_map),
};
Ok(Some(CompanyProcessResult {

View File

@@ -20,15 +20,15 @@ use tokio::sync::mpsc;
/// Result of processing a single company
#[derive(Debug, Clone)]
pub enum CompanyProcessResult {
Valid(CompanyCrossPlatformData),
Valid(CompanyData),
FilteredLowCap { name: String, market_cap: f64 },
FilteredNoPrice { name: String },
Failed { company: CompanyCrossPlatformData, error: String, is_transient: bool },
Failed { company: CompanyData, error: String, is_transient: bool },
}
/// Represents a write command to be serialized through the log writer
enum LogCommand {
Write(CompanyCrossPlatformData),
Write(CompanyData),
Checkpoint,
Shutdown,
}
@@ -81,7 +81,7 @@ pub async fn companies_yahoo_cleansed_no_data(paths: &DataPaths) -> Result<usize
total_count += 1;
let company: CompanyCrossPlatformData = match serde_json::from_str(&line) {
let company: CompanyData = match serde_json::from_str(&line) {
Ok(c) => c,
Err(e) => {
logger::log_warn(&format!(" Failed to parse company on line {}: {}", total_count, e)).await;
@@ -90,13 +90,17 @@ pub async fn companies_yahoo_cleansed_no_data(paths: &DataPaths) -> Result<usize
};
let has_valid_yahoo = company.isin_tickers_map
.values()
.flatten()
.any(|ticker| {
ticker.starts_with("YAHOO:")
&& ticker != "YAHOO:NO_RESULTS"
&& ticker != "YAHOO:ERROR"
});
.as_ref()
.map(|map| {
map.values()
.flatten()
.any(|ticker| {
ticker.starts_with("YAHOO:")
&& ticker != "YAHOO:NO_RESULTS"
&& ticker != "YAHOO:ERROR"
})
})
.unwrap_or(false);
if has_valid_yahoo {
let json_line = serde_json::to_string(&company)?;
@@ -194,7 +198,7 @@ pub async fn companies_yahoo_cleansed_low_profile(
logger::log_info(" Cleansing companies with low Yahoo profile...").await;
// === RECOVERY PHASE: Load checkpoint + replay log ===
let mut existing_companies: HashMap<String, CompanyCrossPlatformData> = HashMap::new();
let mut existing_companies: HashMap<String, CompanyData> = HashMap::new();
let mut processed_names: std::collections::HashSet<String> = std::collections::HashSet::new();
if checkpoint_path.exists() {
@@ -206,7 +210,7 @@ pub async fn companies_yahoo_cleansed_low_profile(
continue; // Skip incomplete lines
}
match serde_json::from_str::<CompanyCrossPlatformData>(line) {
match serde_json::from_str::<CompanyData>(line) {
Ok(company) => {
processed_names.insert(company.name.clone());
existing_companies.insert(company.name.clone(), company);
@@ -229,7 +233,7 @@ pub async fn companies_yahoo_cleansed_low_profile(
continue; // Skip incomplete lines
}
match serde_json::from_str::<CompanyCrossPlatformData>(line) {
match serde_json::from_str::<CompanyData>(line) {
Ok(company) => {
processed_names.insert(company.name.clone());
existing_companies.insert(company.name.clone(), company);
@@ -251,7 +255,7 @@ pub async fn companies_yahoo_cleansed_low_profile(
logger::log_info(&format!("Loaded {} companies from input", input_companies.len())).await;
// === BUILD PENDING LIST (smart skip logic) ===
let mut pending: Vec<CompanyCrossPlatformData> = input_companies
let mut pending: Vec<CompanyData> = input_companies
.into_iter()
.filter(|company| company_needs_processing(company, &existing_companies))
.collect();
@@ -608,7 +612,7 @@ pub async fn companies_yahoo_cleansed_low_profile(
/// Helper function to spawn a validation task (reduces code duplication)
fn spawn_validation_task(
company: CompanyCrossPlatformData,
company: CompanyData,
yahoo_pool: &Arc<YahooClientPool>,
paths: &Arc<DataPaths>,
write_tx: &mpsc::Sender<LogCommand>,
@@ -688,7 +692,7 @@ fn spawn_validation_task(
/// Process a single company with full error categorization
async fn process_company_with_validation(
company: &CompanyCrossPlatformData,
company: &CompanyData,
yahoo_pool: &Arc<YahooClientPool>,
paths: &DataPaths,
) -> CompanyProcessResult {
@@ -897,8 +901,8 @@ async fn save_company_core_data(
/// Check if a company needs processing (validation check)
fn company_needs_processing(
company: &CompanyCrossPlatformData,
existing_companies: &HashMap<String, CompanyCrossPlatformData>,
company: &CompanyData,
existing_companies: &HashMap<String, CompanyData>,
) -> bool {
// If company exists in cleaned output, skip it
!existing_companies.contains_key(&company.name)

View File

@@ -29,7 +29,7 @@ enum LogCommand {
/// Type alias for enrichment function
type EnrichmentFn = Arc<
dyn Fn(CompanyCrossPlatformData, Arc<YahooClientPool>, DataPaths)
dyn Fn(CompanyData, Arc<YahooClientPool>, DataPaths)
-> Pin<Box<dyn Future<Output = anyhow::Result<()>> + Send>>
+ Send
+ Sync
@@ -104,7 +104,7 @@ pub async fn enrich_companies_with_events(
logger::log_info(&format!("Found {} companies to process", total_companies)).await;
// Filter companies that need enrichment
let pending_companies: Vec<CompanyCrossPlatformData> = companies
let pending_companies: Vec<CompanyData> = companies
.into_iter()
.filter(|company| !enriched_companies.contains(&company.name))
.collect();
@@ -283,7 +283,7 @@ async fn track_events_completion(
/// Enrich a single company with event data
async fn enrich_company_with_events(
company: &CompanyCrossPlatformData,
company: &CompanyData,
yahoo_pool: &Arc<YahooClientPool>,
paths: &DataPaths,
) -> anyhow::Result<()> {
@@ -438,7 +438,7 @@ pub async fn enrich_companies_with_option(
logger::log_info(&format!("Found {} companies to process", total_companies)).await;
// Filter companies that need enrichment
let pending_companies: Vec<CompanyCrossPlatformData> = companies
let pending_companies: Vec<CompanyData> = companies
.into_iter()
.filter(|company| !enriched_companies.contains(&company.name))
.collect();
@@ -605,7 +605,7 @@ async fn track_option_completion(
/// Enrich a single company with option data
async fn enrich_company_with_option(
company: &CompanyCrossPlatformData,
company: &CompanyData,
yahoo_pool: &Arc<YahooClientPool>,
paths: &DataPaths,
) -> anyhow::Result<()> {
@@ -697,7 +697,7 @@ pub async fn enrich_companies_with_chart(
logger::log_info(&format!("Found {} companies to process", total_companies)).await;
// Filter companies that need enrichment
let pending_companies: Vec<CompanyCrossPlatformData> = companies
let pending_companies: Vec<CompanyData> = companies
.into_iter()
.filter(|company| !enriched_companies.contains(&company.name))
.collect();
@@ -864,7 +864,7 @@ async fn track_chart_completion(
/// Enrich a single company with chart data
async fn enrich_company_with_chart(
company: &CompanyCrossPlatformData,
company: &CompanyData,
yahoo_pool: &Arc<YahooClientPool>,
paths: &DataPaths,
) -> anyhow::Result<()> {
@@ -1005,7 +1005,7 @@ fn spawn_log_writer(
/// - `shutdown_flag`: Flag to signal shutdown
/// - `enrichment_fn`: The specific enrichment function to call (events, option, chart, etc.)
fn spawn_enrichment_task(
company: CompanyCrossPlatformData,
company: CompanyData,
yahoo_pool: Arc<YahooClientPool>,
paths: DataPaths,
processed_count: Arc<AtomicUsize>,

View File

@@ -655,6 +655,7 @@ fn prepare_common_stock_entry(
primary_isin,
securities: grouped_by_isin,
yahoo_company_data: None,
isin_tickers_map: None,
})
}

View File

@@ -303,9 +303,11 @@ pub async fn get_all_tickers_from_companies_jsonl(paths: &DataPaths) -> anyhow::
let content = tokio::fs::read_to_string(companies_file).await?;
let mut tickers = Vec::new();
for line in content.lines() {
let company: CompanyCrossPlatformData = serde_json::from_str(line)?;
for (_isin, ticker_vec) in company.isin_tickers_map {
tickers.extend(ticker_vec);
let company: CompanyData = serde_json::from_str(line)?;
if let Some(isin_tickers_map) = company.isin_tickers_map {
for (_isin, ticker_vec) in isin_tickers_map {
tickers.extend(ticker_vec);
}
}
}
Ok(tickers)