Added checkpoint helper functions to reduce code bloat
This commit is contained in:
@@ -4,6 +4,7 @@ use crate::util::directories::DataPaths;
|
||||
use crate::util::logger;
|
||||
use crate::scraper::webdriver::ChromeDriverPool;
|
||||
use crate::scraper::hard_reset::perform_hard_reset;
|
||||
use crate::corporate::checkpoint_helpers;
|
||||
use crate::config::Config;
|
||||
|
||||
use tokio::sync::mpsc;
|
||||
@@ -120,56 +121,11 @@ pub async fn build_companies_jsonl_streaming_parallel(
|
||||
}
|
||||
|
||||
// === RECOVERY PHASE: Load checkpoint + replay log ===
|
||||
let mut existing_companies: HashMap<String, CompanyCrossPlatformInfo> = HashMap::new();
|
||||
let mut processed_names: std::collections::HashSet<String> = std::collections::HashSet::new();
|
||||
|
||||
if companies_path.exists() {
|
||||
logger::log_info("Loading checkpoint from companies.jsonl...").await;
|
||||
let existing_content = tokio::fs::read_to_string(&companies_path).await?;
|
||||
|
||||
for line in existing_content.lines() {
|
||||
if line.trim().is_empty() || !line.ends_with('}') {
|
||||
continue; // Skip incomplete lines
|
||||
}
|
||||
|
||||
match serde_json::from_str::<CompanyCrossPlatformInfo>(line) {
|
||||
Ok(company) => {
|
||||
processed_names.insert(company.name.clone());
|
||||
existing_companies.insert(company.name.clone(), company);
|
||||
}
|
||||
Err(e) => {
|
||||
logger::log_warn(&format!("Skipping invalid checkpoint line: {}", e)).await;
|
||||
}
|
||||
}
|
||||
}
|
||||
logger::log_info(&format!("Loaded checkpoint with {} companies", existing_companies.len())).await;
|
||||
}
|
||||
|
||||
if log_path.exists() {
|
||||
logger::log_info("Replaying update log...").await;
|
||||
let log_content = tokio::fs::read_to_string(&log_path).await?;
|
||||
let mut replayed = 0;
|
||||
|
||||
for line in log_content.lines() {
|
||||
if line.trim().is_empty() || !line.ends_with('}') {
|
||||
continue; // Skip incomplete lines
|
||||
}
|
||||
|
||||
match serde_json::from_str::<CompanyCrossPlatformInfo>(line) {
|
||||
Ok(company) => {
|
||||
processed_names.insert(company.name.clone());
|
||||
existing_companies.insert(company.name.clone(), company);
|
||||
replayed += 1;
|
||||
}
|
||||
Err(e) => {
|
||||
logger::log_warn(&format!("Skipping invalid log line: {}", e)).await;
|
||||
}
|
||||
}
|
||||
}
|
||||
if replayed > 0 {
|
||||
logger::log_info(&format!("Replayed {} updates from log", replayed)).await;
|
||||
}
|
||||
}
|
||||
let existing_companies = checkpoint_helpers::load_checkpoint_with_log(
|
||||
&companies_path,
|
||||
&log_path,
|
||||
"companies.jsonl"
|
||||
).await?;
|
||||
|
||||
// === SETUP LOG WRITER TASK ===
|
||||
let (write_tx, mut write_rx) = mpsc::channel::<LogCommand>(1000);
|
||||
|
||||
Reference in New Issue
Block a user