validation of scraped data

This commit is contained in:
2025-11-16 16:34:22 +01:00
parent 59aad09f71
commit 0853124918
2 changed files with 22157 additions and 16 deletions

22002
economic_events.json Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -1,10 +1,10 @@
use fantoccini::{ClientBuilder, Locator};
use serde::Serialize;
use serde_json::{Map, Value};
use std::{collections::HashMap, process::Command};
use tokio::{time::{Duration, sleep}, signal};
use futures::future::join_all;
#[derive(Debug)]
#[derive(Debug, Serialize, Clone)]
struct EconomicEvent {
country: String,
date: String,
@@ -192,6 +192,143 @@ async fn extract_event_descriptions_via_js(client: &fantoccini::Client) -> anyho
Ok(event_type_map)
}
/// Prints consistency diagnostics for the scraped events to stdout.
///
/// Three checks are reported:
/// 1. event names that occur more than once (at most five are listed,
///    most frequent first, ties broken alphabetically),
/// 2. how many events carry a time in strict `HH:MM` form,
/// 3. how many events have an `event` or `time` field that is empty after trimming.
///
/// The function only prints; it never fails and must not panic on any input,
/// including times that contain non-ASCII characters.
/// It contains no `.await` itself; the `async` signature is kept so that
/// callers which `.await` it keep compiling.
async fn check_data_consistency(events: &[EconomicEvent]) {
    /// Returns `true` when `time` is exactly: ASCII digit, ASCII digit, `:`,
    /// ASCII digit, ASCII digit.
    ///
    /// Works on bytes instead of `&time[a..b]` slices: byte-range slicing a
    /// `str` panics when an index falls inside a multibyte character, which a
    /// scraped value such as "1é:2" (5 bytes, ':' as third char) would trigger.
    fn is_hhmm_time(time: &str) -> bool {
        matches!(
            time.as_bytes(),
            [h1, h2, b':', m1, m2]
                if h1.is_ascii_digit()
                    && h2.is_ascii_digit()
                    && m1.is_ascii_digit()
                    && m2.is_ascii_digit()
        )
    }

    println!("\n=== DATA CONSISTENCY CHECKS ===");

    // Count event name occurrences; borrowing the names avoids one clone per event.
    let mut event_names: HashMap<&str, usize> = HashMap::new();
    for event in events {
        *event_names.entry(event.event.as_str()).or_insert(0) += 1;
    }

    // Detect duplicates
    let mut duplicates: Vec<(&str, usize)> = event_names
        .into_iter()
        .filter(|&(_, count)| count > 1)
        .collect();
    // HashMap iteration order is unspecified, so sort to make the five names
    // shown below stable between runs.
    duplicates.sort_unstable_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(b.0)));

    if !duplicates.is_empty() {
        println!("⚠️ Found {} duplicate event names:", duplicates.len());
        for (name, count) in duplicates.iter().take(5) {
            println!(" - '{}' appears {} times", name, count);
        }
    } else {
        println!("✅ No duplicate event names found");
    }

    // Check time format consistency ("HH:MM")
    let valid_time_format = events.iter().filter(|e| is_hhmm_time(&e.time)).count();
    println!("⏰ Valid time formats: {}/{}", valid_time_format, events.len());

    // Check for missing critical data
    let critical_fields_missing = events
        .iter()
        .filter(|e| e.event.trim().is_empty() || e.time.trim().is_empty())
        .count();
    if critical_fields_missing > 0 {
        println!("{} events missing critical fields", critical_fields_missing);
    }
}
/// Prints a validation report for the scraped events to stdout.
///
/// The report covers: total count, how many dates mention 2024 or 2025,
/// how many events have importance `"High"`, how many are "complete"
/// (non-blank event, time and country plus at least one non-blank value among
/// actual / forecast / previous), how many have a non-blank description,
/// per-country and per-month counts, and the first five events as a sample.
///
/// An empty slice is reported as an error line on stdout, but the function
/// still returns `Ok(())`; no code path visible here produces an `Err`.
async fn validate_events(events: &[EconomicEvent]) -> anyhow::Result<()> {
    use std::collections::HashMap;

    /// `true` when the field contains something other than whitespace.
    fn has_text(field: &str) -> bool {
        !field.trim().is_empty()
    }

    println!("\n=== EVENT VALIDATION ===");

    // Check if we have any events at all
    let total = events.len();
    if total == 0 {
        println!("❌ ERROR: No events extracted!");
        return Ok(());
    }
    println!("📊 Total events: {}", total);

    // 1. Date range compliance: the year is matched as a substring of the
    //    German date text, e.g. "Dienstag, 2. Januar 2024".
    let in_range = events
        .iter()
        .filter(|e| ["2024", "2025"].iter().any(|year| e.date.contains(*year)))
        .count();
    println!("📅 Events in 2024-2025 range: {}/{}", in_range, total);

    // 2. Importance filtering
    let high_importance = events.iter().filter(|e| e.importance == "High").count();
    println!("⭐ High importance events: {}/{}", high_importance, total);

    // 3. Data completeness
    let complete = events
        .iter()
        .filter(|e| {
            let has_identity = has_text(&e.event) && has_text(&e.time) && has_text(&e.country);
            let has_any_value =
                has_text(&e.actual) || has_text(&e.forecast) || has_text(&e.previous);
            has_identity && has_any_value
        })
        .count();
    println!("✅ Complete events: {}/{}", complete, total);

    // 4. Description coverage
    let described = events.iter().filter(|e| has_text(&e.description)).count();
    println!("📝 Events with descriptions: {}/{}", described, total);

    // 5. Distribution analysis
    let mut country_distribution: HashMap<String, usize> = HashMap::new();
    for event in events {
        *country_distribution
            .entry(event.country.clone())
            .or_default() += 1;
    }

    // Events whose date yields no month name are left out of the month tally.
    let mut month_distribution: HashMap<String, usize> = HashMap::new();
    for month in events.iter().filter_map(|e| extract_month(&e.date)) {
        *month_distribution.entry(month).or_default() += 1;
    }

    println!("🌍 Country distribution: {:?}", country_distribution);
    println!("📈 Month distribution: {:?}", month_distribution);

    // 6. Sample output for manual inspection
    println!("\n🔍 Sample events (first 5):");
    events.iter().take(5).for_each(|event| {
        println!(
            "{} {}: {} - {} (Importance: {})",
            event.date, event.time, event.country, event.event, event.importance
        )
    });

    Ok(())
}
/// Returns the German month name found in `date_str`, if any.
///
/// Matching is a plain substring search, tried in calendar order
/// ("Januar" first, "Dezember" last); the first month name that occurs
/// anywhere in the text wins. Returns `None` when no month name is present.
fn extract_month(date_str: &str) -> Option<String> {
    const GERMAN_MONTHS: [&str; 12] = [
        "Januar",
        "Februar",
        "März",
        "April",
        "Mai",
        "Juni",
        "Juli",
        "August",
        "September",
        "Oktober",
        "November",
        "Dezember",
    ];

    GERMAN_MONTHS
        .iter()
        .copied()
        .find(|name| date_str.contains(*name))
        .map(String::from)
}
#[tokio::main]
async fn main() -> anyhow::Result<()> {
let port = 9515; // pick a port you like
@@ -400,7 +537,7 @@ async fn main() -> anyhow::Result<()> {
let event_type_map = extract_event_descriptions_via_js(&client).await?;
// Merge descriptions with events
let events_with_descriptions: Vec<EconomicEvent> = events.into_iter()
let events_with_descriptions: Vec<EconomicEvent> = events.clone().into_iter()
.map(|mut event| {
if let Some(description) = event_type_map.get(&event.event) {
event.description = description.clone();
@@ -409,20 +546,22 @@ async fn main() -> anyhow::Result<()> {
})
.collect();
println!("Final results:");
for event in &events_with_descriptions {
if !event.description.is_empty() {
println!("{}: {} chars of description",
event.event, event.description.len());
}
}
// Run validation suite
validate_events(&events).await?;
check_data_consistency(&events).await;
/*println!("Collected {} event descriptions", event_type_map.len());
for (k, v) in &event_type_map {
if !v.is_empty() {
println!("{:?} => {} chars", k, v.len());
}
}*/
// Final summary
println!("\n🎯 EXTRACTION SUMMARY:");
println!(" • Total high-importance events: {}", events.len());
println!(" • Date range: 2024-01-01 to 2025-01-01");
println!(" • Data quality: {}% complete",
(events.iter().filter(|e| !e.event.trim().is_empty()).count() * 100) / events.len().max(1));
// Export for further analysis
if let Ok(json) = serde_json::to_string_pretty(&events) {
tokio::fs::write("economic_events.json", json).await?;
println!(" • Data exported to: economic_events.json");
}
// Wait for Ctrl+C
shutdown_handle.await.ok();