validation of scraped data
This commit is contained in:
22002
economic_events.json
Normal file
22002
economic_events.json
Normal file
File diff suppressed because it is too large
Load Diff
171
src/main.rs
171
src/main.rs
@@ -1,10 +1,10 @@
|
||||
use fantoccini::{ClientBuilder, Locator};
|
||||
use serde::Serialize;
|
||||
use serde_json::{Map, Value};
|
||||
use std::{collections::HashMap, process::Command};
|
||||
use tokio::{time::{Duration, sleep}, signal};
|
||||
use futures::future::join_all;
|
||||
|
||||
#[derive(Debug)]
|
||||
#[derive(Debug, Serialize, Clone)]
|
||||
struct EconomicEvent {
|
||||
country: String,
|
||||
date: String,
|
||||
@@ -192,6 +192,143 @@ async fn extract_event_descriptions_via_js(client: &fantoccini::Client) -> anyho
|
||||
Ok(event_type_map)
|
||||
}
|
||||
|
||||
async fn check_data_consistency(events: &[EconomicEvent]) {
|
||||
println!("\n=== DATA CONSISTENCY CHECKS ===");
|
||||
|
||||
// Count event name occurrences
|
||||
let mut event_names: HashMap<String, usize> = HashMap::new();
|
||||
for event in events {
|
||||
*event_names.entry(event.event.clone()).or_insert(0) += 1;
|
||||
}
|
||||
|
||||
// Detect duplicates
|
||||
let duplicates: Vec<_> = event_names
|
||||
.iter()
|
||||
.filter(|(_, count)| **count > 1)
|
||||
.collect();
|
||||
|
||||
if !duplicates.is_empty() {
|
||||
println!("⚠️ Found {} duplicate event names:", duplicates.len());
|
||||
for (name, count) in duplicates.iter().take(5) {
|
||||
println!(" - '{}' appears {} times", name, count);
|
||||
}
|
||||
} else {
|
||||
println!("✅ No duplicate event names found");
|
||||
}
|
||||
|
||||
// Check time format consistency
|
||||
let valid_time_format = events.iter()
|
||||
.filter(|e| {
|
||||
// Time should be in format "HH:MM"
|
||||
e.time.len() == 5 &&
|
||||
e.time.chars().nth(2) == Some(':') &&
|
||||
e.time[0..2].chars().all(|c| c.is_ascii_digit()) &&
|
||||
e.time[3..5].chars().all(|c| c.is_ascii_digit())
|
||||
})
|
||||
.count();
|
||||
|
||||
println!("⏰ Valid time formats: {}/{}", valid_time_format, events.len());
|
||||
|
||||
// Check for missing critical data
|
||||
let critical_fields_missing: Vec<_> = events.iter()
|
||||
.enumerate()
|
||||
.filter(|(_, e)| e.event.trim().is_empty() || e.time.trim().is_empty())
|
||||
.map(|(i, e)| (i, e))
|
||||
.collect();
|
||||
|
||||
if !critical_fields_missing.is_empty() {
|
||||
println!("❌ {} events missing critical fields", critical_fields_missing.len());
|
||||
}
|
||||
}
|
||||
|
||||
async fn validate_events(events: &[EconomicEvent]) -> anyhow::Result<()> {
|
||||
println!("\n=== EVENT VALIDATION ===");
|
||||
|
||||
// Check if we have any events at all
|
||||
if events.is_empty() {
|
||||
println!("❌ ERROR: No events extracted!");
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
println!("📊 Total events: {}", events.len());
|
||||
|
||||
// 1. Check date range compliance
|
||||
let date_range_events: Vec<_> = events.iter()
|
||||
.filter(|e| {
|
||||
// Extract year from German date format "Dienstag, 2. Januar 2024"
|
||||
e.date.contains("2024") || e.date.contains("2025")
|
||||
})
|
||||
.collect();
|
||||
|
||||
println!("📅 Events in 2024-2025 range: {}/{}",
|
||||
date_range_events.len(), events.len());
|
||||
|
||||
// 2. Check importance filtering
|
||||
let high_importance_count = events.iter()
|
||||
.filter(|e| e.importance == "High")
|
||||
.count();
|
||||
println!("⭐ High importance events: {}/{}", high_importance_count, events.len());
|
||||
|
||||
// 3. Check data completeness
|
||||
let complete_events = events.iter()
|
||||
.filter(|e| {
|
||||
!e.event.trim().is_empty() &&
|
||||
!e.time.trim().is_empty() &&
|
||||
!e.country.trim().is_empty() &&
|
||||
(!e.actual.trim().is_empty() || !e.forecast.trim().is_empty() || !e.previous.trim().is_empty())
|
||||
})
|
||||
.count();
|
||||
|
||||
println!("✅ Complete events: {}/{}", complete_events, events.len());
|
||||
|
||||
// 4. Check description coverage
|
||||
let events_with_descriptions = events.iter()
|
||||
.filter(|e| !e.description.trim().is_empty())
|
||||
.count();
|
||||
println!("📝 Events with descriptions: {}/{}", events_with_descriptions, events.len());
|
||||
|
||||
// 5. Distribution analysis
|
||||
use std::collections::HashMap;
|
||||
let mut country_distribution: HashMap<String, usize> = HashMap::new();
|
||||
let mut month_distribution: HashMap<String, usize> = HashMap::new();
|
||||
|
||||
for event in events {
|
||||
*country_distribution.entry(event.country.clone()).or_insert(0) += 1;
|
||||
|
||||
// Extract month from German date
|
||||
if let Some(month) = extract_month(&event.date) {
|
||||
*month_distribution.entry(month).or_insert(0) += 1;
|
||||
}
|
||||
}
|
||||
|
||||
println!("🌍 Country distribution: {:?}", country_distribution);
|
||||
println!("📈 Month distribution: {:?}", month_distribution);
|
||||
|
||||
// 6. Sample output for manual inspection
|
||||
println!("\n🔍 Sample events (first 5):");
|
||||
for event in events.iter().take(5) {
|
||||
println!(" • {} {}: {} - {} (Importance: {})",
|
||||
event.date, event.time, event.country, event.event, event.importance);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Extracts the German month name contained in a date string such as
/// "Dienstag, 2. Januar 2024", returning e.g. `Some("Januar")`, or `None`
/// when no month name occurs in the input.
fn extract_month(date_str: &str) -> Option<String> {
    const MONTHS: [&str; 12] = [
        "Januar", "Februar", "März", "April", "Mai", "Juni",
        "Juli", "August", "September", "Oktober", "November", "Dezember",
    ];

    // First month name found as a substring wins, in calendar order —
    // identical to the original linear scan.
    MONTHS
        .into_iter()
        .find(|name| date_str.contains(*name))
        .map(str::to_string)
}
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> anyhow::Result<()> {
|
||||
let port = 9515; // pick a port you like
|
||||
@@ -400,7 +537,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
let event_type_map = extract_event_descriptions_via_js(&client).await?;
|
||||
|
||||
// Merge descriptions with events
|
||||
let events_with_descriptions: Vec<EconomicEvent> = events.into_iter()
|
||||
let events_with_descriptions: Vec<EconomicEvent> = events.clone().into_iter()
|
||||
.map(|mut event| {
|
||||
if let Some(description) = event_type_map.get(&event.event) {
|
||||
event.description = description.clone();
|
||||
@@ -409,20 +546,22 @@ async fn main() -> anyhow::Result<()> {
|
||||
})
|
||||
.collect();
|
||||
|
||||
println!("Final results:");
|
||||
for event in &events_with_descriptions {
|
||||
if !event.description.is_empty() {
|
||||
println!("{}: {} chars of description",
|
||||
event.event, event.description.len());
|
||||
}
|
||||
}
|
||||
// Run validation suite
|
||||
validate_events(&events).await?;
|
||||
check_data_consistency(&events).await;
|
||||
|
||||
/*println!("Collected {} event descriptions", event_type_map.len());
|
||||
for (k, v) in &event_type_map {
|
||||
if !v.is_empty() {
|
||||
println!("{:?} => {} chars", k, v.len());
|
||||
}
|
||||
}*/
|
||||
// Final summary
|
||||
println!("\n🎯 EXTRACTION SUMMARY:");
|
||||
println!(" • Total high-importance events: {}", events.len());
|
||||
println!(" • Date range: 2024-01-01 to 2025-01-01");
|
||||
println!(" • Data quality: {}% complete",
|
||||
(events.iter().filter(|e| !e.event.trim().is_empty()).count() * 100) / events.len().max(1));
|
||||
|
||||
// Export for further analysis
|
||||
if let Ok(json) = serde_json::to_string_pretty(&events) {
|
||||
tokio::fs::write("economic_events.json", json).await?;
|
||||
println!(" • Data exported to: economic_events.json");
|
||||
}
|
||||
|
||||
// Wait for Ctrl+C
|
||||
shutdown_handle.await.ok();
|
||||
|
||||
Reference in New Issue
Block a user