validation of scraped data
This commit is contained in:
22002
economic_events.json
Normal file
22002
economic_events.json
Normal file
File diff suppressed because it is too large
Load Diff
171
src/main.rs
171
src/main.rs
@@ -1,10 +1,10 @@
|
|||||||
use fantoccini::{ClientBuilder, Locator};
|
use fantoccini::{ClientBuilder, Locator};
|
||||||
|
use serde::Serialize;
|
||||||
use serde_json::{Map, Value};
|
use serde_json::{Map, Value};
|
||||||
use std::{collections::HashMap, process::Command};
|
use std::{collections::HashMap, process::Command};
|
||||||
use tokio::{time::{Duration, sleep}, signal};
|
use tokio::{time::{Duration, sleep}, signal};
|
||||||
use futures::future::join_all;
|
|
||||||
|
|
||||||
#[derive(Debug)]
|
#[derive(Debug, Serialize, Clone)]
|
||||||
struct EconomicEvent {
|
struct EconomicEvent {
|
||||||
country: String,
|
country: String,
|
||||||
date: String,
|
date: String,
|
||||||
@@ -192,6 +192,143 @@ async fn extract_event_descriptions_via_js(client: &fantoccini::Client) -> anyho
|
|||||||
Ok(event_type_map)
|
Ok(event_type_map)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
async fn check_data_consistency(events: &[EconomicEvent]) {
|
||||||
|
println!("\n=== DATA CONSISTENCY CHECKS ===");
|
||||||
|
|
||||||
|
// Count event name occurrences
|
||||||
|
let mut event_names: HashMap<String, usize> = HashMap::new();
|
||||||
|
for event in events {
|
||||||
|
*event_names.entry(event.event.clone()).or_insert(0) += 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Detect duplicates
|
||||||
|
let duplicates: Vec<_> = event_names
|
||||||
|
.iter()
|
||||||
|
.filter(|(_, count)| **count > 1)
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
if !duplicates.is_empty() {
|
||||||
|
println!("⚠️ Found {} duplicate event names:", duplicates.len());
|
||||||
|
for (name, count) in duplicates.iter().take(5) {
|
||||||
|
println!(" - '{}' appears {} times", name, count);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
println!("✅ No duplicate event names found");
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check time format consistency
|
||||||
|
let valid_time_format = events.iter()
|
||||||
|
.filter(|e| {
|
||||||
|
// Time should be in format "HH:MM"
|
||||||
|
e.time.len() == 5 &&
|
||||||
|
e.time.chars().nth(2) == Some(':') &&
|
||||||
|
e.time[0..2].chars().all(|c| c.is_ascii_digit()) &&
|
||||||
|
e.time[3..5].chars().all(|c| c.is_ascii_digit())
|
||||||
|
})
|
||||||
|
.count();
|
||||||
|
|
||||||
|
println!("⏰ Valid time formats: {}/{}", valid_time_format, events.len());
|
||||||
|
|
||||||
|
// Check for missing critical data
|
||||||
|
let critical_fields_missing: Vec<_> = events.iter()
|
||||||
|
.enumerate()
|
||||||
|
.filter(|(_, e)| e.event.trim().is_empty() || e.time.trim().is_empty())
|
||||||
|
.map(|(i, e)| (i, e))
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
if !critical_fields_missing.is_empty() {
|
||||||
|
println!("❌ {} events missing critical fields", critical_fields_missing.len());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn validate_events(events: &[EconomicEvent]) -> anyhow::Result<()> {
|
||||||
|
println!("\n=== EVENT VALIDATION ===");
|
||||||
|
|
||||||
|
// Check if we have any events at all
|
||||||
|
if events.is_empty() {
|
||||||
|
println!("❌ ERROR: No events extracted!");
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
|
||||||
|
println!("📊 Total events: {}", events.len());
|
||||||
|
|
||||||
|
// 1. Check date range compliance
|
||||||
|
let date_range_events: Vec<_> = events.iter()
|
||||||
|
.filter(|e| {
|
||||||
|
// Extract year from German date format "Dienstag, 2. Januar 2024"
|
||||||
|
e.date.contains("2024") || e.date.contains("2025")
|
||||||
|
})
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
println!("📅 Events in 2024-2025 range: {}/{}",
|
||||||
|
date_range_events.len(), events.len());
|
||||||
|
|
||||||
|
// 2. Check importance filtering
|
||||||
|
let high_importance_count = events.iter()
|
||||||
|
.filter(|e| e.importance == "High")
|
||||||
|
.count();
|
||||||
|
println!("⭐ High importance events: {}/{}", high_importance_count, events.len());
|
||||||
|
|
||||||
|
// 3. Check data completeness
|
||||||
|
let complete_events = events.iter()
|
||||||
|
.filter(|e| {
|
||||||
|
!e.event.trim().is_empty() &&
|
||||||
|
!e.time.trim().is_empty() &&
|
||||||
|
!e.country.trim().is_empty() &&
|
||||||
|
(!e.actual.trim().is_empty() || !e.forecast.trim().is_empty() || !e.previous.trim().is_empty())
|
||||||
|
})
|
||||||
|
.count();
|
||||||
|
|
||||||
|
println!("✅ Complete events: {}/{}", complete_events, events.len());
|
||||||
|
|
||||||
|
// 4. Check description coverage
|
||||||
|
let events_with_descriptions = events.iter()
|
||||||
|
.filter(|e| !e.description.trim().is_empty())
|
||||||
|
.count();
|
||||||
|
println!("📝 Events with descriptions: {}/{}", events_with_descriptions, events.len());
|
||||||
|
|
||||||
|
// 5. Distribution analysis
|
||||||
|
use std::collections::HashMap;
|
||||||
|
let mut country_distribution: HashMap<String, usize> = HashMap::new();
|
||||||
|
let mut month_distribution: HashMap<String, usize> = HashMap::new();
|
||||||
|
|
||||||
|
for event in events {
|
||||||
|
*country_distribution.entry(event.country.clone()).or_insert(0) += 1;
|
||||||
|
|
||||||
|
// Extract month from German date
|
||||||
|
if let Some(month) = extract_month(&event.date) {
|
||||||
|
*month_distribution.entry(month).or_insert(0) += 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
println!("🌍 Country distribution: {:?}", country_distribution);
|
||||||
|
println!("📈 Month distribution: {:?}", month_distribution);
|
||||||
|
|
||||||
|
// 6. Sample output for manual inspection
|
||||||
|
println!("\n🔍 Sample events (first 5):");
|
||||||
|
for event in events.iter().take(5) {
|
||||||
|
println!(" • {} {}: {} - {} (Importance: {})",
|
||||||
|
event.date, event.time, event.country, event.event, event.importance);
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Finds a German month name inside `date_str`.
///
/// The names are tried in calendar order (Januar through Dezember) with a
/// plain, case-sensitive substring search; the first name that occurs
/// anywhere in `date_str` is returned as an owned `String`. Returns `None`
/// when no month name is present.
fn extract_month(date_str: &str) -> Option<String> {
    const GERMAN_MONTHS: [&str; 12] = [
        "Januar", "Februar", "März", "April", "Mai", "Juni",
        "Juli", "August", "September", "Oktober", "November", "Dezember",
    ];

    GERMAN_MONTHS
        .iter()
        .find(|&&name| date_str.contains(name))
        .map(|&name| name.to_owned())
}
|
||||||
|
|
||||||
#[tokio::main]
|
#[tokio::main]
|
||||||
async fn main() -> anyhow::Result<()> {
|
async fn main() -> anyhow::Result<()> {
|
||||||
let port = 9515; // pick a port you like
|
let port = 9515; // pick a port you like
|
||||||
@@ -400,7 +537,7 @@ async fn main() -> anyhow::Result<()> {
|
|||||||
let event_type_map = extract_event_descriptions_via_js(&client).await?;
|
let event_type_map = extract_event_descriptions_via_js(&client).await?;
|
||||||
|
|
||||||
// Merge descriptions with events
|
// Merge descriptions with events
|
||||||
let events_with_descriptions: Vec<EconomicEvent> = events.into_iter()
|
let events_with_descriptions: Vec<EconomicEvent> = events.clone().into_iter()
|
||||||
.map(|mut event| {
|
.map(|mut event| {
|
||||||
if let Some(description) = event_type_map.get(&event.event) {
|
if let Some(description) = event_type_map.get(&event.event) {
|
||||||
event.description = description.clone();
|
event.description = description.clone();
|
||||||
@@ -409,20 +546,22 @@ async fn main() -> anyhow::Result<()> {
|
|||||||
})
|
})
|
||||||
.collect();
|
.collect();
|
||||||
|
|
||||||
println!("Final results:");
|
// Run validation suite
|
||||||
for event in &events_with_descriptions {
|
validate_events(&events).await?;
|
||||||
if !event.description.is_empty() {
|
check_data_consistency(&events).await;
|
||||||
println!("{}: {} chars of description",
|
|
||||||
event.event, event.description.len());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/*println!("Collected {} event descriptions", event_type_map.len());
|
// Final summary
|
||||||
for (k, v) in &event_type_map {
|
println!("\n🎯 EXTRACTION SUMMARY:");
|
||||||
if !v.is_empty() {
|
println!(" • Total high-importance events: {}", events.len());
|
||||||
println!("{:?} => {} chars", k, v.len());
|
println!(" • Date range: 2024-01-01 to 2025-01-01");
|
||||||
}
|
println!(" • Data quality: {}% complete",
|
||||||
}*/
|
(events.iter().filter(|e| !e.event.trim().is_empty()).count() * 100) / events.len().max(1));
|
||||||
|
|
||||||
|
// Export for further analysis
|
||||||
|
if let Ok(json) = serde_json::to_string_pretty(&events) {
|
||||||
|
tokio::fs::write("economic_events.json", json).await?;
|
||||||
|
println!(" • Data exported to: economic_events.json");
|
||||||
|
}
|
||||||
|
|
||||||
// Wait for Ctrl+C
|
// Wait for Ctrl+C
|
||||||
shutdown_handle.await.ok();
|
shutdown_handle.await.ok();
|
||||||
|
|||||||
Reference in New Issue
Block a user