working scraping
This commit is contained in:
39
Cargo.lock
generated
39
Cargo.lock
generated
@@ -10,11 +10,21 @@ dependencies = [
|
|||||||
"chrono",
|
"chrono",
|
||||||
"fantoccini",
|
"fantoccini",
|
||||||
"futures",
|
"futures",
|
||||||
|
"regex",
|
||||||
"serde",
|
"serde",
|
||||||
"serde_json",
|
"serde_json",
|
||||||
"tokio",
|
"tokio",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "aho-corasick"
|
||||||
|
version = "1.1.4"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301"
|
||||||
|
dependencies = [
|
||||||
|
"memchr",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "android_system_properties"
|
name = "android_system_properties"
|
||||||
version = "0.1.5"
|
version = "0.1.5"
|
||||||
@@ -819,6 +829,35 @@ dependencies = [
|
|||||||
"bitflags",
|
"bitflags",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "regex"
|
||||||
|
version = "1.12.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "843bc0191f75f3e22651ae5f1e72939ab2f72a4bc30fa80a066bd66edefc24d4"
|
||||||
|
dependencies = [
|
||||||
|
"aho-corasick",
|
||||||
|
"memchr",
|
||||||
|
"regex-automata",
|
||||||
|
"regex-syntax",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "regex-automata"
|
||||||
|
version = "0.4.13"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "5276caf25ac86c8d810222b3dbb938e512c55c6831a10f3e6ed1c93b84041f1c"
|
||||||
|
dependencies = [
|
||||||
|
"aho-corasick",
|
||||||
|
"memchr",
|
||||||
|
"regex-syntax",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "regex-syntax"
|
||||||
|
version = "0.8.8"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "7a2d987857b319362043e95f5353c0535c1f58eec5336fdfcf626430af7def58"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "rustix"
|
name = "rustix"
|
||||||
version = "1.1.2"
|
version = "1.1.2"
|
||||||
|
|||||||
@@ -11,3 +11,4 @@ anyhow = "1.0"
|
|||||||
futures = "0.3"
|
futures = "0.3"
|
||||||
serde_json = "1.0"
|
serde_json = "1.0"
|
||||||
chrono = "0.4.42"
|
chrono = "0.4.42"
|
||||||
|
regex = "1.0"
|
||||||
File diff suppressed because it is too large
Load Diff
233
src/main.rs
233
src/main.rs
@@ -60,7 +60,30 @@ async fn extract_all_data_via_js(client: &fantoccini::Client) -> anyhow::Result<
|
|||||||
const cells = row.querySelectorAll('td');
|
const cells = row.querySelectorAll('td');
|
||||||
|
|
||||||
if (cells.length === 1 && cells[0].colSpan === 9) {
|
if (cells.length === 1 && cells[0].colSpan === 9) {
|
||||||
currentDate = cells[0].textContent.trim();
|
// This is a date header row - extract and parse the date
|
||||||
|
const dateText = cells[0].textContent.trim();
|
||||||
|
console.log('Found date header:', dateText);
|
||||||
|
|
||||||
|
// Convert German date to ISO format (YYYY-MM-DD)
|
||||||
|
const monthMap = {
|
||||||
|
'Januar': '01', 'Februar': '02', 'März': '03', 'April': '04',
|
||||||
|
'Mai': '05', 'Juni': '06', 'Juli': '07', 'August': '08',
|
||||||
|
'September': '09', 'Oktober': '10', 'November': '11', 'Dezember': '12'
|
||||||
|
};
|
||||||
|
|
||||||
|
// Extract date parts from German format "Montag, 30. April 2007"
|
||||||
|
const dateParts = dateText.match(/(\d{1,2})\.\s+([a-zA-Zäöüß]+)\s+(\d{4})/);
|
||||||
|
if (dateParts) {
|
||||||
|
const day = dateParts[1].padStart(2, '0');
|
||||||
|
const germanMonth = dateParts[2];
|
||||||
|
const year = dateParts[3];
|
||||||
|
const month = monthMap[germanMonth] || '01';
|
||||||
|
currentDate = `${year}-${month}-${day}`;
|
||||||
|
console.log('Converted date:', currentDate, 'from:', dateText);
|
||||||
|
} else {
|
||||||
|
console.log('Failed to parse date:', dateText);
|
||||||
|
currentDate = ''; // Reset if parsing fails
|
||||||
|
}
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -91,7 +114,7 @@ async fn extract_all_data_via_js(client: &fantoccini::Client) -> anyhow::Result<
|
|||||||
|
|
||||||
events.push({
|
events.push({
|
||||||
country: country,
|
country: country,
|
||||||
date: currentDate,
|
date: currentDate, // Now using ISO format date
|
||||||
time: time,
|
time: time,
|
||||||
event: eventName,
|
event: eventName,
|
||||||
actual: cells[7]?.textContent?.trim() || '',
|
actual: cells[7]?.textContent?.trim() || '',
|
||||||
@@ -104,6 +127,12 @@ async fn extract_all_data_via_js(client: &fantoccini::Client) -> anyhow::Result<
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
console.log('Total events extracted:', events.length);
|
||||||
|
if (events.length > 0) {
|
||||||
|
console.log('First event date:', events[0].date);
|
||||||
|
console.log('Last event date:', events[events.length - 1].date);
|
||||||
|
}
|
||||||
|
|
||||||
return events;
|
return events;
|
||||||
"#;
|
"#;
|
||||||
|
|
||||||
@@ -129,6 +158,25 @@ async fn extract_all_data_via_js(client: &fantoccini::Client) -> anyhow::Result<
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
println!("Extracted {} events (3 YELLOW stars ONLY) via JavaScript", events.len());
|
println!("Extracted {} events (3 YELLOW stars ONLY) via JavaScript", events.len());
|
||||||
|
|
||||||
|
// Debug: show date range of extracted events
|
||||||
|
if !events.is_empty() {
|
||||||
|
let dates: Vec<&str> = events.iter().map(|e| e.date.as_str()).filter(|d| !d.is_empty()).collect();
|
||||||
|
if !dates.is_empty() {
|
||||||
|
let min_date = dates.iter().min().unwrap_or(&"N/A");
|
||||||
|
let max_date = dates.iter().max().unwrap_or(&"N/A");
|
||||||
|
println!("📅 Extracted date range: {} to {}", min_date, max_date);
|
||||||
|
|
||||||
|
// Show sample of dates for debugging
|
||||||
|
println!("Sample dates:");
|
||||||
|
for (i, date) in dates.iter().take(5).enumerate() {
|
||||||
|
println!(" {}. {}", i + 1, date);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
println!("❌ No valid dates found in extracted events");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return Ok(events);
|
return Ok(events);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -321,19 +369,68 @@ fn extract_date_from_german_format(german_date: &str) -> Option<NaiveDate> {
|
|||||||
NaiveDate::parse_from_str(&english_date, "%A, %d. %B %Y").ok()
|
NaiveDate::parse_from_str(&english_date, "%A, %d. %B %Y").ok()
|
||||||
}
|
}
|
||||||
|
|
||||||
fn calculate_next_start_date(events: &[EconomicEvent]) -> String {
|
fn parse_german_date(german_date: &str) -> Option<NaiveDate> {
|
||||||
// Find the latest date in the extracted events
|
if german_date.trim().is_empty() {
|
||||||
let latest_date = events
|
return None;
|
||||||
.iter()
|
}
|
||||||
.filter_map(|e| extract_date_from_german_format(&e.date))
|
|
||||||
.max();
|
|
||||||
|
|
||||||
if let Some(latest) = latest_date {
|
// Map German month names to numbers
|
||||||
// Start from the day after the latest extracted date
|
let month_map = [
|
||||||
(latest + ChronoDuration::days(1)).format("%Y-%m-%d").to_string()
|
("Januar", 1), ("Februar", 2), ("März", 3), ("April", 4),
|
||||||
|
("Mai", 5), ("Juni", 6), ("Juli", 7), ("August", 8),
|
||||||
|
("September", 9), ("Oktober", 10), ("November", 11), ("Dezember", 12)
|
||||||
|
];
|
||||||
|
|
||||||
|
// Parse German format: "Montag, 30. April 2007"
|
||||||
|
let pattern = r"(\d{1,2})\.\s+([a-zA-Zäöüß]+)\s+(\d{4})";
|
||||||
|
let re = regex::Regex::new(pattern).unwrap();
|
||||||
|
|
||||||
|
if let Some(caps) = re.captures(german_date) {
|
||||||
|
let day = caps.get(1).unwrap().as_str().parse::<u32>().ok()?;
|
||||||
|
let german_month = caps.get(2).unwrap().as_str();
|
||||||
|
let year = caps.get(3).unwrap().as_str().parse::<i32>().ok()?;
|
||||||
|
|
||||||
|
// Find the month number
|
||||||
|
let month = month_map.iter()
|
||||||
|
.find(|(name, _)| *name == german_month)
|
||||||
|
.map(|(_, num)| *num)?;
|
||||||
|
|
||||||
|
NaiveDate::from_ymd_opt(year, month, day)
|
||||||
} else {
|
} else {
|
||||||
// Fallback: use current date
|
None
|
||||||
chrono::Local::now().format("%Y-%m-%d").to_string()
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn calculate_next_start_date(events: &[EconomicEvent]) -> Result<String, anyhow::Error> {
|
||||||
|
// Try to find dates in ISO format first
|
||||||
|
let iso_dates: Vec<NaiveDate> = events
|
||||||
|
.iter()
|
||||||
|
.filter_map(|e| NaiveDate::parse_from_str(&e.date, "%Y-%m-%d").ok())
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
if !iso_dates.is_empty() {
|
||||||
|
if let Some(latest) = iso_dates.iter().max() {
|
||||||
|
let next_date = (*latest + ChronoDuration::days(1)).format("%Y-%m-%d").to_string();
|
||||||
|
println!("📅 Calculated next start date from ISO: {} (from latest: {})", next_date, latest);
|
||||||
|
return Ok(next_date);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Fallback: parse German dates
|
||||||
|
println!("⚠️ No ISO dates found, trying to parse German dates...");
|
||||||
|
let german_dates: Vec<NaiveDate> = events
|
||||||
|
.iter()
|
||||||
|
.filter_map(|e| parse_german_date(&e.date))
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
if let Some(latest) = german_dates.iter().max() {
|
||||||
|
let next_date = (*latest + ChronoDuration::days(1)).format("%Y-%m-%d").to_string();
|
||||||
|
println!("📅 Calculated next start date from German: {} (from latest: {})", next_date, latest);
|
||||||
|
Ok(next_date)
|
||||||
|
} else {
|
||||||
|
// Final fallback: use manual date increment
|
||||||
|
println!("❌ No parseable dates found, using manual increment");
|
||||||
|
Err(anyhow::anyhow!("No parseable dates found"))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -342,16 +439,28 @@ async fn scrape_all_events_with_chunking(
|
|||||||
start_date: &str,
|
start_date: &str,
|
||||||
end_date: &str
|
end_date: &str
|
||||||
) -> anyhow::Result<Vec<EconomicEvent>> {
|
) -> anyhow::Result<Vec<EconomicEvent>> {
|
||||||
let mut all_events = Vec::new();
|
let json_export_now = chrono::Local::now().format("%Y%m%d_%H%M%S");
|
||||||
let mut current_start = start_date.to_string();
|
|
||||||
let max_attempts = 50; // Prevent infinite loops
|
|
||||||
|
|
||||||
for attempt in 0..max_attempts {
|
let mut all_events: Vec<EconomicEvent> = Vec::new();
|
||||||
println!("🚀 Chunk {}: {} to {}", attempt + 1, current_start, end_date);
|
let mut current_start = start_date.to_string();
|
||||||
|
let mut attempts = 0;
|
||||||
|
let max_attempts = 300;
|
||||||
|
|
||||||
|
loop {
|
||||||
|
attempts += 1;
|
||||||
|
if attempts > max_attempts {
|
||||||
|
println!("⚠️ Reached maximum attempts ({})", max_attempts);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
println!("🚀 Chunk {}: {} to {}", attempts, current_start, end_date);
|
||||||
|
|
||||||
// Set dates for current chunk
|
// Set dates for current chunk
|
||||||
set_date_range(client, &current_start, end_date).await?;
|
set_date_range(client, &current_start, end_date).await?;
|
||||||
|
|
||||||
|
// Wait a bit longer for table to load
|
||||||
|
sleep(Duration::from_secs(3)).await;
|
||||||
|
|
||||||
// Extract events
|
// Extract events
|
||||||
let chunk_events = extract_all_data_via_js(client).await?;
|
let chunk_events = extract_all_data_via_js(client).await?;
|
||||||
|
|
||||||
@@ -361,36 +470,60 @@ async fn scrape_all_events_with_chunking(
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Add to total
|
// Add to total
|
||||||
|
let chunk_count = chunk_events.len();
|
||||||
all_events.extend(chunk_events.clone());
|
all_events.extend(chunk_events.clone());
|
||||||
|
|
||||||
println!("📊 Chunk {}: {} events (Total: {})",
|
println!("📊 Chunk {}: {} events (Total: {})",
|
||||||
attempt + 1, chunk_events.len(), all_events.len());
|
attempts, chunk_count, all_events.len());
|
||||||
|
|
||||||
// Check if we hit the limit and need to continue
|
// Debug: check what dates we got
|
||||||
if chunk_events.len() < 240 {
|
let sample_dates: Vec<&str> = chunk_events.iter()
|
||||||
println!("✅ Reached end of data. Completed!");
|
.map(|e| e.date.as_str())
|
||||||
break;
|
.filter(|d| !d.is_empty())
|
||||||
}
|
.take(3)
|
||||||
|
.collect();
|
||||||
|
println!(" Sample dates in chunk: {:?}", sample_dates);
|
||||||
|
|
||||||
// Calculate next start date
|
// Calculate next start date
|
||||||
let next_start = calculate_next_start_date(&chunk_events);
|
match calculate_next_start_date(&chunk_events) {
|
||||||
|
Ok(next_start) => {
|
||||||
if next_start > end_date.to_string() {
|
if next_start > end_date.to_string() {
|
||||||
println!("✅ Reached end date. Completed!");
|
println!("✅ Reached end date. Completed!");
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
current_start = next_start;
|
current_start = next_start;
|
||||||
|
}
|
||||||
// Small delay between requests to be polite
|
Err(_) => {
|
||||||
sleep(Duration::from_secs(2)).await;
|
println!("❌ Could not calculate next start date. Stopping.");
|
||||||
|
break;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Remove duplicates (in case of overlapping chunks)
|
// Small delay between requests
|
||||||
|
sleep(Duration::from_secs(2)).await;
|
||||||
|
|
||||||
|
// Export chunk
|
||||||
|
if let Ok(json) = serde_json::to_string_pretty(&chunk_events) {
|
||||||
|
let filename = format!("economic_events_{}_chunk_{}.json", json_export_now, attempts);
|
||||||
|
tokio::fs::write(&filename, json).await?;
|
||||||
|
println!(" Chunk data exported to: {}", filename);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Remove duplicates
|
||||||
|
let initial_count = all_events.len();
|
||||||
|
all_events.sort_by(|a, b| {
|
||||||
|
a.date.cmp(&b.date)
|
||||||
|
.then(a.time.cmp(&b.time))
|
||||||
|
.then(a.event.cmp(&b.event))
|
||||||
|
});
|
||||||
all_events.dedup_by(|a, b| {
|
all_events.dedup_by(|a, b| {
|
||||||
a.date == b.date && a.time == b.time && a.event == b.event
|
a.date == b.date && a.time == b.time && a.event == b.event
|
||||||
});
|
});
|
||||||
|
|
||||||
println!("🎯 FINAL: Collected {} unique events", all_events.len());
|
println!("🎯 FINAL: Collected {} unique events (removed {} duplicates)",
|
||||||
|
all_events.len(), initial_count - all_events.len());
|
||||||
|
|
||||||
Ok(all_events)
|
Ok(all_events)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -431,9 +564,10 @@ async fn set_date_range(client: &fantoccini::Client, start: &str, end: &str) ->
|
|||||||
).await?.as_str().unwrap_or_default().to_string();
|
).await?.as_str().unwrap_or_default().to_string();
|
||||||
|
|
||||||
if from_date_value == start && to_date_value == end {
|
if from_date_value == start && to_date_value == end {
|
||||||
println!("Dates set correctly");
|
println!(" Dates set correctly");
|
||||||
} else {
|
} else {
|
||||||
println!("Date not set correctly");
|
println!(" ❌ Date not set correctly. Expected: {}-{}, Got: {}-{}",
|
||||||
|
start, end, from_date_value, to_date_value);
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
@@ -441,15 +575,14 @@ async fn set_date_range(client: &fantoccini::Client, start: &str, end: &str) ->
|
|||||||
|
|
||||||
#[tokio::main]
|
#[tokio::main]
|
||||||
async fn main() -> anyhow::Result<()> {
|
async fn main() -> anyhow::Result<()> {
|
||||||
let port = 9515; // pick a port you like
|
let port = 9515;
|
||||||
let mut chromedriver = start_chromedriver(port);
|
let mut chromedriver = start_chromedriver(port);
|
||||||
sleep(Duration::from_secs(1)).await; // wait for ChromeDriver to start
|
sleep(Duration::from_secs(1)).await;
|
||||||
|
|
||||||
// Chrome options (non-headless so it opens)
|
// Chrome options
|
||||||
let caps_value = serde_json::json!({
|
let caps_value = serde_json::json!({
|
||||||
"goog:chromeOptions": {
|
"goog:chromeOptions": {
|
||||||
"args": [
|
"args": [
|
||||||
//"--headless",
|
|
||||||
"--disable-gpu",
|
"--disable-gpu",
|
||||||
"--disable-notifications",
|
"--disable-notifications",
|
||||||
"--disable-popup-blocking",
|
"--disable-popup-blocking",
|
||||||
@@ -482,6 +615,18 @@ async fn main() -> anyhow::Result<()> {
|
|||||||
let url = "https://www.finanzen.net/termine/wirtschaftsdaten/";
|
let url = "https://www.finanzen.net/termine/wirtschaftsdaten/";
|
||||||
client.goto(url).await?;
|
client.goto(url).await?;
|
||||||
|
|
||||||
|
// Dismiss overlays
|
||||||
|
dismiss_overlays(&client).await?;
|
||||||
|
|
||||||
|
// Click the high importance tab
|
||||||
|
if let Ok(tab) = client.find(Locator::Css(r#"div[data-sg-tab-item="teletrader-dates-three-stars"]"#)).await {
|
||||||
|
tab.click().await?;
|
||||||
|
println!("High importance tab clicked");
|
||||||
|
sleep(Duration::from_secs(2)).await;
|
||||||
|
} else {
|
||||||
|
println!("High importance tab not found");
|
||||||
|
}
|
||||||
|
|
||||||
// Use chunking to extract all events across the entire date range
|
// Use chunking to extract all events across the entire date range
|
||||||
let events = scrape_all_events_with_chunking(&client, "2007-02-13", "2025-12-01").await?;
|
let events = scrape_all_events_with_chunking(&client, "2007-02-13", "2025-12-01").await?;
|
||||||
|
|
||||||
@@ -496,23 +641,23 @@ async fn main() -> anyhow::Result<()> {
|
|||||||
// Final summary
|
// Final summary
|
||||||
println!("\n🎯 EXTRACTION SUMMARY:");
|
println!("\n🎯 EXTRACTION SUMMARY:");
|
||||||
println!(" • Total high-importance events: {}", events.len());
|
println!(" • Total high-importance events: {}", events.len());
|
||||||
println!(" • Requested range: 2024-01-01 to 2025-01-01");
|
println!(" • Requested range: 2007-02-13 to 2025-12-01");
|
||||||
println!(" • Actual extracted range: {} to {}", actual_date_range.0, actual_date_range.1);
|
println!(" • Actual extracted range: {} to {}", actual_date_range.0, actual_date_range.1);
|
||||||
println!(" • Data extracted until: {}", current_date);
|
println!(" • Data extracted until: {}", current_date);
|
||||||
println!(" • Data quality: {}% complete",
|
println!(" • Data quality: {}% complete",
|
||||||
(events.iter().filter(|e| !e.event.trim().is_empty()).count() * 100) / events.len().max(1));
|
(events.iter().filter(|e| !e.event.trim().is_empty()).count() * 100) / events.len().max(1));
|
||||||
|
|
||||||
// Check if we need more runs
|
// Check coverage
|
||||||
if events.len() >= 240 {
|
if actual_date_range.1 < "2025-12-01".to_string() {
|
||||||
println!("⚠️ WARNING: Hit maximum events limit (240). Need multiple runs to get all data.");
|
println!("⚠️ WARNING: Did not reach end date. Last extracted date: {}", actual_date_range.1);
|
||||||
println!(" • Next run should start from: {}", calculate_next_start_date(&events));
|
println!(" • Next run should start from: {}", calculate_next_start_date(&events).unwrap_or_else(|_| actual_date_range.1));
|
||||||
}
|
}
|
||||||
|
|
||||||
// Export for further analysis
|
// Export for further analysis
|
||||||
if let Ok(json) = serde_json::to_string_pretty(&events) {
|
if let Ok(json) = serde_json::to_string_pretty(&events) {
|
||||||
let filename = format!("economic_events_{}.json", chrono::Local::now().format("%Y%m%d_%H%M%S"));
|
let filename = format!("economic_events_{}_combined.json", chrono::Local::now().format("%Y%m%d_%H%M%S"));
|
||||||
tokio::fs::write(&filename, json).await?;
|
tokio::fs::write(&filename, json).await?;
|
||||||
println!(" • Data exported to: {}", filename);
|
println!(" • Combined data exported to: {}", filename);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Wait for Ctrl+C
|
// Wait for Ctrl+C
|
||||||
|
|||||||
Reference in New Issue
Block a user