extracting data from js

This commit is contained in:
2025-11-16 15:34:30 +01:00
parent 2604caab0e
commit 59aad09f71
3 changed files with 368 additions and 74 deletions

View File

@@ -24,85 +24,172 @@ fn start_chromedriver(port: u16) -> std::process::Child {
.expect("Failed to start ChromeDriver")
}
async fn hide_contentpass_overlay(client: &fantoccini::Client) -> anyhow::Result<()> {
for _ in 0..20 {
let hidden: bool = client.execute(
async fn dismiss_overlays(client: &fantoccini::Client) -> anyhow::Result<()> {
// Single strategy: wait for and remove iframe
for _ in 0..10 {
let removed: bool = client.execute(
r#"(() => {
const iframe = document.querySelector('iframe[title="Contentpass First Layer"]');
if (!iframe) return true;
iframe.style.display = 'none';
iframe.style.visibility = 'hidden';
if (iframe && iframe.parentNode) {
iframe.parentNode.removeChild(iframe);
return true;
}
return false;
})()"#,
vec![]
).await?.as_bool().unwrap_or(false);
if hidden { break; }
tokio::time::sleep(Duration::from_millis(500)).await;
if removed { break; }
sleep(Duration::from_millis(500)).await;
}
Ok(())
}
async fn scrape_events(client: &fantoccini::Client) -> anyhow::Result<Vec<EconomicEvent>> {
// Find all rows
let rows = client.find_all(Locator::Css(
"#TeletraderForm table tbody tr"
)).await?;
let mut events = vec![];
let mut empty_count = 0;
let mut useful_count = 0;
let mut i = 0;
while i < rows.len() {
let cells = rows[i].find_all(Locator::Css("td")).await?;
let texts: Vec<String> = join_all(
cells.iter().map(|c| async move { c.text().await.unwrap_or_default() })
).await;
let mut description = String::new();
// Try to get the description from the next row if it exists
if i + 1 < rows.len() {
if let Ok(desc_row) = rows[i + 1].find(Locator::Css("td p")).await {
description = desc_row.text().await.unwrap_or_default();
/// Extracts calendar events from the currently loaded page with one in-browser script.
///
/// The script walks every row matched by `#TeletraderForm table tbody tr`:
///
/// * A row with a single cell whose `colSpan` is 9 sets the date applied to the
///   event rows that follow it.
/// * A row with at least 8 cells is read as an event: time (cell 0), country (2),
///   importance (3), event name (4), previous (5), forecast (6), actual (7).
///   Rows with an empty time, country or event name are skipped.
/// * Importance is the number of `icon--star` occurrences in the importance cell's
///   HTML; events with fewer than `min_stars` are dropped. Three stars map to
///   `"High"`, two to `"Medium"`, anything else to `"Low"`.
/// * The description is the text of the first `<p>` in the following row, when that
///   row has exactly one cell or its first cell has `colSpan` 8.
///
/// Returns an empty vector when the script result is not a JSON array. Array entries
/// that are not objects are skipped; missing or non-string fields become empty strings.
///
/// # Errors
///
/// Propagates the error returned by `client.execute` when the script cannot be run.
async fn extract_all_data_via_js(client: &fantoccini::Client, min_stars: u8) -> anyhow::Result<Vec<EconomicEvent>> {
    println!("Extracting events with {}+ stars via JavaScript...", min_stars);

    // `min_stars` is inlined with `format!`, so every literal JS brace is doubled.
    // `nextCells[0]?.colSpan` keeps a following row without any `td` from raising a
    // TypeError, which would otherwise fail the whole extraction.
    let extraction_script = format!(r#"
        const events = [];
        let currentDate = '';
        const minStars = {};
        const rows = document.querySelectorAll('#TeletraderForm table tbody tr');
        for (let i = 0; i < rows.length; i++) {{
            const row = rows[i];
            const cells = row.querySelectorAll('td');
            if (cells.length === 1 && cells[0].colSpan === 9) {{
                currentDate = cells[0].textContent.trim();
                continue;
            }}
            if (cells.length >= 8) {{
                const time = cells[0]?.textContent?.trim() || '';
                const country = cells[2]?.textContent?.trim() || '';
                const eventName = cells[4]?.textContent?.trim() || '';
                if (!time || !country || !eventName) continue;
                const importanceHtml = cells[3]?.innerHTML || '';
                const starCount = (importanceHtml.match(/icon--star/g) || []).length;
                // Filter by minimum stars
                if (starCount >= minStars) {{
                    let description = '';
                    if (i + 1 < rows.length) {{
                        const nextRow = rows[i + 1];
                        const nextCells = nextRow.querySelectorAll('td');
                        if (nextCells.length === 1 || nextCells[0]?.colSpan === 8) {{
                            const descPara = nextRow.querySelector('p');
                            if (descPara) {{
                                description = descPara.textContent?.trim() || '';
                            }}
                        }}
                    }}
                    const importanceLevel = starCount === 3 ? 'High' : starCount === 2 ? 'Medium' : 'Low';
                    events.push({{
                        country: country,
                        date: currentDate,
                        time: time,
                        event: eventName,
                        actual: cells[7]?.textContent?.trim() || '',
                        forecast: cells[6]?.textContent?.trim() || '',
                        previous: cells[5]?.textContent?.trim() || '',
                        importance: importanceLevel,
                        description: description
                    }});
                }}
            }}
        }}
        return events;
    "#, min_stars);

    let result = client.execute(&extraction_script, vec![]).await?;

    // Convert the returned JSON array into `EconomicEvent` structs.
    let events: Vec<EconomicEvent> = match result.as_array() {
        Some(events_array) => events_array
            .iter()
            .filter_map(|event_value| event_value.as_object())
            .map(|event_obj| {
                // A missing key or a non-string value falls back to an empty string.
                let field = |key: &str| -> String {
                    event_obj
                        .get(key)
                        .and_then(|v| v.as_str())
                        .unwrap_or("")
                        .to_string()
                };
                EconomicEvent {
                    country: field("country"),
                    date: field("date"),
                    time: field("time"),
                    event: field("event"),
                    actual: field("actual"),
                    forecast: field("forecast"),
                    previous: field("previous"),
                    importance: field("importance"),
                    description: field("description"),
                }
            })
            .collect(),
        // The script did not return an array: report no events, without the summary line.
        None => return Ok(vec![]),
    };

    println!("Extracted {} events via JavaScript", events.len());
    Ok(events)
}
let event = EconomicEvent {
country: texts.get(2).cloned().unwrap_or_default(),
date: "".to_string(),
time: texts.get(0).cloned().unwrap_or_default(),
event: texts.get(4).cloned().unwrap_or_default(),
actual: texts.get(7).cloned().unwrap_or_default(),
forecast: texts.get(6).cloned().unwrap_or_default(),
previous: texts.get(5).cloned().unwrap_or_default(),
importance: texts.get(3).cloned().unwrap_or_default(),
description,
};
if event.event.trim().is_empty() && event.country.trim().is_empty() {
empty_count += 1;
} else {
useful_count += 1;
async fn extract_event_descriptions_via_js(client: &fantoccini::Client) -> anyhow::Result<HashMap<String, String>> {
println!("Extracting event descriptions via JavaScript (3-star events only)...");
let description_script = r#"
const descriptions = {};
// Find all description rows (they have class starting with 'teletrader')
const descRows = document.querySelectorAll('tr td[class*="teletrader"]');
for (const descRow of descRows) {
// Get the description text from the <p> tag
const descPara = descRow.querySelector('p');
if (descPara) {
const description = descPara.textContent?.trim() || '';
// Find the corresponding event name by looking for the row above
let eventRow = descRow.parentElement.previousElementSibling;
if (eventRow) {
// Check if this is a 3-star event
const importanceCell = eventRow.querySelector('td:nth-child(4)');
if (importanceCell) {
const starCount = (importanceCell.innerHTML.match(/icon--star/g) || []).length;
// Only process 3-star events
if (starCount === 3) {
const eventCell = eventRow.querySelector('td:nth-child(5)');
if (eventCell) {
const eventName = eventCell.textContent?.trim() || '';
if (eventName) {
descriptions[eventName] = description;
}
}
}
}
}
}
}
return descriptions;
"#;
let result = client.execute(description_script, vec![]).await?;
let mut event_type_map = HashMap::new();
if let Some(desc_obj) = result.as_object() {
for (key, value) in desc_obj {
if let Some(desc_text) = value.as_str() {
event_type_map.entry(key.clone()).or_insert(desc_text.to_string());
}
}
events.push(event);
i += 2; // skip the description row
}
println!("Total events found: {}", events.len());
println!("Useful events: {}", useful_count);
println!("Empty events: {}", empty_count);
for e in events.iter().filter(|ev| !ev.event.trim().is_empty() && !ev.country.trim().is_empty()) {
println!("{:?}", e);
}
/*for e in &events {
println!("{:#?}", e);
}*/
Ok(events)
println!("Extracted {} event descriptions (3-star only)", event_type_map.len());
Ok(event_type_map)
}
#[tokio::main]
@@ -246,7 +333,7 @@ async fn main() -> anyhow::Result<()> {
}
// Hide any reappearing overlay
hide_contentpass_overlay(&client).await?;
dismiss_overlays(&client).await?;
// Wait for the tab to appear and click it
if let Ok(_) = client.find(Locator::Css(
@@ -272,7 +359,7 @@ async fn main() -> anyhow::Result<()> {
println!("Found {} table rows", rows.len());
// HashMap to store "Termin" -> description
let mut termin_map: HashMap<String, String> = HashMap::new();
let mut event_type_map: HashMap<String, String> = HashMap::new();
let mut i = 0;
while i < rows.len() {
@@ -293,25 +380,49 @@ async fn main() -> anyhow::Result<()> {
// Get the hidden description
let desc_cell = next_row.find(Locator::Css("td")).await?;
let desc_text = desc_cell.text().await.unwrap_or_default();
termin_map.insert(termin_text.clone(), desc_text);
event_type_map.insert(termin_text.clone(), desc_text);
i += 1; // skip next row since it's the hidden description
} else {
termin_map.insert(termin_text.clone(), "".to_string());
event_type_map.insert(termin_text.clone(), "".to_string());
}
} else {
termin_map.insert(termin_text.clone(), "".to_string());
event_type_map.insert(termin_text.clone(), "".to_string());
}
}
i += 1;
}
let events = scrape_events(&client).await?;
// Extract using JavaScript
let events = extract_all_data_via_js(&client, 3).await?;
println!("Collected {} Termin entries", termin_map.len());
for (k, v) in &termin_map {
println!("{:?} => {:?}", k, v);
// Extract descriptions using JavaScript
let event_type_map = extract_event_descriptions_via_js(&client).await?;
// Merge descriptions with events
let events_with_descriptions: Vec<EconomicEvent> = events.into_iter()
.map(|mut event| {
if let Some(description) = event_type_map.get(&event.event) {
event.description = description.clone();
}
event
})
.collect();
println!("Final results:");
for event in &events_with_descriptions {
if !event.description.is_empty() {
println!("{}: {} chars of description",
event.event, event.description.len());
}
}
/*println!("Collected {} event descriptions", event_type_map.len());
for (k, v) in &event_type_map {
if !v.is_empty() {
println!("{:?} => {} chars", k, v.len());
}
}*/
// Wait for Ctrl+C
shutdown_handle.await.ok();