extracting data from js

This commit is contained in:
2025-11-16 15:34:30 +01:00
parent 2604caab0e
commit 59aad09f71
3 changed files with 368 additions and 74 deletions

182
Cargo.lock generated
View File

@@ -7,6 +7,7 @@ name = "WebScraper"
version = "0.1.0" version = "0.1.0"
dependencies = [ dependencies = [
"anyhow", "anyhow",
"chrono",
"fantoccini", "fantoccini",
"futures", "futures",
"serde", "serde",
@@ -14,6 +15,15 @@ dependencies = [
"tokio", "tokio",
] ]
[[package]]
name = "android_system_properties"
version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311"
dependencies = [
"libc",
]
[[package]] [[package]]
name = "anyhow" name = "anyhow"
version = "1.0.100" version = "1.0.100"
@@ -26,6 +36,12 @@ version = "1.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0" checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0"
[[package]]
name = "autocfg"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8"
[[package]] [[package]]
name = "base64" name = "base64"
version = "0.21.7" version = "0.21.7"
@@ -44,6 +60,12 @@ version = "2.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "812e12b5285cc515a9c72a5c1d3b6d46a19dac5acfef5265968c166106e31dd3" checksum = "812e12b5285cc515a9c72a5c1d3b6d46a19dac5acfef5265968c166106e31dd3"
[[package]]
name = "bumpalo"
version = "3.19.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "46c5e41b57b8bba42a04676d81cb89e9ee8e859a1a66f80a5a72e1cb76b34d43"
[[package]] [[package]]
name = "bytes" name = "bytes"
version = "1.11.0" version = "1.11.0"
@@ -66,6 +88,19 @@ version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801"
[[package]]
name = "chrono"
version = "0.4.42"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "145052bdd345b87320e369255277e3fb5152762ad123a901ef5c262dd38fe8d2"
dependencies = [
"iana-time-zone",
"js-sys",
"num-traits",
"wasm-bindgen",
"windows-link",
]
[[package]] [[package]]
name = "cookie" name = "cookie"
version = "0.16.2" version = "0.16.2"
@@ -409,6 +444,30 @@ dependencies = [
"tracing", "tracing",
] ]
[[package]]
name = "iana-time-zone"
version = "0.1.64"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "33e57f83510bb73707521ebaffa789ec8caf86f9657cad665b092b581d40e9fb"
dependencies = [
"android_system_properties",
"core-foundation-sys",
"iana-time-zone-haiku",
"js-sys",
"log",
"wasm-bindgen",
"windows-core",
]
[[package]]
name = "iana-time-zone-haiku"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f"
dependencies = [
"cc",
]
[[package]] [[package]]
name = "icu_collections" name = "icu_collections"
version = "2.1.1" version = "2.1.1"
@@ -517,6 +576,16 @@ version = "1.0.15"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c"
[[package]]
name = "js-sys"
version = "0.3.82"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b011eec8cc36da2aab2d5cff675ec18454fad408585853910a202391cf9f8e65"
dependencies = [
"once_cell",
"wasm-bindgen",
]
[[package]] [[package]]
name = "libc" name = "libc"
version = "0.2.177" version = "0.2.177"
@@ -596,6 +665,15 @@ version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9" checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9"
[[package]]
name = "num-traits"
version = "0.2.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841"
dependencies = [
"autocfg",
]
[[package]] [[package]]
name = "once_cell" name = "once_cell"
version = "1.21.3" version = "1.21.3"
@@ -754,6 +832,12 @@ dependencies = [
"windows-sys 0.61.2", "windows-sys 0.61.2",
] ]
[[package]]
name = "rustversion"
version = "1.0.22"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d"
[[package]] [[package]]
name = "ryu" name = "ryu"
version = "1.0.20" version = "1.0.20"
@@ -1115,6 +1199,51 @@ dependencies = [
"wit-bindgen", "wit-bindgen",
] ]
[[package]]
name = "wasm-bindgen"
version = "0.2.105"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "da95793dfc411fbbd93f5be7715b0578ec61fe87cb1a42b12eb625caa5c5ea60"
dependencies = [
"cfg-if",
"once_cell",
"rustversion",
"wasm-bindgen-macro",
"wasm-bindgen-shared",
]
[[package]]
name = "wasm-bindgen-macro"
version = "0.2.105"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "04264334509e04a7bf8690f2384ef5265f05143a4bff3889ab7a3269adab59c2"
dependencies = [
"quote",
"wasm-bindgen-macro-support",
]
[[package]]
name = "wasm-bindgen-macro-support"
version = "0.2.105"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "420bc339d9f322e562942d52e115d57e950d12d88983a14c79b86859ee6c7ebc"
dependencies = [
"bumpalo",
"proc-macro2",
"quote",
"syn",
"wasm-bindgen-shared",
]
[[package]]
name = "wasm-bindgen-shared"
version = "0.2.105"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "76f218a38c84bcb33c25ec7059b07847d465ce0e0a76b995e134a45adcb6af76"
dependencies = [
"unicode-ident",
]
[[package]] [[package]]
name = "webdriver" name = "webdriver"
version = "0.50.0" version = "0.50.0"
@@ -1135,12 +1264,65 @@ dependencies = [
"url", "url",
] ]
[[package]]
name = "windows-core"
version = "0.62.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b8e83a14d34d0623b51dce9581199302a221863196a1dde71a7663a4c2be9deb"
dependencies = [
"windows-implement",
"windows-interface",
"windows-link",
"windows-result",
"windows-strings",
]
[[package]]
name = "windows-implement"
version = "0.60.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "053e2e040ab57b9dc951b72c264860db7eb3b0200ba345b4e4c3b14f67855ddf"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "windows-interface"
version = "0.59.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3f316c4a2570ba26bbec722032c4099d8c8bc095efccdc15688708623367e358"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]] [[package]]
name = "windows-link" name = "windows-link"
version = "0.2.1" version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5"
[[package]]
name = "windows-result"
version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7781fa89eaf60850ac3d2da7af8e5242a5ea78d1a11c49bf2910bb5a73853eb5"
dependencies = [
"windows-link",
]
[[package]]
name = "windows-strings"
version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7837d08f69c77cf6b07689544538e017c1bfcf57e34b4c0ff58e6c2cd3b37091"
dependencies = [
"windows-link",
]
[[package]] [[package]]
name = "windows-sys" name = "windows-sys"
version = "0.60.2" version = "0.60.2"

View File

@@ -10,3 +10,4 @@ serde = { version = "1", features = ["derive"] }
anyhow = "1.0" anyhow = "1.0"
futures = "0.3" futures = "0.3"
serde_json = "1.0" serde_json = "1.0"
chrono = "0.4.42"

View File

@@ -24,85 +24,172 @@ fn start_chromedriver(port: u16) -> std::process::Child {
.expect("Failed to start ChromeDriver") .expect("Failed to start ChromeDriver")
} }
async fn hide_contentpass_overlay(client: &fantoccini::Client) -> anyhow::Result<()> { async fn dismiss_overlays(client: &fantoccini::Client) -> anyhow::Result<()> {
for _ in 0..20 { // Single strategy: wait for and remove iframe
let hidden: bool = client.execute( for _ in 0..10 {
let removed: bool = client.execute(
r#"(() => { r#"(() => {
const iframe = document.querySelector('iframe[title="Contentpass First Layer"]'); const iframe = document.querySelector('iframe[title="Contentpass First Layer"]');
if (!iframe) return true; if (iframe && iframe.parentNode) {
iframe.style.display = 'none'; iframe.parentNode.removeChild(iframe);
iframe.style.visibility = 'hidden'; return true;
}
return false; return false;
})()"#, })()"#,
vec![] vec![]
).await?.as_bool().unwrap_or(false); ).await?.as_bool().unwrap_or(false);
if hidden { break; } if removed { break; }
tokio::time::sleep(Duration::from_millis(500)).await; sleep(Duration::from_millis(500)).await;
} }
Ok(()) Ok(())
} }
async fn scrape_events(client: &fantoccini::Client) -> anyhow::Result<Vec<EconomicEvent>> { async fn extract_all_data_via_js(client: &fantoccini::Client, min_stars: u8) -> anyhow::Result<Vec<EconomicEvent>> {
// Find all rows println!("Extracting events with {}+ stars via JavaScript...", min_stars);
let rows = client.find_all(Locator::Css(
"#TeletraderForm table tbody tr"
)).await?;
let mut events = vec![]; let extraction_script = format!(r#"
let mut empty_count = 0; const events = [];
let mut useful_count = 0; let currentDate = '';
const minStars = {};
let mut i = 0; const rows = document.querySelectorAll('#TeletraderForm table tbody tr');
while i < rows.len() {
let cells = rows[i].find_all(Locator::Css("td")).await?;
let texts: Vec<String> = join_all(
cells.iter().map(|c| async move { c.text().await.unwrap_or_default() })
).await;
let mut description = String::new(); for (let i = 0; i < rows.length; i++) {{
// Try to get the description from the next row if it exists const row = rows[i];
if i + 1 < rows.len() { const cells = row.querySelectorAll('td');
if let Ok(desc_row) = rows[i + 1].find(Locator::Css("td p")).await {
description = desc_row.text().await.unwrap_or_default();
}
}
if (cells.length === 1 && cells[0].colSpan === 9) {{
currentDate = cells[0].textContent.trim();
continue;
}}
if (cells.length >= 8) {{
const time = cells[0]?.textContent?.trim() || '';
const country = cells[2]?.textContent?.trim() || '';
const eventName = cells[4]?.textContent?.trim() || '';
if (!time || !country || !eventName) continue;
const importanceHtml = cells[3]?.innerHTML || '';
const starCount = (importanceHtml.match(/icon--star/g) || []).length;
// Filter by minimum stars
if (starCount >= minStars) {{
let description = '';
if (i + 1 < rows.length) {{
const nextRow = rows[i + 1];
const nextCells = nextRow.querySelectorAll('td');
if (nextCells.length === 1 || nextCells[0].colSpan === 8) {{
const descPara = nextRow.querySelector('p');
if (descPara) {{
description = descPara.textContent?.trim() || '';
}}
}}
}}
const importanceLevel = starCount === 3 ? 'High' : starCount === 2 ? 'Medium' : 'Low';
events.push({{
country: country,
date: currentDate,
time: time,
event: eventName,
actual: cells[7]?.textContent?.trim() || '',
forecast: cells[6]?.textContent?.trim() || '',
previous: cells[5]?.textContent?.trim() || '',
importance: importanceLevel,
description: description
}});
}}
}}
}}
return events;
"#, min_stars);
let result = client.execute(&extraction_script, vec![]).await?;
// Parse the JSON result into EconomicEvent structs
if let Some(events_array) = result.as_array() {
let mut events = Vec::new();
for event_value in events_array {
if let Some(event_obj) = event_value.as_object() {
let event = EconomicEvent { let event = EconomicEvent {
country: texts.get(2).cloned().unwrap_or_default(), country: event_obj.get("country").and_then(|v| v.as_str()).unwrap_or("").to_string(),
date: "".to_string(), date: event_obj.get("date").and_then(|v| v.as_str()).unwrap_or("").to_string(),
time: texts.get(0).cloned().unwrap_or_default(), time: event_obj.get("time").and_then(|v| v.as_str()).unwrap_or("").to_string(),
event: texts.get(4).cloned().unwrap_or_default(), event: event_obj.get("event").and_then(|v| v.as_str()).unwrap_or("").to_string(),
actual: texts.get(7).cloned().unwrap_or_default(), actual: event_obj.get("actual").and_then(|v| v.as_str()).unwrap_or("").to_string(),
forecast: texts.get(6).cloned().unwrap_or_default(), forecast: event_obj.get("forecast").and_then(|v| v.as_str()).unwrap_or("").to_string(),
previous: texts.get(5).cloned().unwrap_or_default(), previous: event_obj.get("previous").and_then(|v| v.as_str()).unwrap_or("").to_string(),
importance: texts.get(3).cloned().unwrap_or_default(), importance: event_obj.get("importance").and_then(|v| v.as_str()).unwrap_or("").to_string(),
description, description: event_obj.get("description").and_then(|v| v.as_str()).unwrap_or("").to_string(),
}; };
if event.event.trim().is_empty() && event.country.trim().is_empty() {
empty_count += 1;
} else {
useful_count += 1;
}
events.push(event); events.push(event);
i += 2; // skip the description row }
}
println!("Extracted {} events via JavaScript", events.len());
return Ok(events);
} }
println!("Total events found: {}", events.len()); Ok(vec![])
println!("Useful events: {}", useful_count); }
println!("Empty events: {}", empty_count);
for e in events.iter().filter(|ev| !ev.event.trim().is_empty() && !ev.country.trim().is_empty()) { async fn extract_event_descriptions_via_js(client: &fantoccini::Client) -> anyhow::Result<HashMap<String, String>> {
println!("{:?}", e); println!("Extracting event descriptions via JavaScript (3-star events only)...");
let description_script = r#"
const descriptions = {};
// Find all description rows (they have class starting with 'teletrader')
const descRows = document.querySelectorAll('tr td[class*="teletrader"]');
for (const descRow of descRows) {
// Get the description text from the <p> tag
const descPara = descRow.querySelector('p');
if (descPara) {
const description = descPara.textContent?.trim() || '';
// Find the corresponding event name by looking for the row above
let eventRow = descRow.parentElement.previousElementSibling;
if (eventRow) {
// Check if this is a 3-star event
const importanceCell = eventRow.querySelector('td:nth-child(4)');
if (importanceCell) {
const starCount = (importanceCell.innerHTML.match(/icon--star/g) || []).length;
// Only process 3-star events
if (starCount === 3) {
const eventCell = eventRow.querySelector('td:nth-child(5)');
if (eventCell) {
const eventName = eventCell.textContent?.trim() || '';
if (eventName) {
descriptions[eventName] = description;
}
}
}
}
}
}
} }
/*for e in &events { return descriptions;
println!("{:#?}", e); "#;
}*/
Ok(events) let result = client.execute(description_script, vec![]).await?;
let mut event_type_map = HashMap::new();
if let Some(desc_obj) = result.as_object() {
for (key, value) in desc_obj {
if let Some(desc_text) = value.as_str() {
event_type_map.entry(key.clone()).or_insert(desc_text.to_string());
}
}
}
println!("Extracted {} event descriptions (3-star only)", event_type_map.len());
Ok(event_type_map)
} }
#[tokio::main] #[tokio::main]
@@ -246,7 +333,7 @@ async fn main() -> anyhow::Result<()> {
} }
// Hide any reappearing overlay // Hide any reappearing overlay
hide_contentpass_overlay(&client).await?; dismiss_overlays(&client).await?;
// Wait for the tab to appear and click it // Wait for the tab to appear and click it
if let Ok(_) = client.find(Locator::Css( if let Ok(_) = client.find(Locator::Css(
@@ -272,7 +359,7 @@ async fn main() -> anyhow::Result<()> {
println!("Found {} table rows", rows.len()); println!("Found {} table rows", rows.len());
// HashMap to store "Termin" -> description // HashMap to store "Termin" -> description
let mut termin_map: HashMap<String, String> = HashMap::new(); let mut event_type_map: HashMap<String, String> = HashMap::new();
let mut i = 0; let mut i = 0;
while i < rows.len() { while i < rows.len() {
@@ -293,25 +380,49 @@ async fn main() -> anyhow::Result<()> {
// Get the hidden description // Get the hidden description
let desc_cell = next_row.find(Locator::Css("td")).await?; let desc_cell = next_row.find(Locator::Css("td")).await?;
let desc_text = desc_cell.text().await.unwrap_or_default(); let desc_text = desc_cell.text().await.unwrap_or_default();
termin_map.insert(termin_text.clone(), desc_text); event_type_map.insert(termin_text.clone(), desc_text);
i += 1; // skip next row since it's the hidden description i += 1; // skip next row since it's the hidden description
} else { } else {
termin_map.insert(termin_text.clone(), "".to_string()); event_type_map.insert(termin_text.clone(), "".to_string());
} }
} else { } else {
termin_map.insert(termin_text.clone(), "".to_string()); event_type_map.insert(termin_text.clone(), "".to_string());
} }
} }
i += 1; i += 1;
} }
let events = scrape_events(&client).await?; // Extract using JavaScript
let events = extract_all_data_via_js(&client, 3).await?;
println!("Collected {} Termin entries", termin_map.len()); // Extract descriptions using JavaScript
for (k, v) in &termin_map { let event_type_map = extract_event_descriptions_via_js(&client).await?;
println!("{:?} => {:?}", k, v);
// Merge descriptions with events
let events_with_descriptions: Vec<EconomicEvent> = events.into_iter()
.map(|mut event| {
if let Some(description) = event_type_map.get(&event.event) {
event.description = description.clone();
} }
event
})
.collect();
println!("Final results:");
for event in &events_with_descriptions {
if !event.description.is_empty() {
println!("{}: {} chars of description",
event.event, event.description.len());
}
}
/*println!("Collected {} event descriptions", event_type_map.len());
for (k, v) in &event_type_map {
if !v.is_empty() {
println!("{:?} => {} chars", k, v.len());
}
}*/
// Wait for Ctrl+C // Wait for Ctrl+C
shutdown_handle.await.ok(); shutdown_handle.await.ok();