From 59aad09f71fc22eeab943806bacdc08049a6d4cd Mon Sep 17 00:00:00 2001 From: donpat1to Date: Sun, 16 Nov 2025 15:34:30 +0100 Subject: [PATCH] extracting data from js --- Cargo.lock | 182 +++++++++++++++++++++++++++++++++++++ Cargo.toml | 3 +- src/main.rs | 257 +++++++++++++++++++++++++++++++++++++--------------- 3 files changed, 368 insertions(+), 74 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 94fb9b0..8218992 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -7,6 +7,7 @@ name = "WebScraper" version = "0.1.0" dependencies = [ "anyhow", + "chrono", "fantoccini", "futures", "serde", @@ -14,6 +15,15 @@ dependencies = [ "tokio", ] +[[package]] +name = "android_system_properties" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311" +dependencies = [ + "libc", +] + [[package]] name = "anyhow" version = "1.0.100" @@ -26,6 +36,12 @@ version = "1.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0" +[[package]] +name = "autocfg" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" + [[package]] name = "base64" version = "0.21.7" @@ -44,6 +60,12 @@ version = "2.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "812e12b5285cc515a9c72a5c1d3b6d46a19dac5acfef5265968c166106e31dd3" +[[package]] +name = "bumpalo" +version = "3.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "46c5e41b57b8bba42a04676d81cb89e9ee8e859a1a66f80a5a72e1cb76b34d43" + [[package]] name = "bytes" version = "1.11.0" @@ -66,6 +88,19 @@ version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" +[[package]] +name = "chrono" +version = "0.4.42" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "145052bdd345b87320e369255277e3fb5152762ad123a901ef5c262dd38fe8d2" +dependencies = [ + "iana-time-zone", + "js-sys", + "num-traits", + "wasm-bindgen", + "windows-link", +] + [[package]] name = "cookie" version = "0.16.2" @@ -409,6 +444,30 @@ dependencies = [ "tracing", ] +[[package]] +name = "iana-time-zone" +version = "0.1.64" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "33e57f83510bb73707521ebaffa789ec8caf86f9657cad665b092b581d40e9fb" +dependencies = [ + "android_system_properties", + "core-foundation-sys", + "iana-time-zone-haiku", + "js-sys", + "log", + "wasm-bindgen", + "windows-core", +] + +[[package]] +name = "iana-time-zone-haiku" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f" +dependencies = [ + "cc", +] + [[package]] name = "icu_collections" version = "2.1.1" @@ -517,6 +576,16 @@ version = "1.0.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" +[[package]] +name = "js-sys" +version = "0.3.82" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b011eec8cc36da2aab2d5cff675ec18454fad408585853910a202391cf9f8e65" +dependencies = [ + "once_cell", + "wasm-bindgen", +] + [[package]] name = "libc" version = "0.2.177" @@ -596,6 +665,15 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9" +[[package]] +name = "num-traits" +version = "0.2.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" +dependencies = [ + "autocfg", +] + [[package]] name = "once_cell" version = "1.21.3" @@ -754,6 +832,12 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "rustversion" +version = "1.0.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" + [[package]] name = "ryu" version = "1.0.20" @@ -1115,6 +1199,51 @@ dependencies = [ "wit-bindgen", ] +[[package]] +name = "wasm-bindgen" +version = "0.2.105" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da95793dfc411fbbd93f5be7715b0578ec61fe87cb1a42b12eb625caa5c5ea60" +dependencies = [ + "cfg-if", + "once_cell", + "rustversion", + "wasm-bindgen-macro", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.105" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "04264334509e04a7bf8690f2384ef5265f05143a4bff3889ab7a3269adab59c2" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.105" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "420bc339d9f322e562942d52e115d57e950d12d88983a14c79b86859ee6c7ebc" +dependencies = [ + "bumpalo", + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.105" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76f218a38c84bcb33c25ec7059b07847d465ce0e0a76b995e134a45adcb6af76" +dependencies = [ + "unicode-ident", +] + [[package]] name = "webdriver" version = "0.50.0" @@ -1135,12 +1264,65 @@ dependencies = [ "url", ] +[[package]] +name = "windows-core" +version = "0.62.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8e83a14d34d0623b51dce9581199302a221863196a1dde71a7663a4c2be9deb" +dependencies = [ + "windows-implement", + "windows-interface", + "windows-link", + "windows-result", + "windows-strings", +] + +[[package]] +name = "windows-implement" +version = "0.60.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "053e2e040ab57b9dc951b72c264860db7eb3b0200ba345b4e4c3b14f67855ddf" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "windows-interface" +version = "0.59.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f316c4a2570ba26bbec722032c4099d8c8bc095efccdc15688708623367e358" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "windows-link" version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" +[[package]] +name = "windows-result" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7781fa89eaf60850ac3d2da7af8e5242a5ea78d1a11c49bf2910bb5a73853eb5" +dependencies = [ + "windows-link", +] + +[[package]] +name = "windows-strings" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7837d08f69c77cf6b07689544538e017c1bfcf57e34b4c0ff58e6c2cd3b37091" +dependencies = [ + "windows-link", +] + [[package]] name = "windows-sys" version = "0.60.2" diff --git a/Cargo.toml b/Cargo.toml index cd05a14..0e75a5e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -9,4 +9,5 @@ tokio = { version = "1", features = ["full"] } serde = { version = "1", features = ["derive"] } anyhow = "1.0" futures = "0.3" -serde_json = "1.0" \ No newline at end of file +serde_json = "1.0" +chrono = "0.4.42" \ No newline at end of file diff --git a/src/main.rs b/src/main.rs index 136342f..eacc868 100644 --- a/src/main.rs +++ b/src/main.rs @@ -24,85 +24,172 @@ fn start_chromedriver(port: u16) -> std::process::Child { .expect("Failed to start ChromeDriver") } -async fn hide_contentpass_overlay(client: &fantoccini::Client) -> anyhow::Result<()> { - for _ in 0..20 { - let hidden: bool = client.execute( +async fn dismiss_overlays(client: &fantoccini::Client) -> anyhow::Result<()> { + // Single strategy: wait for and remove iframe + for _ in 0..10 { + let removed: bool = client.execute( r#"(() => { const iframe = document.querySelector('iframe[title="Contentpass First Layer"]'); - if (!iframe) return true; - iframe.style.display = 'none'; - iframe.style.visibility = 'hidden'; + if (iframe && iframe.parentNode) { + iframe.parentNode.removeChild(iframe); + return true; + } return false; })()"#, vec![] ).await?.as_bool().unwrap_or(false); - - if hidden { break; } - tokio::time::sleep(Duration::from_millis(500)).await; + + if removed { break; } + sleep(Duration::from_millis(500)).await; } Ok(()) } -async fn scrape_events(client: &fantoccini::Client) -> anyhow::Result> { - // Find all rows - let rows = client.find_all(Locator::Css( - "#TeletraderForm table tbody tr" - )).await?; - - let mut events = vec![]; - let mut empty_count = 0; - let mut useful_count = 0; - - let mut i = 0; - while i < rows.len() { - let cells = rows[i].find_all(Locator::Css("td")).await?; - let texts: Vec = join_all( - cells.iter().map(|c| async move { c.text().await.unwrap_or_default() }) - ).await; - - let mut description = String::new(); - // Try to get the description from the next row if it exists - if i + 1 < rows.len() { - if let Ok(desc_row) = rows[i + 1].find(Locator::Css("td p")).await { - description = desc_row.text().await.unwrap_or_default(); +async fn extract_all_data_via_js(client: &fantoccini::Client, min_stars: u8) -> anyhow::Result> { + println!("Extracting events with {}+ stars via JavaScript...", min_stars); + + let extraction_script = format!(r#" + const events = []; + let currentDate = ''; + const minStars = {}; + + const rows = document.querySelectorAll('#TeletraderForm table tbody tr'); + + for (let i = 0; i < rows.length; i++) {{ + const row = rows[i]; + const cells = row.querySelectorAll('td'); + + if (cells.length === 1 && cells[0].colSpan === 9) {{ + currentDate = cells[0].textContent.trim(); + continue; + }} + + if (cells.length >= 8) {{ + const time = cells[0]?.textContent?.trim() || ''; + const country = cells[2]?.textContent?.trim() || ''; + const eventName = cells[4]?.textContent?.trim() || ''; + + if (!time || !country || !eventName) continue; + + const importanceHtml = cells[3]?.innerHTML || ''; + const starCount = (importanceHtml.match(/icon--star/g) || []).length; + + // Filter by minimum stars + if (starCount >= minStars) {{ + let description = ''; + if (i + 1 < rows.length) {{ + const nextRow = rows[i + 1]; + const nextCells = nextRow.querySelectorAll('td'); + if (nextCells.length === 1 || nextCells[0].colSpan === 8) {{ + const descPara = nextRow.querySelector('p'); + if (descPara) {{ + description = descPara.textContent?.trim() || ''; + }} + }} + }} + + const importanceLevel = starCount === 3 ? 'High' : starCount === 2 ? 'Medium' : 'Low'; + + events.push({{ + country: country, + date: currentDate, + time: time, + event: eventName, + actual: cells[7]?.textContent?.trim() || '', + forecast: cells[6]?.textContent?.trim() || '', + previous: cells[5]?.textContent?.trim() || '', + importance: importanceLevel, + description: description + }}); + }} + }} + }} + + return events; + "#, min_stars); + + let result = client.execute(&extraction_script, vec![]).await?; + + // Parse the JSON result into EconomicEvent structs + if let Some(events_array) = result.as_array() { + let mut events = Vec::new(); + for event_value in events_array { + if let Some(event_obj) = event_value.as_object() { + let event = EconomicEvent { + country: event_obj.get("country").and_then(|v| v.as_str()).unwrap_or("").to_string(), + date: event_obj.get("date").and_then(|v| v.as_str()).unwrap_or("").to_string(), + time: event_obj.get("time").and_then(|v| v.as_str()).unwrap_or("").to_string(), + event: event_obj.get("event").and_then(|v| v.as_str()).unwrap_or("").to_string(), + actual: event_obj.get("actual").and_then(|v| v.as_str()).unwrap_or("").to_string(), + forecast: event_obj.get("forecast").and_then(|v| v.as_str()).unwrap_or("").to_string(), + previous: event_obj.get("previous").and_then(|v| v.as_str()).unwrap_or("").to_string(), + importance: event_obj.get("importance").and_then(|v| v.as_str()).unwrap_or("").to_string(), + description: event_obj.get("description").and_then(|v| v.as_str()).unwrap_or("").to_string(), + }; + events.push(event); } } + println!("Extracted {} events via JavaScript", events.len()); + return Ok(events); + } + + Ok(vec![]) +} - let event = EconomicEvent { - country: texts.get(2).cloned().unwrap_or_default(), - date: "".to_string(), - time: texts.get(0).cloned().unwrap_or_default(), - event: texts.get(4).cloned().unwrap_or_default(), - actual: texts.get(7).cloned().unwrap_or_default(), - forecast: texts.get(6).cloned().unwrap_or_default(), - previous: texts.get(5).cloned().unwrap_or_default(), - importance: texts.get(3).cloned().unwrap_or_default(), - description, - }; - - if event.event.trim().is_empty() && event.country.trim().is_empty() { - empty_count += 1; - } else { - useful_count += 1; +async fn extract_event_descriptions_via_js(client: &fantoccini::Client) -> anyhow::Result> { + println!("Extracting event descriptions via JavaScript (3-star events only)..."); + + let description_script = r#" + const descriptions = {}; + + // Find all description rows (they have class starting with 'teletrader') + const descRows = document.querySelectorAll('tr td[class*="teletrader"]'); + + for (const descRow of descRows) { + // Get the description text from the

tag + const descPara = descRow.querySelector('p'); + if (descPara) { + const description = descPara.textContent?.trim() || ''; + + // Find the corresponding event name by looking for the row above + let eventRow = descRow.parentElement.previousElementSibling; + if (eventRow) { + // Check if this is a 3-star event + const importanceCell = eventRow.querySelector('td:nth-child(4)'); + if (importanceCell) { + const starCount = (importanceCell.innerHTML.match(/icon--star/g) || []).length; + + // Only process 3-star events + if (starCount === 3) { + const eventCell = eventRow.querySelector('td:nth-child(5)'); + if (eventCell) { + const eventName = eventCell.textContent?.trim() || ''; + if (eventName) { + descriptions[eventName] = description; + } + } + } + } + } + } + } + + return descriptions; + "#; + + let result = client.execute(description_script, vec![]).await?; + + let mut event_type_map = HashMap::new(); + if let Some(desc_obj) = result.as_object() { + for (key, value) in desc_obj { + if let Some(desc_text) = value.as_str() { + event_type_map.entry(key.clone()).or_insert(desc_text.to_string()); + } } - - events.push(event); - i += 2; // skip the description row } - - println!("Total events found: {}", events.len()); - println!("Useful events: {}", useful_count); - println!("Empty events: {}", empty_count); - - for e in events.iter().filter(|ev| !ev.event.trim().is_empty() && !ev.country.trim().is_empty()) { - println!("{:?}", e); - } - - /*for e in &events { - println!("{:#?}", e); - }*/ - - Ok(events) + + println!("Extracted {} event descriptions (3-star only)", event_type_map.len()); + Ok(event_type_map) } #[tokio::main] @@ -246,7 +333,7 @@ async fn main() -> anyhow::Result<()> { } // Hide any reappearing overlay - hide_contentpass_overlay(&client).await?; + dismiss_overlays(&client).await?; // Wait for the tab to appear and click it if let Ok(_) = client.find(Locator::Css( @@ -272,7 +359,7 @@ async fn main() -> anyhow::Result<()> { println!("Found {} table rows", rows.len()); // HashMap to store "Termin" -> description - let mut termin_map: HashMap = HashMap::new(); + let mut event_type_map: HashMap = HashMap::new(); let mut i = 0; while i < rows.len() { @@ -293,25 +380,49 @@ async fn main() -> anyhow::Result<()> { // Get the hidden description let desc_cell = next_row.find(Locator::Css("td")).await?; let desc_text = desc_cell.text().await.unwrap_or_default(); - termin_map.insert(termin_text.clone(), desc_text); + event_type_map.insert(termin_text.clone(), desc_text); i += 1; // skip next row since it's the hidden description } else { - termin_map.insert(termin_text.clone(), "".to_string()); + event_type_map.insert(termin_text.clone(), "".to_string()); } } else { - termin_map.insert(termin_text.clone(), "".to_string()); + event_type_map.insert(termin_text.clone(), "".to_string()); } } i += 1; } - let events = scrape_events(&client).await?; + // Extract using JavaScript + let events = extract_all_data_via_js(&client, 3).await?; - println!("Collected {} Termin entries", termin_map.len()); - for (k, v) in &termin_map { - println!("{:?} => {:?}", k, v); + // Extract descriptions using JavaScript + let event_type_map = extract_event_descriptions_via_js(&client).await?; + + // Merge descriptions with events + let events_with_descriptions: Vec = events.into_iter() + .map(|mut event| { + if let Some(description) = event_type_map.get(&event.event) { + event.description = description.clone(); + } + event + }) + .collect(); + + println!("Final results:"); + for event in &events_with_descriptions { + if !event.description.is_empty() { + println!("{}: {} chars of description", + event.event, event.description.len()); + } } + + /*println!("Collected {} event descriptions", event_type_map.len()); + for (k, v) in &event_type_map { + if !v.is_empty() { + println!("{:?} => {} chars", k, v.len()); + } + }*/ // Wait for Ctrl+C shutdown_handle.await.ok();