extracting data from js
This commit is contained in:
182
Cargo.lock
generated
182
Cargo.lock
generated
@@ -7,6 +7,7 @@ name = "WebScraper"
|
|||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
|
"chrono",
|
||||||
"fantoccini",
|
"fantoccini",
|
||||||
"futures",
|
"futures",
|
||||||
"serde",
|
"serde",
|
||||||
@@ -14,6 +15,15 @@ dependencies = [
|
|||||||
"tokio",
|
"tokio",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "android_system_properties"
|
||||||
|
version = "0.1.5"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311"
|
||||||
|
dependencies = [
|
||||||
|
"libc",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "anyhow"
|
name = "anyhow"
|
||||||
version = "1.0.100"
|
version = "1.0.100"
|
||||||
@@ -26,6 +36,12 @@ version = "1.1.2"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0"
|
checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "autocfg"
|
||||||
|
version = "1.5.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "base64"
|
name = "base64"
|
||||||
version = "0.21.7"
|
version = "0.21.7"
|
||||||
@@ -44,6 +60,12 @@ version = "2.10.0"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "812e12b5285cc515a9c72a5c1d3b6d46a19dac5acfef5265968c166106e31dd3"
|
checksum = "812e12b5285cc515a9c72a5c1d3b6d46a19dac5acfef5265968c166106e31dd3"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "bumpalo"
|
||||||
|
version = "3.19.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "46c5e41b57b8bba42a04676d81cb89e9ee8e859a1a66f80a5a72e1cb76b34d43"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "bytes"
|
name = "bytes"
|
||||||
version = "1.11.0"
|
version = "1.11.0"
|
||||||
@@ -66,6 +88,19 @@ version = "1.0.4"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801"
|
checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "chrono"
|
||||||
|
version = "0.4.42"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "145052bdd345b87320e369255277e3fb5152762ad123a901ef5c262dd38fe8d2"
|
||||||
|
dependencies = [
|
||||||
|
"iana-time-zone",
|
||||||
|
"js-sys",
|
||||||
|
"num-traits",
|
||||||
|
"wasm-bindgen",
|
||||||
|
"windows-link",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "cookie"
|
name = "cookie"
|
||||||
version = "0.16.2"
|
version = "0.16.2"
|
||||||
@@ -409,6 +444,30 @@ dependencies = [
|
|||||||
"tracing",
|
"tracing",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "iana-time-zone"
|
||||||
|
version = "0.1.64"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "33e57f83510bb73707521ebaffa789ec8caf86f9657cad665b092b581d40e9fb"
|
||||||
|
dependencies = [
|
||||||
|
"android_system_properties",
|
||||||
|
"core-foundation-sys",
|
||||||
|
"iana-time-zone-haiku",
|
||||||
|
"js-sys",
|
||||||
|
"log",
|
||||||
|
"wasm-bindgen",
|
||||||
|
"windows-core",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "iana-time-zone-haiku"
|
||||||
|
version = "0.1.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f"
|
||||||
|
dependencies = [
|
||||||
|
"cc",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "icu_collections"
|
name = "icu_collections"
|
||||||
version = "2.1.1"
|
version = "2.1.1"
|
||||||
@@ -517,6 +576,16 @@ version = "1.0.15"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c"
|
checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "js-sys"
|
||||||
|
version = "0.3.82"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "b011eec8cc36da2aab2d5cff675ec18454fad408585853910a202391cf9f8e65"
|
||||||
|
dependencies = [
|
||||||
|
"once_cell",
|
||||||
|
"wasm-bindgen",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "libc"
|
name = "libc"
|
||||||
version = "0.2.177"
|
version = "0.2.177"
|
||||||
@@ -596,6 +665,15 @@ version = "0.1.0"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9"
|
checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "num-traits"
|
||||||
|
version = "0.2.19"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841"
|
||||||
|
dependencies = [
|
||||||
|
"autocfg",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "once_cell"
|
name = "once_cell"
|
||||||
version = "1.21.3"
|
version = "1.21.3"
|
||||||
@@ -754,6 +832,12 @@ dependencies = [
|
|||||||
"windows-sys 0.61.2",
|
"windows-sys 0.61.2",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "rustversion"
|
||||||
|
version = "1.0.22"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "ryu"
|
name = "ryu"
|
||||||
version = "1.0.20"
|
version = "1.0.20"
|
||||||
@@ -1115,6 +1199,51 @@ dependencies = [
|
|||||||
"wit-bindgen",
|
"wit-bindgen",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "wasm-bindgen"
|
||||||
|
version = "0.2.105"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "da95793dfc411fbbd93f5be7715b0578ec61fe87cb1a42b12eb625caa5c5ea60"
|
||||||
|
dependencies = [
|
||||||
|
"cfg-if",
|
||||||
|
"once_cell",
|
||||||
|
"rustversion",
|
||||||
|
"wasm-bindgen-macro",
|
||||||
|
"wasm-bindgen-shared",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "wasm-bindgen-macro"
|
||||||
|
version = "0.2.105"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "04264334509e04a7bf8690f2384ef5265f05143a4bff3889ab7a3269adab59c2"
|
||||||
|
dependencies = [
|
||||||
|
"quote",
|
||||||
|
"wasm-bindgen-macro-support",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "wasm-bindgen-macro-support"
|
||||||
|
version = "0.2.105"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "420bc339d9f322e562942d52e115d57e950d12d88983a14c79b86859ee6c7ebc"
|
||||||
|
dependencies = [
|
||||||
|
"bumpalo",
|
||||||
|
"proc-macro2",
|
||||||
|
"quote",
|
||||||
|
"syn",
|
||||||
|
"wasm-bindgen-shared",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "wasm-bindgen-shared"
|
||||||
|
version = "0.2.105"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "76f218a38c84bcb33c25ec7059b07847d465ce0e0a76b995e134a45adcb6af76"
|
||||||
|
dependencies = [
|
||||||
|
"unicode-ident",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "webdriver"
|
name = "webdriver"
|
||||||
version = "0.50.0"
|
version = "0.50.0"
|
||||||
@@ -1135,12 +1264,65 @@ dependencies = [
|
|||||||
"url",
|
"url",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "windows-core"
|
||||||
|
version = "0.62.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "b8e83a14d34d0623b51dce9581199302a221863196a1dde71a7663a4c2be9deb"
|
||||||
|
dependencies = [
|
||||||
|
"windows-implement",
|
||||||
|
"windows-interface",
|
||||||
|
"windows-link",
|
||||||
|
"windows-result",
|
||||||
|
"windows-strings",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "windows-implement"
|
||||||
|
version = "0.60.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "053e2e040ab57b9dc951b72c264860db7eb3b0200ba345b4e4c3b14f67855ddf"
|
||||||
|
dependencies = [
|
||||||
|
"proc-macro2",
|
||||||
|
"quote",
|
||||||
|
"syn",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "windows-interface"
|
||||||
|
version = "0.59.3"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "3f316c4a2570ba26bbec722032c4099d8c8bc095efccdc15688708623367e358"
|
||||||
|
dependencies = [
|
||||||
|
"proc-macro2",
|
||||||
|
"quote",
|
||||||
|
"syn",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "windows-link"
|
name = "windows-link"
|
||||||
version = "0.2.1"
|
version = "0.2.1"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5"
|
checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "windows-result"
|
||||||
|
version = "0.4.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "7781fa89eaf60850ac3d2da7af8e5242a5ea78d1a11c49bf2910bb5a73853eb5"
|
||||||
|
dependencies = [
|
||||||
|
"windows-link",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "windows-strings"
|
||||||
|
version = "0.5.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "7837d08f69c77cf6b07689544538e017c1bfcf57e34b4c0ff58e6c2cd3b37091"
|
||||||
|
dependencies = [
|
||||||
|
"windows-link",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "windows-sys"
|
name = "windows-sys"
|
||||||
version = "0.60.2"
|
version = "0.60.2"
|
||||||
|
|||||||
@@ -10,3 +10,4 @@ serde = { version = "1", features = ["derive"] }
|
|||||||
anyhow = "1.0"
|
anyhow = "1.0"
|
||||||
futures = "0.3"
|
futures = "0.3"
|
||||||
serde_json = "1.0"
|
serde_json = "1.0"
|
||||||
|
chrono = "0.4.42"
|
||||||
239
src/main.rs
239
src/main.rs
@@ -24,85 +24,172 @@ fn start_chromedriver(port: u16) -> std::process::Child {
|
|||||||
.expect("Failed to start ChromeDriver")
|
.expect("Failed to start ChromeDriver")
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn hide_contentpass_overlay(client: &fantoccini::Client) -> anyhow::Result<()> {
|
async fn dismiss_overlays(client: &fantoccini::Client) -> anyhow::Result<()> {
|
||||||
for _ in 0..20 {
|
// Single strategy: wait for and remove iframe
|
||||||
let hidden: bool = client.execute(
|
for _ in 0..10 {
|
||||||
|
let removed: bool = client.execute(
|
||||||
r#"(() => {
|
r#"(() => {
|
||||||
const iframe = document.querySelector('iframe[title="Contentpass First Layer"]');
|
const iframe = document.querySelector('iframe[title="Contentpass First Layer"]');
|
||||||
if (!iframe) return true;
|
if (iframe && iframe.parentNode) {
|
||||||
iframe.style.display = 'none';
|
iframe.parentNode.removeChild(iframe);
|
||||||
iframe.style.visibility = 'hidden';
|
return true;
|
||||||
|
}
|
||||||
return false;
|
return false;
|
||||||
})()"#,
|
})()"#,
|
||||||
vec![]
|
vec![]
|
||||||
).await?.as_bool().unwrap_or(false);
|
).await?.as_bool().unwrap_or(false);
|
||||||
|
|
||||||
if hidden { break; }
|
if removed { break; }
|
||||||
tokio::time::sleep(Duration::from_millis(500)).await;
|
sleep(Duration::from_millis(500)).await;
|
||||||
}
|
}
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn scrape_events(client: &fantoccini::Client) -> anyhow::Result<Vec<EconomicEvent>> {
|
async fn extract_all_data_via_js(client: &fantoccini::Client, min_stars: u8) -> anyhow::Result<Vec<EconomicEvent>> {
|
||||||
// Find all rows
|
println!("Extracting events with {}+ stars via JavaScript...", min_stars);
|
||||||
let rows = client.find_all(Locator::Css(
|
|
||||||
"#TeletraderForm table tbody tr"
|
|
||||||
)).await?;
|
|
||||||
|
|
||||||
let mut events = vec![];
|
let extraction_script = format!(r#"
|
||||||
let mut empty_count = 0;
|
const events = [];
|
||||||
let mut useful_count = 0;
|
let currentDate = '';
|
||||||
|
const minStars = {};
|
||||||
|
|
||||||
let mut i = 0;
|
const rows = document.querySelectorAll('#TeletraderForm table tbody tr');
|
||||||
while i < rows.len() {
|
|
||||||
let cells = rows[i].find_all(Locator::Css("td")).await?;
|
|
||||||
let texts: Vec<String> = join_all(
|
|
||||||
cells.iter().map(|c| async move { c.text().await.unwrap_or_default() })
|
|
||||||
).await;
|
|
||||||
|
|
||||||
let mut description = String::new();
|
for (let i = 0; i < rows.length; i++) {{
|
||||||
// Try to get the description from the next row if it exists
|
const row = rows[i];
|
||||||
if i + 1 < rows.len() {
|
const cells = row.querySelectorAll('td');
|
||||||
if let Ok(desc_row) = rows[i + 1].find(Locator::Css("td p")).await {
|
|
||||||
description = desc_row.text().await.unwrap_or_default();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
if (cells.length === 1 && cells[0].colSpan === 9) {{
|
||||||
|
currentDate = cells[0].textContent.trim();
|
||||||
|
continue;
|
||||||
|
}}
|
||||||
|
|
||||||
|
if (cells.length >= 8) {{
|
||||||
|
const time = cells[0]?.textContent?.trim() || '';
|
||||||
|
const country = cells[2]?.textContent?.trim() || '';
|
||||||
|
const eventName = cells[4]?.textContent?.trim() || '';
|
||||||
|
|
||||||
|
if (!time || !country || !eventName) continue;
|
||||||
|
|
||||||
|
const importanceHtml = cells[3]?.innerHTML || '';
|
||||||
|
const starCount = (importanceHtml.match(/icon--star/g) || []).length;
|
||||||
|
|
||||||
|
// Filter by minimum stars
|
||||||
|
if (starCount >= minStars) {{
|
||||||
|
let description = '';
|
||||||
|
if (i + 1 < rows.length) {{
|
||||||
|
const nextRow = rows[i + 1];
|
||||||
|
const nextCells = nextRow.querySelectorAll('td');
|
||||||
|
if (nextCells.length === 1 || nextCells[0].colSpan === 8) {{
|
||||||
|
const descPara = nextRow.querySelector('p');
|
||||||
|
if (descPara) {{
|
||||||
|
description = descPara.textContent?.trim() || '';
|
||||||
|
}}
|
||||||
|
}}
|
||||||
|
}}
|
||||||
|
|
||||||
|
const importanceLevel = starCount === 3 ? 'High' : starCount === 2 ? 'Medium' : 'Low';
|
||||||
|
|
||||||
|
events.push({{
|
||||||
|
country: country,
|
||||||
|
date: currentDate,
|
||||||
|
time: time,
|
||||||
|
event: eventName,
|
||||||
|
actual: cells[7]?.textContent?.trim() || '',
|
||||||
|
forecast: cells[6]?.textContent?.trim() || '',
|
||||||
|
previous: cells[5]?.textContent?.trim() || '',
|
||||||
|
importance: importanceLevel,
|
||||||
|
description: description
|
||||||
|
}});
|
||||||
|
}}
|
||||||
|
}}
|
||||||
|
}}
|
||||||
|
|
||||||
|
return events;
|
||||||
|
"#, min_stars);
|
||||||
|
|
||||||
|
let result = client.execute(&extraction_script, vec![]).await?;
|
||||||
|
|
||||||
|
// Parse the JSON result into EconomicEvent structs
|
||||||
|
if let Some(events_array) = result.as_array() {
|
||||||
|
let mut events = Vec::new();
|
||||||
|
for event_value in events_array {
|
||||||
|
if let Some(event_obj) = event_value.as_object() {
|
||||||
let event = EconomicEvent {
|
let event = EconomicEvent {
|
||||||
country: texts.get(2).cloned().unwrap_or_default(),
|
country: event_obj.get("country").and_then(|v| v.as_str()).unwrap_or("").to_string(),
|
||||||
date: "".to_string(),
|
date: event_obj.get("date").and_then(|v| v.as_str()).unwrap_or("").to_string(),
|
||||||
time: texts.get(0).cloned().unwrap_or_default(),
|
time: event_obj.get("time").and_then(|v| v.as_str()).unwrap_or("").to_string(),
|
||||||
event: texts.get(4).cloned().unwrap_or_default(),
|
event: event_obj.get("event").and_then(|v| v.as_str()).unwrap_or("").to_string(),
|
||||||
actual: texts.get(7).cloned().unwrap_or_default(),
|
actual: event_obj.get("actual").and_then(|v| v.as_str()).unwrap_or("").to_string(),
|
||||||
forecast: texts.get(6).cloned().unwrap_or_default(),
|
forecast: event_obj.get("forecast").and_then(|v| v.as_str()).unwrap_or("").to_string(),
|
||||||
previous: texts.get(5).cloned().unwrap_or_default(),
|
previous: event_obj.get("previous").and_then(|v| v.as_str()).unwrap_or("").to_string(),
|
||||||
importance: texts.get(3).cloned().unwrap_or_default(),
|
importance: event_obj.get("importance").and_then(|v| v.as_str()).unwrap_or("").to_string(),
|
||||||
description,
|
description: event_obj.get("description").and_then(|v| v.as_str()).unwrap_or("").to_string(),
|
||||||
};
|
};
|
||||||
|
|
||||||
if event.event.trim().is_empty() && event.country.trim().is_empty() {
|
|
||||||
empty_count += 1;
|
|
||||||
} else {
|
|
||||||
useful_count += 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
events.push(event);
|
events.push(event);
|
||||||
i += 2; // skip the description row
|
}
|
||||||
|
}
|
||||||
|
println!("Extracted {} events via JavaScript", events.len());
|
||||||
|
return Ok(events);
|
||||||
}
|
}
|
||||||
|
|
||||||
println!("Total events found: {}", events.len());
|
Ok(vec![])
|
||||||
println!("Useful events: {}", useful_count);
|
}
|
||||||
println!("Empty events: {}", empty_count);
|
|
||||||
|
|
||||||
for e in events.iter().filter(|ev| !ev.event.trim().is_empty() && !ev.country.trim().is_empty()) {
|
async fn extract_event_descriptions_via_js(client: &fantoccini::Client) -> anyhow::Result<HashMap<String, String>> {
|
||||||
println!("{:?}", e);
|
println!("Extracting event descriptions via JavaScript (3-star events only)...");
|
||||||
|
|
||||||
|
let description_script = r#"
|
||||||
|
const descriptions = {};
|
||||||
|
|
||||||
|
// Find all description rows (they have class starting with 'teletrader')
|
||||||
|
const descRows = document.querySelectorAll('tr td[class*="teletrader"]');
|
||||||
|
|
||||||
|
for (const descRow of descRows) {
|
||||||
|
// Get the description text from the <p> tag
|
||||||
|
const descPara = descRow.querySelector('p');
|
||||||
|
if (descPara) {
|
||||||
|
const description = descPara.textContent?.trim() || '';
|
||||||
|
|
||||||
|
// Find the corresponding event name by looking for the row above
|
||||||
|
let eventRow = descRow.parentElement.previousElementSibling;
|
||||||
|
if (eventRow) {
|
||||||
|
// Check if this is a 3-star event
|
||||||
|
const importanceCell = eventRow.querySelector('td:nth-child(4)');
|
||||||
|
if (importanceCell) {
|
||||||
|
const starCount = (importanceCell.innerHTML.match(/icon--star/g) || []).length;
|
||||||
|
|
||||||
|
// Only process 3-star events
|
||||||
|
if (starCount === 3) {
|
||||||
|
const eventCell = eventRow.querySelector('td:nth-child(5)');
|
||||||
|
if (eventCell) {
|
||||||
|
const eventName = eventCell.textContent?.trim() || '';
|
||||||
|
if (eventName) {
|
||||||
|
descriptions[eventName] = description;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/*for e in &events {
|
return descriptions;
|
||||||
println!("{:#?}", e);
|
"#;
|
||||||
}*/
|
|
||||||
|
|
||||||
Ok(events)
|
let result = client.execute(description_script, vec![]).await?;
|
||||||
|
|
||||||
|
let mut event_type_map = HashMap::new();
|
||||||
|
if let Some(desc_obj) = result.as_object() {
|
||||||
|
for (key, value) in desc_obj {
|
||||||
|
if let Some(desc_text) = value.as_str() {
|
||||||
|
event_type_map.entry(key.clone()).or_insert(desc_text.to_string());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
println!("Extracted {} event descriptions (3-star only)", event_type_map.len());
|
||||||
|
Ok(event_type_map)
|
||||||
}
|
}
|
||||||
|
|
||||||
#[tokio::main]
|
#[tokio::main]
|
||||||
@@ -246,7 +333,7 @@ async fn main() -> anyhow::Result<()> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Hide any reappearing overlay
|
// Hide any reappearing overlay
|
||||||
hide_contentpass_overlay(&client).await?;
|
dismiss_overlays(&client).await?;
|
||||||
|
|
||||||
// Wait for the tab to appear and click it
|
// Wait for the tab to appear and click it
|
||||||
if let Ok(_) = client.find(Locator::Css(
|
if let Ok(_) = client.find(Locator::Css(
|
||||||
@@ -272,7 +359,7 @@ async fn main() -> anyhow::Result<()> {
|
|||||||
println!("Found {} table rows", rows.len());
|
println!("Found {} table rows", rows.len());
|
||||||
|
|
||||||
// HashMap to store "Termin" -> description
|
// HashMap to store "Termin" -> description
|
||||||
let mut termin_map: HashMap<String, String> = HashMap::new();
|
let mut event_type_map: HashMap<String, String> = HashMap::new();
|
||||||
|
|
||||||
let mut i = 0;
|
let mut i = 0;
|
||||||
while i < rows.len() {
|
while i < rows.len() {
|
||||||
@@ -293,25 +380,49 @@ async fn main() -> anyhow::Result<()> {
|
|||||||
// Get the hidden description
|
// Get the hidden description
|
||||||
let desc_cell = next_row.find(Locator::Css("td")).await?;
|
let desc_cell = next_row.find(Locator::Css("td")).await?;
|
||||||
let desc_text = desc_cell.text().await.unwrap_or_default();
|
let desc_text = desc_cell.text().await.unwrap_or_default();
|
||||||
termin_map.insert(termin_text.clone(), desc_text);
|
event_type_map.insert(termin_text.clone(), desc_text);
|
||||||
i += 1; // skip next row since it's the hidden description
|
i += 1; // skip next row since it's the hidden description
|
||||||
} else {
|
} else {
|
||||||
termin_map.insert(termin_text.clone(), "".to_string());
|
event_type_map.insert(termin_text.clone(), "".to_string());
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
termin_map.insert(termin_text.clone(), "".to_string());
|
event_type_map.insert(termin_text.clone(), "".to_string());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
i += 1;
|
i += 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
let events = scrape_events(&client).await?;
|
// Extract using JavaScript
|
||||||
|
let events = extract_all_data_via_js(&client, 3).await?;
|
||||||
|
|
||||||
println!("Collected {} Termin entries", termin_map.len());
|
// Extract descriptions using JavaScript
|
||||||
for (k, v) in &termin_map {
|
let event_type_map = extract_event_descriptions_via_js(&client).await?;
|
||||||
println!("{:?} => {:?}", k, v);
|
|
||||||
|
// Merge descriptions with events
|
||||||
|
let events_with_descriptions: Vec<EconomicEvent> = events.into_iter()
|
||||||
|
.map(|mut event| {
|
||||||
|
if let Some(description) = event_type_map.get(&event.event) {
|
||||||
|
event.description = description.clone();
|
||||||
}
|
}
|
||||||
|
event
|
||||||
|
})
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
println!("Final results:");
|
||||||
|
for event in &events_with_descriptions {
|
||||||
|
if !event.description.is_empty() {
|
||||||
|
println!("{}: {} chars of description",
|
||||||
|
event.event, event.description.len());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/*println!("Collected {} event descriptions", event_type_map.len());
|
||||||
|
for (k, v) in &event_type_map {
|
||||||
|
if !v.is_empty() {
|
||||||
|
println!("{:?} => {} chars", k, v.len());
|
||||||
|
}
|
||||||
|
}*/
|
||||||
|
|
||||||
// Wait for Ctrl+C
|
// Wait for Ctrl+C
|
||||||
shutdown_handle.await.ok();
|
shutdown_handle.await.ok();
|
||||||
|
|||||||
Reference in New Issue
Block a user