extracting data from js
This commit is contained in:
257
src/main.rs
257
src/main.rs
@@ -24,85 +24,172 @@ fn start_chromedriver(port: u16) -> std::process::Child {
|
||||
.expect("Failed to start ChromeDriver")
|
||||
}
|
||||
|
||||
async fn hide_contentpass_overlay(client: &fantoccini::Client) -> anyhow::Result<()> {
|
||||
for _ in 0..20 {
|
||||
let hidden: bool = client.execute(
|
||||
async fn dismiss_overlays(client: &fantoccini::Client) -> anyhow::Result<()> {
|
||||
// Single strategy: wait for and remove iframe
|
||||
for _ in 0..10 {
|
||||
let removed: bool = client.execute(
|
||||
r#"(() => {
|
||||
const iframe = document.querySelector('iframe[title="Contentpass First Layer"]');
|
||||
if (!iframe) return true;
|
||||
iframe.style.display = 'none';
|
||||
iframe.style.visibility = 'hidden';
|
||||
if (iframe && iframe.parentNode) {
|
||||
iframe.parentNode.removeChild(iframe);
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
})()"#,
|
||||
vec![]
|
||||
).await?.as_bool().unwrap_or(false);
|
||||
|
||||
if hidden { break; }
|
||||
tokio::time::sleep(Duration::from_millis(500)).await;
|
||||
|
||||
if removed { break; }
|
||||
sleep(Duration::from_millis(500)).await;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn scrape_events(client: &fantoccini::Client) -> anyhow::Result<Vec<EconomicEvent>> {
|
||||
// Find all rows
|
||||
let rows = client.find_all(Locator::Css(
|
||||
"#TeletraderForm table tbody tr"
|
||||
)).await?;
|
||||
|
||||
let mut events = vec![];
|
||||
let mut empty_count = 0;
|
||||
let mut useful_count = 0;
|
||||
|
||||
let mut i = 0;
|
||||
while i < rows.len() {
|
||||
let cells = rows[i].find_all(Locator::Css("td")).await?;
|
||||
let texts: Vec<String> = join_all(
|
||||
cells.iter().map(|c| async move { c.text().await.unwrap_or_default() })
|
||||
).await;
|
||||
|
||||
let mut description = String::new();
|
||||
// Try to get the description from the next row if it exists
|
||||
if i + 1 < rows.len() {
|
||||
if let Ok(desc_row) = rows[i + 1].find(Locator::Css("td p")).await {
|
||||
description = desc_row.text().await.unwrap_or_default();
|
||||
async fn extract_all_data_via_js(client: &fantoccini::Client, min_stars: u8) -> anyhow::Result<Vec<EconomicEvent>> {
|
||||
println!("Extracting events with {}+ stars via JavaScript...", min_stars);
|
||||
|
||||
let extraction_script = format!(r#"
|
||||
const events = [];
|
||||
let currentDate = '';
|
||||
const minStars = {};
|
||||
|
||||
const rows = document.querySelectorAll('#TeletraderForm table tbody tr');
|
||||
|
||||
for (let i = 0; i < rows.length; i++) {{
|
||||
const row = rows[i];
|
||||
const cells = row.querySelectorAll('td');
|
||||
|
||||
if (cells.length === 1 && cells[0].colSpan === 9) {{
|
||||
currentDate = cells[0].textContent.trim();
|
||||
continue;
|
||||
}}
|
||||
|
||||
if (cells.length >= 8) {{
|
||||
const time = cells[0]?.textContent?.trim() || '';
|
||||
const country = cells[2]?.textContent?.trim() || '';
|
||||
const eventName = cells[4]?.textContent?.trim() || '';
|
||||
|
||||
if (!time || !country || !eventName) continue;
|
||||
|
||||
const importanceHtml = cells[3]?.innerHTML || '';
|
||||
const starCount = (importanceHtml.match(/icon--star/g) || []).length;
|
||||
|
||||
// Filter by minimum stars
|
||||
if (starCount >= minStars) {{
|
||||
let description = '';
|
||||
if (i + 1 < rows.length) {{
|
||||
const nextRow = rows[i + 1];
|
||||
const nextCells = nextRow.querySelectorAll('td');
|
||||
if (nextCells.length === 1 || nextCells[0].colSpan === 8) {{
|
||||
const descPara = nextRow.querySelector('p');
|
||||
if (descPara) {{
|
||||
description = descPara.textContent?.trim() || '';
|
||||
}}
|
||||
}}
|
||||
}}
|
||||
|
||||
const importanceLevel = starCount === 3 ? 'High' : starCount === 2 ? 'Medium' : 'Low';
|
||||
|
||||
events.push({{
|
||||
country: country,
|
||||
date: currentDate,
|
||||
time: time,
|
||||
event: eventName,
|
||||
actual: cells[7]?.textContent?.trim() || '',
|
||||
forecast: cells[6]?.textContent?.trim() || '',
|
||||
previous: cells[5]?.textContent?.trim() || '',
|
||||
importance: importanceLevel,
|
||||
description: description
|
||||
}});
|
||||
}}
|
||||
}}
|
||||
}}
|
||||
|
||||
return events;
|
||||
"#, min_stars);
|
||||
|
||||
let result = client.execute(&extraction_script, vec![]).await?;
|
||||
|
||||
// Parse the JSON result into EconomicEvent structs
|
||||
if let Some(events_array) = result.as_array() {
|
||||
let mut events = Vec::new();
|
||||
for event_value in events_array {
|
||||
if let Some(event_obj) = event_value.as_object() {
|
||||
let event = EconomicEvent {
|
||||
country: event_obj.get("country").and_then(|v| v.as_str()).unwrap_or("").to_string(),
|
||||
date: event_obj.get("date").and_then(|v| v.as_str()).unwrap_or("").to_string(),
|
||||
time: event_obj.get("time").and_then(|v| v.as_str()).unwrap_or("").to_string(),
|
||||
event: event_obj.get("event").and_then(|v| v.as_str()).unwrap_or("").to_string(),
|
||||
actual: event_obj.get("actual").and_then(|v| v.as_str()).unwrap_or("").to_string(),
|
||||
forecast: event_obj.get("forecast").and_then(|v| v.as_str()).unwrap_or("").to_string(),
|
||||
previous: event_obj.get("previous").and_then(|v| v.as_str()).unwrap_or("").to_string(),
|
||||
importance: event_obj.get("importance").and_then(|v| v.as_str()).unwrap_or("").to_string(),
|
||||
description: event_obj.get("description").and_then(|v| v.as_str()).unwrap_or("").to_string(),
|
||||
};
|
||||
events.push(event);
|
||||
}
|
||||
}
|
||||
println!("Extracted {} events via JavaScript", events.len());
|
||||
return Ok(events);
|
||||
}
|
||||
|
||||
Ok(vec![])
|
||||
}
|
||||
|
||||
let event = EconomicEvent {
|
||||
country: texts.get(2).cloned().unwrap_or_default(),
|
||||
date: "".to_string(),
|
||||
time: texts.get(0).cloned().unwrap_or_default(),
|
||||
event: texts.get(4).cloned().unwrap_or_default(),
|
||||
actual: texts.get(7).cloned().unwrap_or_default(),
|
||||
forecast: texts.get(6).cloned().unwrap_or_default(),
|
||||
previous: texts.get(5).cloned().unwrap_or_default(),
|
||||
importance: texts.get(3).cloned().unwrap_or_default(),
|
||||
description,
|
||||
};
|
||||
|
||||
if event.event.trim().is_empty() && event.country.trim().is_empty() {
|
||||
empty_count += 1;
|
||||
} else {
|
||||
useful_count += 1;
|
||||
async fn extract_event_descriptions_via_js(client: &fantoccini::Client) -> anyhow::Result<HashMap<String, String>> {
|
||||
println!("Extracting event descriptions via JavaScript (3-star events only)...");
|
||||
|
||||
let description_script = r#"
|
||||
const descriptions = {};
|
||||
|
||||
// Find all description rows (they have class starting with 'teletrader')
|
||||
const descRows = document.querySelectorAll('tr td[class*="teletrader"]');
|
||||
|
||||
for (const descRow of descRows) {
|
||||
// Get the description text from the <p> tag
|
||||
const descPara = descRow.querySelector('p');
|
||||
if (descPara) {
|
||||
const description = descPara.textContent?.trim() || '';
|
||||
|
||||
// Find the corresponding event name by looking for the row above
|
||||
let eventRow = descRow.parentElement.previousElementSibling;
|
||||
if (eventRow) {
|
||||
// Check if this is a 3-star event
|
||||
const importanceCell = eventRow.querySelector('td:nth-child(4)');
|
||||
if (importanceCell) {
|
||||
const starCount = (importanceCell.innerHTML.match(/icon--star/g) || []).length;
|
||||
|
||||
// Only process 3-star events
|
||||
if (starCount === 3) {
|
||||
const eventCell = eventRow.querySelector('td:nth-child(5)');
|
||||
if (eventCell) {
|
||||
const eventName = eventCell.textContent?.trim() || '';
|
||||
if (eventName) {
|
||||
descriptions[eventName] = description;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return descriptions;
|
||||
"#;
|
||||
|
||||
let result = client.execute(description_script, vec![]).await?;
|
||||
|
||||
let mut event_type_map = HashMap::new();
|
||||
if let Some(desc_obj) = result.as_object() {
|
||||
for (key, value) in desc_obj {
|
||||
if let Some(desc_text) = value.as_str() {
|
||||
event_type_map.entry(key.clone()).or_insert(desc_text.to_string());
|
||||
}
|
||||
}
|
||||
|
||||
events.push(event);
|
||||
i += 2; // skip the description row
|
||||
}
|
||||
|
||||
println!("Total events found: {}", events.len());
|
||||
println!("Useful events: {}", useful_count);
|
||||
println!("Empty events: {}", empty_count);
|
||||
|
||||
for e in events.iter().filter(|ev| !ev.event.trim().is_empty() && !ev.country.trim().is_empty()) {
|
||||
println!("{:?}", e);
|
||||
}
|
||||
|
||||
/*for e in &events {
|
||||
println!("{:#?}", e);
|
||||
}*/
|
||||
|
||||
Ok(events)
|
||||
|
||||
println!("Extracted {} event descriptions (3-star only)", event_type_map.len());
|
||||
Ok(event_type_map)
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
@@ -246,7 +333,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
}
|
||||
|
||||
// Hide any reappearing overlay
|
||||
hide_contentpass_overlay(&client).await?;
|
||||
dismiss_overlays(&client).await?;
|
||||
|
||||
// Wait for the tab to appear and click it
|
||||
if let Ok(_) = client.find(Locator::Css(
|
||||
@@ -272,7 +359,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
println!("Found {} table rows", rows.len());
|
||||
|
||||
// HashMap to store "Termin" -> description
|
||||
let mut termin_map: HashMap<String, String> = HashMap::new();
|
||||
let mut event_type_map: HashMap<String, String> = HashMap::new();
|
||||
|
||||
let mut i = 0;
|
||||
while i < rows.len() {
|
||||
@@ -293,25 +380,49 @@ async fn main() -> anyhow::Result<()> {
|
||||
// Get the hidden description
|
||||
let desc_cell = next_row.find(Locator::Css("td")).await?;
|
||||
let desc_text = desc_cell.text().await.unwrap_or_default();
|
||||
termin_map.insert(termin_text.clone(), desc_text);
|
||||
event_type_map.insert(termin_text.clone(), desc_text);
|
||||
i += 1; // skip next row since it's the hidden description
|
||||
} else {
|
||||
termin_map.insert(termin_text.clone(), "".to_string());
|
||||
event_type_map.insert(termin_text.clone(), "".to_string());
|
||||
}
|
||||
} else {
|
||||
termin_map.insert(termin_text.clone(), "".to_string());
|
||||
event_type_map.insert(termin_text.clone(), "".to_string());
|
||||
}
|
||||
}
|
||||
|
||||
i += 1;
|
||||
}
|
||||
|
||||
let events = scrape_events(&client).await?;
|
||||
// Extract using JavaScript
|
||||
let events = extract_all_data_via_js(&client, 3).await?;
|
||||
|
||||
println!("Collected {} Termin entries", termin_map.len());
|
||||
for (k, v) in &termin_map {
|
||||
println!("{:?} => {:?}", k, v);
|
||||
// Extract descriptions using JavaScript
|
||||
let event_type_map = extract_event_descriptions_via_js(&client).await?;
|
||||
|
||||
// Merge descriptions with events
|
||||
let events_with_descriptions: Vec<EconomicEvent> = events.into_iter()
|
||||
.map(|mut event| {
|
||||
if let Some(description) = event_type_map.get(&event.event) {
|
||||
event.description = description.clone();
|
||||
}
|
||||
event
|
||||
})
|
||||
.collect();
|
||||
|
||||
println!("Final results:");
|
||||
for event in &events_with_descriptions {
|
||||
if !event.description.is_empty() {
|
||||
println!("{}: {} chars of description",
|
||||
event.event, event.description.len());
|
||||
}
|
||||
}
|
||||
|
||||
/*println!("Collected {} event descriptions", event_type_map.len());
|
||||
for (k, v) in &event_type_map {
|
||||
if !v.is_empty() {
|
||||
println!("{:?} => {} chars", k, v.len());
|
||||
}
|
||||
}*/
|
||||
|
||||
// Wait for Ctrl+C
|
||||
shutdown_handle.await.ok();
|
||||
|
||||
Reference in New Issue
Block a user