Restrict collected data to important events (exactly 3 yellow stars)

This commit is contained in:
2025-11-16 17:12:56 +01:00
parent 0853124918
commit d6e244c8d8
2 changed files with 35 additions and 19396 deletions

File diff suppressed because it is too large Load Diff

View File

@@ -45,52 +45,50 @@ async fn dismiss_overlays(client: &fantoccini::Client) -> anyhow::Result<()> {
Ok(())
}
async fn extract_all_data_via_js(client: &fantoccini::Client, min_stars: u8) -> anyhow::Result<Vec<EconomicEvent>> {
println!("Extracting events with {}+ stars via JavaScript...", min_stars);
async fn extract_all_data_via_js(client: &fantoccini::Client) -> anyhow::Result<Vec<EconomicEvent>> {
println!("Extracting ONLY 3-star events via JavaScript...");
let extraction_script = format!(r#"
let extraction_script = r#"
const events = [];
let currentDate = '';
const minStars = {};
const rows = document.querySelectorAll('#TeletraderForm table tbody tr');
for (let i = 0; i < rows.length; i++) {{
for (let i = 0; i < rows.length; i++) {
const row = rows[i];
const cells = row.querySelectorAll('td');
if (cells.length === 1 && cells[0].colSpan === 9) {{
if (cells.length === 1 && cells[0].colSpan === 9) {
currentDate = cells[0].textContent.trim();
continue;
}}
}
if (cells.length >= 8) {{
if (cells.length >= 8) {
const time = cells[0]?.textContent?.trim() || '';
const country = cells[2]?.textContent?.trim() || '';
const eventName = cells[4]?.textContent?.trim() || '';
if (!time || !country || !eventName) continue;
const importanceHtml = cells[3]?.innerHTML || '';
const starCount = (importanceHtml.match(/icon--star/g) || []).length;
// Count ONLY YELLOW stars (high importance)
const importanceCell = cells[3];
const yellowStarCount = importanceCell?.querySelectorAll('.icon--star.font-color-yellow').length || 0;
// Filter by minimum stars
if (starCount >= minStars) {{
// STRICT FILTER: Only include events with EXACTLY 3 YELLOW stars
if (yellowStarCount === 3) {
let description = '';
if (i + 1 < rows.length) {{
if (i + 1 < rows.length) {
const nextRow = rows[i + 1];
const nextCells = nextRow.querySelectorAll('td');
if (nextCells.length === 1 || nextCells[0].colSpan === 8) {{
if (nextCells.length === 1 || nextCells[0].colSpan === 8) {
const descPara = nextRow.querySelector('p');
if (descPara) {{
if (descPara) {
description = descPara.textContent?.trim() || '';
}}
}}
}}
}
}
}
const importanceLevel = starCount === 3 ? 'High' : starCount === 2 ? 'Medium' : 'Low';
events.push({{
events.push({
country: country,
date: currentDate,
time: time,
@@ -98,17 +96,17 @@ async fn extract_all_data_via_js(client: &fantoccini::Client, min_stars: u8) ->
actual: cells[7]?.textContent?.trim() || '',
forecast: cells[6]?.textContent?.trim() || '',
previous: cells[5]?.textContent?.trim() || '',
importance: importanceLevel,
importance: 'High',
description: description
}});
}}
}}
}}
});
}
}
}
return events;
"#, min_stars);
"#;
let result = client.execute(&extraction_script, vec![]).await?;
let result = client.execute(extraction_script, vec![]).await?;
// Parse the JSON result into EconomicEvent structs
if let Some(events_array) = result.as_array() {
@@ -129,7 +127,7 @@ async fn extract_all_data_via_js(client: &fantoccini::Client, min_stars: u8) ->
events.push(event);
}
}
println!("Extracted {} events via JavaScript", events.len());
println!("Extracted {} events (3 YELLOW stars ONLY) via JavaScript", events.len());
return Ok(events);
}
@@ -137,7 +135,7 @@ async fn extract_all_data_via_js(client: &fantoccini::Client, min_stars: u8) ->
}
async fn extract_event_descriptions_via_js(client: &fantoccini::Client) -> anyhow::Result<HashMap<String, String>> {
println!("Extracting event descriptions via JavaScript (3-star events only)...");
println!("Extracting event descriptions via JavaScript (3 YELLOW stars only)...");
let description_script = r#"
const descriptions = {};
@@ -154,13 +152,14 @@ async fn extract_event_descriptions_via_js(client: &fantoccini::Client) -> anyho
// Find the corresponding event name by looking for the row above
let eventRow = descRow.parentElement.previousElementSibling;
if (eventRow) {
// Check if this is a 3-star event
// Check if this is a 3 YELLOW star event
const importanceCell = eventRow.querySelector('td:nth-child(4)');
if (importanceCell) {
const starCount = (importanceCell.innerHTML.match(/icon--star/g) || []).length;
// Count ONLY YELLOW stars
const yellowStarCount = importanceCell.querySelectorAll('.icon--star.font-color-yellow').length;
// Only process 3-star events
if (starCount === 3) {
// Only process events with 3 YELLOW stars
if (yellowStarCount === 3) {
const eventCell = eventRow.querySelector('td:nth-child(5)');
if (eventCell) {
const eventName = eventCell.textContent?.trim() || '';
@@ -188,7 +187,7 @@ async fn extract_event_descriptions_via_js(client: &fantoccini::Client) -> anyho
}
}
println!("Extracted {} event descriptions (3-star only)", event_type_map.len());
println!("Extracted {} event descriptions (3 YELLOW stars only)", event_type_map.len());
Ok(event_type_map)
}
@@ -531,7 +530,7 @@ async fn main() -> anyhow::Result<()> {
}
// Extract using JavaScript
let events = extract_all_data_via_js(&client, 3).await?;
let events = extract_all_data_via_js(&client).await?;
// Extract descriptions using JavaScript
let event_type_map = extract_event_descriptions_via_js(&client).await?;