Restrict collected data to high-importance events (exactly 3 yellow stars)

This commit is contained in:
2025-11-16 17:12:56 +01:00
parent 0853124918
commit d6e244c8d8
2 changed files with 35 additions and 19396 deletions

File diff suppressed because it is too large Load Diff

View File

@@ -45,52 +45,50 @@ async fn dismiss_overlays(client: &fantoccini::Client) -> anyhow::Result<()> {
Ok(()) Ok(())
} }
async fn extract_all_data_via_js(client: &fantoccini::Client, min_stars: u8) -> anyhow::Result<Vec<EconomicEvent>> { async fn extract_all_data_via_js(client: &fantoccini::Client) -> anyhow::Result<Vec<EconomicEvent>> {
println!("Extracting events with {}+ stars via JavaScript...", min_stars); println!("Extracting ONLY 3-star events via JavaScript...");
let extraction_script = format!(r#" let extraction_script = r#"
const events = []; const events = [];
let currentDate = ''; let currentDate = '';
const minStars = {};
const rows = document.querySelectorAll('#TeletraderForm table tbody tr'); const rows = document.querySelectorAll('#TeletraderForm table tbody tr');
for (let i = 0; i < rows.length; i++) {{ for (let i = 0; i < rows.length; i++) {
const row = rows[i]; const row = rows[i];
const cells = row.querySelectorAll('td'); const cells = row.querySelectorAll('td');
if (cells.length === 1 && cells[0].colSpan === 9) {{ if (cells.length === 1 && cells[0].colSpan === 9) {
currentDate = cells[0].textContent.trim(); currentDate = cells[0].textContent.trim();
continue; continue;
}} }
if (cells.length >= 8) {{ if (cells.length >= 8) {
const time = cells[0]?.textContent?.trim() || ''; const time = cells[0]?.textContent?.trim() || '';
const country = cells[2]?.textContent?.trim() || ''; const country = cells[2]?.textContent?.trim() || '';
const eventName = cells[4]?.textContent?.trim() || ''; const eventName = cells[4]?.textContent?.trim() || '';
if (!time || !country || !eventName) continue; if (!time || !country || !eventName) continue;
const importanceHtml = cells[3]?.innerHTML || ''; // Count ONLY YELLOW stars (high importance)
const starCount = (importanceHtml.match(/icon--star/g) || []).length; const importanceCell = cells[3];
const yellowStarCount = importanceCell?.querySelectorAll('.icon--star.font-color-yellow').length || 0;
// Filter by minimum stars // STRICT FILTER: Only include events with EXACTLY 3 YELLOW stars
if (starCount >= minStars) {{ if (yellowStarCount === 3) {
let description = ''; let description = '';
if (i + 1 < rows.length) {{ if (i + 1 < rows.length) {
const nextRow = rows[i + 1]; const nextRow = rows[i + 1];
const nextCells = nextRow.querySelectorAll('td'); const nextCells = nextRow.querySelectorAll('td');
if (nextCells.length === 1 || nextCells[0].colSpan === 8) {{ if (nextCells.length === 1 || nextCells[0].colSpan === 8) {
const descPara = nextRow.querySelector('p'); const descPara = nextRow.querySelector('p');
if (descPara) {{ if (descPara) {
description = descPara.textContent?.trim() || ''; description = descPara.textContent?.trim() || '';
}} }
}} }
}} }
const importanceLevel = starCount === 3 ? 'High' : starCount === 2 ? 'Medium' : 'Low'; events.push({
events.push({{
country: country, country: country,
date: currentDate, date: currentDate,
time: time, time: time,
@@ -98,17 +96,17 @@ async fn extract_all_data_via_js(client: &fantoccini::Client, min_stars: u8) ->
actual: cells[7]?.textContent?.trim() || '', actual: cells[7]?.textContent?.trim() || '',
forecast: cells[6]?.textContent?.trim() || '', forecast: cells[6]?.textContent?.trim() || '',
previous: cells[5]?.textContent?.trim() || '', previous: cells[5]?.textContent?.trim() || '',
importance: importanceLevel, importance: 'High',
description: description description: description
}}); });
}} }
}} }
}} }
return events; return events;
"#, min_stars); "#;
let result = client.execute(&extraction_script, vec![]).await?; let result = client.execute(extraction_script, vec![]).await?;
// Parse the JSON result into EconomicEvent structs // Parse the JSON result into EconomicEvent structs
if let Some(events_array) = result.as_array() { if let Some(events_array) = result.as_array() {
@@ -129,7 +127,7 @@ async fn extract_all_data_via_js(client: &fantoccini::Client, min_stars: u8) ->
events.push(event); events.push(event);
} }
} }
println!("Extracted {} events via JavaScript", events.len()); println!("Extracted {} events (3 YELLOW stars ONLY) via JavaScript", events.len());
return Ok(events); return Ok(events);
} }
@@ -137,7 +135,7 @@ async fn extract_all_data_via_js(client: &fantoccini::Client, min_stars: u8) ->
} }
async fn extract_event_descriptions_via_js(client: &fantoccini::Client) -> anyhow::Result<HashMap<String, String>> { async fn extract_event_descriptions_via_js(client: &fantoccini::Client) -> anyhow::Result<HashMap<String, String>> {
println!("Extracting event descriptions via JavaScript (3-star events only)..."); println!("Extracting event descriptions via JavaScript (3 YELLOW stars only)...");
let description_script = r#" let description_script = r#"
const descriptions = {}; const descriptions = {};
@@ -154,13 +152,14 @@ async fn extract_event_descriptions_via_js(client: &fantoccini::Client) -> anyho
// Find the corresponding event name by looking for the row above // Find the corresponding event name by looking for the row above
let eventRow = descRow.parentElement.previousElementSibling; let eventRow = descRow.parentElement.previousElementSibling;
if (eventRow) { if (eventRow) {
// Check if this is a 3-star event // Check if this is a 3 YELLOW star event
const importanceCell = eventRow.querySelector('td:nth-child(4)'); const importanceCell = eventRow.querySelector('td:nth-child(4)');
if (importanceCell) { if (importanceCell) {
const starCount = (importanceCell.innerHTML.match(/icon--star/g) || []).length; // Count ONLY YELLOW stars
const yellowStarCount = importanceCell.querySelectorAll('.icon--star.font-color-yellow').length;
// Only process 3-star events // Only process events with 3 YELLOW stars
if (starCount === 3) { if (yellowStarCount === 3) {
const eventCell = eventRow.querySelector('td:nth-child(5)'); const eventCell = eventRow.querySelector('td:nth-child(5)');
if (eventCell) { if (eventCell) {
const eventName = eventCell.textContent?.trim() || ''; const eventName = eventCell.textContent?.trim() || '';
@@ -188,7 +187,7 @@ async fn extract_event_descriptions_via_js(client: &fantoccini::Client) -> anyho
} }
} }
println!("Extracted {} event descriptions (3-star only)", event_type_map.len()); println!("Extracted {} event descriptions (3 YELLOW stars only)", event_type_map.len());
Ok(event_type_map) Ok(event_type_map)
} }
@@ -531,7 +530,7 @@ async fn main() -> anyhow::Result<()> {
} }
// Extract using JavaScript // Extract using JavaScript
let events = extract_all_data_via_js(&client, 3).await?; let events = extract_all_data_via_js(&client).await?;
// Extract descriptions using JavaScript // Extract descriptions using JavaScript
let event_type_map = extract_event_descriptions_via_js(&client).await?; let event_type_map = extract_event_descriptions_via_js(&client).await?;