Files
WebScraper/src/corporate/scraper.rs

144 lines
4.7 KiB
Rust

// src/corporate/scraper.rs
use super::types::{CompanyEvent, CompanyPrice};
use fantoccini::{Client, Locator};
use scraper::{Html, Selector};
use chrono::{NaiveDate, Datelike};
use tokio::time::{sleep, Duration};
/// Desktop-browser style User-Agent string (Windows 10, 64-bit, WebKit).
// NOTE(review): no function in the visible part of this file references this
// constant — confirm it is still used before keeping it.
const USER_AGENT: &str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36";
/// Clicks Yahoo's consent "agree" button if it shows up on the current page.
///
/// Polls the page up to 10 times, 500 ms apart, running a small script that
/// looks for `button[name="agree"]` and clicks it. Polling stops as soon as the
/// script reports that it clicked the button. If the button never appears the
/// function still returns `Ok(())`; dismissing the dialog is best-effort.
///
/// # Errors
///
/// Returns an error if executing the script in the browser session fails.
pub async fn dismiss_yahoo_consent(client: &Client) -> anyhow::Result<()> {
    const MAX_ATTEMPTS: usize = 10;
    const POLL_INTERVAL: Duration = Duration::from_millis(500);

    // WebDriver runs the script as the body of a function, so the outcome must
    // be handed back with an explicit `return`. Without it the result is always
    // null, the early exit below never triggers, and every call burns the full
    // polling budget while re-clicking the button.
    let script = r#"
return (() => {
const agree = document.querySelector('button[name="agree"]');
if (agree) {
agree.click();
return true;
}
return false;
})();
"#;

    for _ in 0..MAX_ATTEMPTS {
        let clicked = client
            .execute(script, vec![])
            .await?
            .as_bool()
            .unwrap_or(false);
        if clicked {
            break;
        }
        sleep(POLL_INTERVAL).await;
    }
    Ok(())
}
/// Scrapes the Yahoo Finance earnings calendar page for `ticker`.
///
/// Steps:
/// 1. Navigate to the calendar URL and dismiss the consent dialog.
/// 2. Click "Show More" until the button can no longer be found (bounded, see
///    `MAX_SHOW_MORE_CLICKS`).
/// 3. Parse every `table tbody tr` row of the resulting page source.
///
/// Row handling: rows with fewer than six cells, or whose third cell does not
/// start with a parseable date, are skipped. Cell 2 holds the date, optionally
/// followed by `" at <time>"`; cell 3 is the EPS forecast and cell 4 the EPS
/// actual. A time containing "PM" is recorded as `"AMC"`, one containing "AM"
/// as `"BMO"`, anything else as an empty string. The surprise percentage is
/// only computed when both EPS values are present and the forecast magnitude
/// exceeds 0.001 (avoids dividing by ~zero).
///
/// # Errors
///
/// Returns an error if navigation, the consent script, a "Show More" click or
/// fetching the page source fails.
pub async fn fetch_earnings_history(client: &Client, ticker: &str) -> anyhow::Result<Vec<CompanyEvent>> {
    // Upper bound on "Show More" clicks so that a button which never goes away
    // cannot keep the scrape looping forever.
    const MAX_SHOW_MORE_CLICKS: usize = 100;

    let url = format!("https://finance.yahoo.com/calendar/earnings?symbol={}", ticker);
    client.goto(&url).await?;
    dismiss_yahoo_consent(client).await?;

    // Load all rows by clicking "Show More" while it is present.
    for _ in 0..MAX_SHOW_MORE_CLICKS {
        match client.find(Locator::XPath(r#"//button[contains(text(), 'Show More')]"#)).await {
            Ok(btn) => {
                btn.click().await?;
                // Give the page time to append the next batch of rows.
                sleep(Duration::from_secs(2)).await;
            }
            Err(_) => break,
        }
    }

    let html = client.source().await?;
    let document = Html::parse_document(&html);
    // Both selectors are static strings; parse them once instead of per row.
    let row_sel = Selector::parse("table tbody tr").expect("static row selector is valid");
    let cell_sel = Selector::parse("td").expect("static cell selector is valid");

    let mut events = Vec::new();
    for row in document.select(&row_sel) {
        let cols: Vec<String> = row
            .select(&cell_sel)
            .map(|td| td.text().collect::<Vec<_>>().join(" ").trim().to_string())
            .collect();
        if cols.len() < 6 {
            continue;
        }

        // The date cell looks like "<date> at <time>"; the time part is optional.
        let mut date_parts = cols[2].split(" at ");
        let raw_date = date_parts.next().unwrap_or("").trim();
        let time_str = date_parts.next().map(str::trim).unwrap_or("");

        let date = match parse_yahoo_date(raw_date) {
            Ok(d) => d,
            Err(_) => continue,
        };

        let eps_forecast = parse_float(&cols[3]);
        // A "-" placeholder is not a number, so it parses to `None` like any
        // other non-numeric cell.
        let eps_actual = parse_float(&cols[4]);
        let surprise_pct = match (eps_forecast, eps_actual) {
            (Some(f), Some(a)) if f.abs() > 0.001 => Some((a - f) / f.abs() * 100.0),
            _ => None,
        };

        // "PM" is checked first, matching the original precedence.
        let time_label = if time_str.contains("PM") {
            "AMC"
        } else if time_str.contains("AM") {
            "BMO"
        } else {
            ""
        };

        events.push(CompanyEvent {
            ticker: ticker.to_string(),
            date: date.format("%Y-%m-%d").to_string(),
            time: time_label.to_string(),
            period: "".to_string(), // No period info available, set to empty
            eps_forecast,
            eps_actual,
            revenue_forecast: None,
            revenue_actual: None,
            surprise_pct,
            source: "Yahoo".to_string(),
        });
    }
    Ok(events)
}
/// Downloads daily price history for `ticker` from Yahoo's CSV endpoint.
///
/// `start` and `end` are `YYYY-MM-DD` dates. The request covers midnight UTC of
/// `start` up to midnight UTC of the day after `end`, so `end` itself is
/// included. The first line of the response (the CSV header) is skipped, rows
/// with fewer than seven comma-separated fields are ignored, and rows whose
/// value fields contain Yahoo's literal `null` placeholder are skipped rather
/// than failing the whole fetch.
///
/// # Errors
///
/// Returns an error if either date fails to parse, if `end` has no following
/// calendar day, if navigation or reading the page source fails, or if a
/// numeric field in a data row cannot be parsed.
pub async fn fetch_price_history(client: &Client, ticker: &str, start: &str, end: &str) -> anyhow::Result<Vec<CompanyPrice>> {
    let start_ts = NaiveDate::parse_from_str(start, "%Y-%m-%d")?
        .and_hms_opt(0, 0, 0)
        .expect("00:00:00 is always a valid time of day")
        .and_utc()
        .timestamp();
    // `end` comes from the caller, so report an out-of-range date instead of panicking.
    let end_ts = NaiveDate::parse_from_str(end, "%Y-%m-%d")?
        .succ_opt()
        .ok_or_else(|| anyhow::anyhow!("End date {end} has no following day"))?
        .and_hms_opt(0, 0, 0)
        .expect("00:00:00 is always a valid time of day")
        .and_utc()
        .timestamp();

    let url = format!(
        "https://query1.finance.yahoo.com/v7/finance/download/{ticker}?period1={start_ts}&period2={end_ts}&interval=1d&events=history&includeAdjustedClose=true"
    );
    client.goto(&url).await?;
    // NOTE(review): this is whatever the browser reports as page source; if the
    // browser wraps the CSV in markup, the first/last rows may carry HTML —
    // confirm against a live session.
    let csv = client.source().await?;

    let mut prices = Vec::new();
    for line in csv.lines().skip(1) {
        let cols: Vec<&str> = line.split(',').collect();
        if cols.len() < 7 {
            continue;
        }
        // Days without data are emitted with literal "null" fields; skip them
        // so one gap does not abort the entire history.
        if cols[1..7].iter().any(|field| field.trim() == "null") {
            continue;
        }
        prices.push(CompanyPrice {
            ticker: ticker.to_string(),
            date: cols[0].to_string(),
            open: cols[1].parse()?,
            high: cols[2].parse()?,
            low: cols[3].parse()?,
            close: cols[4].parse()?,
            adj_close: cols[5].parse()?,
            volume: cols[6].parse()?,
        });
    }
    Ok(prices)
}
/// Parses a numeric table cell into an `f64`.
///
/// `--` placeholders and thousands separators (`,`) are removed and surrounding
/// whitespace is ignored before parsing. Returns `None` when what remains is
/// not a number, or when it parses to a non-finite value (`NaN`, `inf`), since
/// those spellings are accepted by `f64`'s parser but are not usable figures.
fn parse_float(s: &str) -> Option<f64> {
    s.replace("--", "")
        .replace(',', "")
        .trim()
        .parse::<f64>()
        .ok()
        .filter(|value| value.is_finite())
}
/// Parses a date written as `<month name> <day>, <year>`.
///
/// The full-month pattern (`%B`) is tried first, then the abbreviated-month
/// pattern (`%b`); the first successful parse wins.
///
/// # Errors
///
/// Returns a `Bad date: <input>` error when neither pattern matches.
fn parse_yahoo_date(s: &str) -> anyhow::Result<NaiveDate> {
    // Tried in order.
    const FORMATS: [&str; 2] = ["%B %d, %Y", "%b %d, %Y"];

    FORMATS
        .iter()
        .find_map(|fmt| NaiveDate::parse_from_str(s, fmt).ok())
        .ok_or_else(|| anyhow::anyhow!("Bad date: {s}"))
}