working api calls

This commit is contained in:
2025-12-02 17:10:34 +01:00
parent de875a3ebe
commit 95fd9ca141
6 changed files with 1104 additions and 323 deletions

219
src/scraper/webdriver.rs Normal file
View File

@@ -0,0 +1,219 @@
// src/scraper/webdriver.rs
use anyhow::{anyhow, Context, Result};
use fantoccini::{Client, ClientBuilder};
use std::process::{Stdio};
use tokio::io::{AsyncBufReadExt, BufReader};
use tokio::process::{Child, Command};
use tokio::time::{sleep, Duration};
use std::pin::Pin;
/// A single running `chromedriver` process and the WebDriver URL it serves.
///
/// The process is started by [`ChromeInstance::new`] with `--port=0`, and
/// [`ChromeInstance::new_client`] connects a fantoccini [`Client`] to it. Every instance owns
/// its own process, so scraping sessions on different instances do not interfere with each
/// other. Dropping the instance requests termination of the process if it is still running.
///
/// # Examples
///
/// The example is marked `ignore` because the import path depends on the crate that hosts
/// this module (`crate::` does not resolve inside a doctest).
///
/// ```ignore
/// use crate::scraper::webdriver::ChromeInstance;
///
/// #[tokio::main]
/// async fn main() -> anyhow::Result<()> {
///     let instance = ChromeInstance::new().await?;
///     let client = instance.new_client().await?;
///     // Use client for scraping
///     client.close().await?;
///     Ok(())
/// }
/// ```
#[derive(Debug)]
pub struct ChromeInstance {
    /// Handle to the spawned chromedriver child process.
    process: Child,
    /// Base WebDriver URL parsed from chromedriver's startup output.
    url: String,
}
impl ChromeInstance {
/// Creates a new ChromeInstance by spawning a chromedriver process on a random port.
///
/// This function spawns chromedriver with `--port=0` to let it choose an available port,
/// reads the stdout to extract the listening URL, and returns the instance if successful.
///
/// # Errors
///
/// Returns an error if:
/// - chromedriver cannot be spawned (e.g., not found in PATH).
/// - Failed to read stdout or parse the listening URL within a reasonable time.
pub async fn new() -> Result<Self> {
let mut child = Command::new("chromedriver")
.arg("--port=0")
.stdout(Stdio::piped())
.stderr(Stdio::piped())
.spawn()
.context("Failed to spawn chromedriver process")?;
let stdout = child.stdout.take().context("Failed to take stdout")?;
let mut reader = BufReader::new(stdout);
let mut line = String::new();
let start_time = std::time::Instant::now();
let timeout = Duration::from_secs(10);
loop {
if start_time.elapsed() > timeout {
let _ = child.kill().await;
return Err(anyhow!("Timeout waiting for chromedriver to start"));
}
line.clear();
if reader.read_line(&mut line).await.context("Failed to read line from stdout")? == 0 {
// EOF reached unexpectedly
let mut stderr_output = String::new();
if let Some(mut stderr) = child.stderr.take() {
let mut stderr_reader = BufReader::new(&mut stderr);
let mut stderr_line = String::new();
while stderr_reader.read_line(&mut stderr_line).await? > 0 {
stderr_output.push_str(&stderr_line);
stderr_line.clear();
}
}
let _ = child.kill().await;
return Err(anyhow!("Chromedriver exited unexpectedly. Stderr: {}", stderr_output));
}
if let Some(url) = Self::extract_url(&line) {
return Ok(Self {
process: child,
url,
});
}
sleep(Duration::from_millis(100)).await;
}
}
/// Extracts the listening URL from one line of driver startup output.
///
/// Recognised shapes:
/// - `Listening on 127.0.0.1:XXXX` / `Listening on localhost:XXXX` -> `http://<host>:XXXX`
/// - a `port=XXXX` token -> `http://localhost:XXXX`
/// - `... on port XXXX` (e.g. "Starting ChromeDriver ... on port 9515" or
///   "ChromeDriver was started successfully on port 9515.") -> `http://localhost:XXXX`
///
/// The port must be a valid non-zero TCP port. Trailing punctuation after the digits is
/// ignored. A port of `0` is rejected because it only echoes the `--port=0` request and is
/// not an address anything can connect to.
///
/// Returns `None` when the line does not announce a usable address.
fn extract_url(line: &str) -> Option<String> {
    // Parses a port number, tolerating trailing non-digit characters such as "." or ",".
    fn parse_port(raw: &str) -> Option<u16> {
        raw.trim_end_matches(|c: char| !c.is_ascii_digit())
            .parse::<u16>()
            .ok()
            .filter(|&port| port != 0)
    }

    // `host:port` and `port=` tokens are only trusted on lines that look like an announcement.
    let announces_address = line.contains("Listening on") || line.contains("port=");
    let tokens: Vec<&str> = line.split_whitespace().collect();

    for (index, &token) in tokens.iter().enumerate() {
        if announces_address {
            for host in &["127.0.0.1", "localhost"] {
                let port = token
                    .strip_prefix(*host)
                    .and_then(|rest| rest.strip_prefix(':'))
                    .and_then(parse_port);
                if let Some(port) = port {
                    return Some(format!("http://{}:{}", host, port));
                }
            }
            if let Some(port) = token.strip_prefix("port=").and_then(parse_port) {
                return Some(format!("http://localhost:{}", port));
            }
        }

        // "... on port 9515": the port is the token following the words "on port".
        if token == "port" && index > 0 && tokens[index - 1] == "on" {
            if let Some(port) = tokens.get(index + 1).and_then(|next| parse_port(next)) {
                return Some(format!("http://localhost:{}", port));
            }
        }
    }
    None
}
/// Opens a WebDriver session against this chromedriver instance.
///
/// The session is created with a rustls-backed [`ClientBuilder`] pointed at the URL that was
/// parsed when the instance was started.
///
/// # Errors
///
/// Returns an error if connecting to the WebDriver URL fails.
pub async fn new_client(&self) -> Result<Client> {
    let session = ClientBuilder::rustls().connect(&self.url).await;
    session.context("Failed to connect to chromedriver instance")
}
}
impl Drop for ChromeInstance {
    /// Requests termination of the chromedriver process if it has not exited yet.
    fn drop(&mut self) {
        // `Ok(None)` means the child is still running. An exit status or a failed status
        // query both mean there is nothing left for us to kill.
        if matches!(self.process.try_wait(), Ok(None)) {
            // `drop` cannot await, so only start the kill; the result is deliberately ignored.
            let _ = self.process.start_kill();
        }
    }
}
/// A scrape job: a URL plus the async function that extracts data from the loaded page.
///
/// The parse function receives a `&Client` and returns a boxed future resolving to
/// `Result<T>`. The future type is `'static`, so it cannot hold on to the borrowed client;
/// a parse function that needs the client while it runs has to move an owned handle
/// (e.g. `client.clone()`) into the future.
///
/// # Type Parameters
///
/// * `T` - The type of data returned by the parse function.
///
/// # Examples
///
/// The example is marked `ignore` because the import path depends on the crate that hosts
/// this module (`crate::` does not resolve inside a doctest).
///
/// ```ignore
/// use crate::scraper::webdriver::ScrapeTask;
/// use fantoccini::Client;
/// use anyhow::Result;
///
/// async fn example_parse(_client: Client) -> Result<String> {
///     Ok("Parsed data".to_string())
/// }
///
/// #[tokio::main]
/// async fn main() -> Result<()> {
///     let task: ScrapeTask<String> = ScrapeTask::new(
///         "https://example.com".to_string(),
///         // Pass an owned clone: the returned future must not borrow `client`.
///         |client| Box::pin(example_parse(client.clone())),
///     );
///     let result = task.execute().await?;
///     println!("{}", result);
///     Ok(())
/// }
/// ```
pub struct ScrapeTask<T> {
/// Page the client navigates to before the parse function runs.
url: String,
/// One-shot parse callback; boxed so tasks with different closures share one type.
parse: Box<dyn FnOnce(&Client) -> Pin<Box<dyn std::future::Future<Output = Result<T>> + Send + 'static>> + Send + 'static>,
}
impl<T: Send + 'static> ScrapeTask<T> {
/// Builds a task from the page URL and the function that will parse the loaded page.
///
/// `parse` is called at most once with a `&Client` and must return a boxed, `Send` future
/// that resolves to `Result<T>`.
pub fn new(
    url: String,
    parse: impl FnOnce(&Client) -> Pin<Box<dyn std::future::Future<Output = Result<T>> + Send>> + Send + 'static,
) -> Self {
    // Box the callback first; it is unsized to the trait object when stored in the field.
    let parse = Box::new(parse);
    Self { url, parse }
}
/// Executes the scrape task by starting a new ChromeInstance, connecting a client,
/// navigating to the URL, running the parse function, and cleaning up.
///
/// This method ensures isolation by using a dedicated chromedriver instance per task.
///
/// # Errors
///
/// Returns an error if:
/// - Failed to start chromedriver instance.
/// - Failed to connect client or navigate to URL.
/// - Parse function returns an error.
/// - Failed to close the client or kill the process.
pub async fn execute(self) -> Result<T> {
let instance = ChromeInstance::new().await.context("Failed to create ChromeInstance")?;
let client = instance.new_client().await.context("Failed to create client")?;
client.goto(&self.url).await.context("Failed to navigate to URL")?;
// Optional: Add common prep like rejecting cookies, waiting for elements, etc.
// This can be customized per task if needed.
let result = (self.parse)(&client).await;
client.close().await.context("Failed to close client")?;
// Instance drops here, killing the process
result
}
}