working api calls

2025-12-02 17:10:34 +01:00
parent de875a3ebe
commit 95fd9ca141
6 changed files with 1104 additions and 323 deletions
--- a/src/scraper/webdriver.rs
+++ b/src/scraper/webdriver.rs
@@ -0,0 +1,219 @@
+// src/scraper/webdriver.rs
+
+use anyhow::{anyhow, Context, Result};
+use fantoccini::{Client, ClientBuilder};
+use std::process::{Stdio};
+use tokio::io::{AsyncBufReadExt, BufReader};
+use tokio::process::{Child, Command};
+use tokio::time::{sleep, Duration};
+use std::pin::Pin;
+
+/// Represents a single instance of chromedriver process.
+///
+/// This struct manages the lifecycle of a chromedriver process, starting it on a random available port
+/// and providing a way to connect to it via fantoccini Client. Each instance is independent, allowing
+/// for isolated scraping sessions without interference.
+///
+/// When the value is dropped, the `Drop` implementation signals the child process to exit if it
+/// is still running.
+///
+/// # Examples
+///
+/// ```no_run
+/// use crate::scraper::webdriver::ChromeInstance;
+///
+/// #[tokio::main]
+/// async fn main() -> anyhow::Result<()> {
+///     let instance = ChromeInstance::new().await?;
+///     let client = instance.new_client().await?;
+///     // Use client for scraping
+///     client.close().await?;
+///     Ok(())
+/// }
+/// ```
+pub struct ChromeInstance {
+    /// Handle to the spawned chromedriver child process.
+    process: Child,
+    /// WebDriver endpoint parsed from chromedriver's stdout during startup
+    /// (for example `http://localhost:PORT`); passed to `ClientBuilder::connect`.
+    url: String,
+}
+
+impl ChromeInstance {
+    /// Spawns a chromedriver process on a random port and waits for it to announce its address.
+    ///
+    /// chromedriver is started with `--port=0` so that it picks a free port itself. Its stdout is
+    /// then read line by line until `extract_url` recognises the listening URL.
+    ///
+    /// The whole startup phase is bounded by a 10 second deadline. The deadline is enforced on
+    /// each read as well, so a chromedriver that starts but never prints anything cannot block
+    /// this call forever.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if:
+    /// - chromedriver cannot be spawned (e.g., not found in PATH).
+    /// - stdout cannot be taken from the child or reading from it fails.
+    /// - chromedriver closes stdout before announcing a URL (the error includes its stderr).
+    /// - no URL was recognised within the startup deadline.
+    ///
+    /// On every error path the child process is killed rather than left running.
+    pub async fn new() -> Result<Self> {
+        // Upper bound for the whole startup phase (from spawn until the URL has been seen).
+        const STARTUP_TIMEOUT: Duration = Duration::from_secs(10);
+
+        let mut child = Command::new("chromedriver")
+            .arg("--port=0")
+            .stdout(Stdio::piped())
+            .stderr(Stdio::piped())
+            // Early returns through `?` drop `child`; without this flag the chromedriver
+            // process would keep running after such a failure.
+            .kill_on_drop(true)
+            .spawn()
+            .context("Failed to spawn chromedriver process")?;
+
+        let stdout = child.stdout.take().context("Failed to take stdout")?;
+        let mut reader = BufReader::new(stdout);
+        let mut line = String::new();
+        let start_time = std::time::Instant::now();
+
+        loop {
+            // Time left until the deadline; `None` once the deadline has passed.
+            let remaining = match STARTUP_TIMEOUT.checked_sub(start_time.elapsed()) {
+                Some(remaining) => remaining,
+                None => {
+                    let _ = child.kill().await;
+                    return Err(anyhow!("Timeout waiting for chromedriver to start"));
+                }
+            };
+
+            line.clear();
+            // `read_line` only completes when a full line or EOF arrives, so the read itself
+            // must be bounded: checking the clock between reads alone never fires while
+            // chromedriver stays silent.
+            let bytes_read =
+                match tokio::time::timeout(remaining, reader.read_line(&mut line)).await {
+                    Ok(read) => read.context("Failed to read line from stdout")?,
+                    Err(_) => {
+                        let _ = child.kill().await;
+                        return Err(anyhow!("Timeout waiting for chromedriver to start"));
+                    }
+                };
+
+            if bytes_read == 0 {
+                // EOF before a URL was announced: collect stderr so the error explains why.
+                let mut stderr_output = String::new();
+                if let Some(mut stderr) = child.stderr.take() {
+                    let mut stderr_reader = BufReader::new(&mut stderr);
+                    let mut stderr_line = String::new();
+                    while stderr_reader.read_line(&mut stderr_line).await? > 0 {
+                        stderr_output.push_str(&stderr_line);
+                        stderr_line.clear();
+                    }
+                }
+                let _ = child.kill().await;
+                return Err(anyhow!("Chromedriver exited unexpectedly. Stderr: {}", stderr_output));
+            }
+
+            if let Some(url) = Self::extract_url(&line) {
+                return Ok(Self {
+                    process: child,
+                    url,
+                });
+            }
+
+            // NOTE(review): this pause is not needed for correctness, since `read_line` already
+            // waits for data. It is left in place so this change stays limited to the
+            // timeout handling; consider removing it together with the `sleep` import.
+            sleep(Duration::from_millis(100)).await;
+        }
+    }
+
+    /// Extracts the WebDriver URL from one line of chromedriver's output.
+    ///
+    /// Recognised forms (the line must contain `Listening on`, `port=` or `on port `):
+    /// - a `127.0.0.1:PORT` or `localhost:PORT` token, returned as `http://<host>:PORT`;
+    /// - a `port=PORT` token, returned as `http://localhost:PORT`;
+    /// - the word `port` followed by a `PORT` token, as in
+    ///   `ChromeDriver was started successfully on port 54321.`, returned as
+    ///   `http://localhost:PORT`.
+    ///
+    /// Trailing punctuation after the port is ignored. A port that is empty, not a valid
+    /// `u16`, or `0` is rejected: port `0` is only the echo of the `--port=0` request and is
+    /// not an address a client can connect to.
+    ///
+    /// Returns `None` when the line carries no usable address.
+    fn extract_url(line: &str) -> Option<String> {
+        // Parses a port token, tolerating trailing punctuation such as the final `.` of a
+        // sentence; rejects anything that is not a non-zero `u16`.
+        fn parse_port(token: &str) -> Option<u16> {
+            token
+                .trim_end_matches(|c: char| !c.is_ascii_digit())
+                .parse::<u16>()
+                .ok()
+                .filter(|port| *port != 0)
+        }
+
+        let relevant = line.contains("Listening on")
+            || line.contains("port=")
+            || line.contains("on port ");
+        if !relevant {
+            return None;
+        }
+
+        let mut tokens = line.split_whitespace();
+        while let Some(token) = tokens.next() {
+            if let Some(port) = token.strip_prefix("127.0.0.1:").and_then(parse_port) {
+                return Some(format!("http://127.0.0.1:{}", port));
+            }
+            if let Some(port) = token.strip_prefix("localhost:").and_then(parse_port) {
+                return Some(format!("http://localhost:{}", port));
+            }
+            if let Some(port) = token.strip_prefix("port=").and_then(parse_port) {
+                return Some(format!("http://localhost:{}", port));
+            }
+            if token == "port" {
+                // Peek at the following token without consuming it, so that it is still
+                // examined on its own in the next iteration if it is not a port number.
+                if let Some(port) = tokens.clone().next().and_then(parse_port) {
+                    return Some(format!("http://localhost:{}", port));
+                }
+            }
+        }
+        None
+    }
+
+    /// Opens a fantoccini `Client` session against this instance's WebDriver URL.
+    ///
+    /// The builder is created with `ClientBuilder::rustls()` and connected to the URL stored
+    /// in `self.url`.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error, with added context, if connecting to the WebDriver URL fails.
+    pub async fn new_client(&self) -> Result<Client> {
+        let connection = ClientBuilder::rustls().connect(&self.url).await;
+        connection.context("Failed to connect to chromedriver instance")
+    }
+}
+
+impl Drop for ChromeInstance {
+    /// Best-effort cleanup of the chromedriver process when the instance goes away.
+    fn drop(&mut self) {
+        // `drop` cannot await, so only the non-blocking calls are used here.
+        // `Ok(None)` from `try_wait` means the child has not exited yet; in every other case
+        // (already exited, or the status could not be queried) nothing is done.
+        if let Ok(None) = self.process.try_wait() {
+            let _ = self.process.start_kill();
+        }
+    }
+}
+
+/// Represents a scrape task that can be executed asynchronously.
+///
+/// This struct encapsulates the URL to scrape and a parse function that processes the page
+/// using the provided Client. The parse function is async and returns a user-defined type T.
+///
+/// The parse function receives a `&Client`, but the future it returns must be `'static`
+/// (see the type of the `parse` field), so that future cannot borrow the reference it was
+/// given. The example below therefore moves its own handle into the future.
+///
+/// # Type Parameters
+///
+/// * `T` - The type of data returned by the parse function.
+///
+/// # Examples
+///
+/// ```no_run
+/// use crate::scraper::webdriver::ScrapeTask;
+/// use fantoccini::Client;
+/// use anyhow::Result;
+///
+/// async fn example_parse(_client: &Client) -> Result<String> {
+///     Ok("Parsed data".to_string())
+/// }
+///
+/// #[tokio::main]
+/// async fn main() -> Result<()> {
+///     let task: ScrapeTask<String> = ScrapeTask::new(
+///         "https://example.com".to_string(),
+///         |client| {
+///             // NOTE(review): assumes `fantoccini::Client` is `Clone`; confirm against the
+///             // fantoccini version in use.
+///             let client = client.clone();
+///             Box::pin(async move { example_parse(&client).await })
+///         },
+///     );
+///     let result = task.execute().await?;
+///     println!("{}", result);
+///     Ok(())
+/// }
+/// ```
+pub struct ScrapeTask<T> {
+    /// Page the client navigates to before the parse function runs.
+    url: String,
+    /// One-shot parse callback: called with the connected client, returns a boxed, `Send`,
+    /// `'static` future that resolves to the task's result.
+    parse: Box<dyn FnOnce(&Client) -> Pin<Box<dyn std::future::Future<Output = Result<T>> + Send + 'static>> + Send + 'static>,
+}
+
+impl<T: Send + 'static> ScrapeTask<T> {
+    /// Builds a task from the page URL and the parse callback to run on that page.
+    ///
+    /// `parse` is called at most once with a `&Client` and must return a boxed, pinned future
+    /// resolving to `Result<T>`. The callback is stored boxed until the task is executed.
+    pub fn new(
+        url: String,
+        parse: impl FnOnce(&Client) -> Pin<Box<dyn std::future::Future<Output = Result<T>> + Send>> + Send + 'static,
+    ) -> Self {
+        // Box the callback so tasks with different closures share one concrete type.
+        Self {
+            parse: Box::new(parse),
+            url,
+        }
+    }
+
+    /// Runs the task: starts a dedicated `ChromeInstance`, connects a client, navigates to the
+    /// URL, runs the parse function and closes the client session again.
+    ///
+    /// Each call uses its own chromedriver instance, so tasks do not share browser state.
+    ///
+    /// Once a client is connected its session is always closed, including when navigation or
+    /// parsing fails, so a failed task does not leave a WebDriver session open.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if:
+    /// - Failed to start chromedriver instance.
+    /// - Failed to connect client or navigate to URL.
+    /// - Parse function returns an error.
+    /// - Failed to close the client (reported only when navigation and parsing succeeded; an
+    ///   earlier error takes precedence over a close failure).
+    pub async fn execute(self) -> Result<T> {
+        let instance = ChromeInstance::new().await.context("Failed to create ChromeInstance")?;
+        let client = instance.new_client().await.context("Failed to create client")?;
+
+        // Navigation and parsing are collected into one result instead of returning early,
+        // so the session can be closed below on every path.
+        let navigated = client.goto(&self.url).await.context("Failed to navigate to URL");
+        let outcome = match navigated {
+            // Optional: Add common prep like rejecting cookies, waiting for elements, etc.
+            // This can be customized per task if needed.
+            Ok(_) => (self.parse)(&client).await,
+            Err(err) => Err(err),
+        };
+
+        let closed = client.close().await.context("Failed to close client");
+
+        // Instance drops at the end of this function, killing the process.
+        match outcome {
+            Ok(value) => closed.map(|_| value),
+            // The navigation/parse error is the root cause; do not let a close failure hide it.
+            Err(err) => Err(err),
+        }
+    }
+}