added creating CompanyInfo mapping

2025-12-04 13:33:32 +01:00
parent 95fd9ca141
commit ef2393ab70
13 changed files with 965 additions and 696 deletions
--- a/src/scraper/webdriver.rs
+++ b/src/scraper/webdriver.rs
@@ -2,218 +2,268 @@

 use anyhow::{anyhow, Context, Result};
 use fantoccini::{Client, ClientBuilder};
-use std::process::{Stdio};
+use serde_json::{Map, Value};
+use std::process::Stdio;
+use std::sync::Arc;
 use tokio::io::{AsyncBufReadExt, BufReader};
 use tokio::process::{Child, Command};
-use tokio::time::{sleep, Duration};
+use tokio::sync::{Mutex, Semaphore};
+use tokio::time::{Duration, sleep, timeout};
 use std::pin::Pin;

+/// Manages a pool of ChromeDriver instances for parallel scraping.
+///
+/// The pool owns a fixed set of ChromeDriver processes. A semaphore with one permit
+/// per process caps how many scrape tasks run at the same time, and tasks are handed
+/// to the processes round-robin so that every instance in the pool is actually used.
+pub struct ChromeDriverPool {
+    /// Pooled ChromeDriver processes; locked only while a session is being opened.
+    instances: Vec<Arc<Mutex<ChromeInstance>>>,
+    /// One permit per instance; bounds the number of tasks running concurrently.
+    semaphore: Arc<Semaphore>,
+    /// Ever-increasing counter used to pick the next instance round-robin.
+    next_instance: std::sync::atomic::AtomicUsize,
+}
+
+impl ChromeDriverPool {
+    /// Creates a new pool with the specified number of ChromeDriver instances.
+    ///
+    /// # Arguments
+    /// * `pool_size` - Number of concurrent ChromeDriver instances to maintain
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if `pool_size` is zero (such a pool could never run a task,
+    /// because the semaphore would have no permits) or if any instance fails to
+    /// start. Instances created before the failure are dropped.
+    pub async fn new(pool_size: usize) -> Result<Self> {
+        if pool_size == 0 {
+            return Err(anyhow!("ChromeDriver pool size must be at least 1"));
+        }
+
+        let mut instances = Vec::with_capacity(pool_size);
+
+        println!("Initializing ChromeDriver pool with {} instances...", pool_size);
+
+        for i in 0..pool_size {
+            match ChromeInstance::new().await {
+                Ok(instance) => {
+                    println!("  ✓ Instance {} ready", i + 1);
+                    instances.push(Arc::new(Mutex::new(instance)));
+                }
+                Err(e) => {
+                    eprintln!("  ✗ Failed to create instance {}: {}", i + 1, e);
+                    // Returning drops `instances`, which cleans up the ones already created.
+                    return Err(e);
+                }
+            }
+        }
+
+        Ok(Self {
+            instances,
+            semaphore: Arc::new(Semaphore::new(pool_size)),
+            next_instance: std::sync::atomic::AtomicUsize::new(0),
+        })
+    }
+
+    /// Executes a scrape task using an instance from the pool.
+    ///
+    /// Waits for a free permit, opens a new browser session on the next instance
+    /// (round-robin), navigates to `url`, and runs `parse` with a 60 second limit.
+    /// The session is closed afterwards on every path, so sessions do not pile up
+    /// inside the ChromeDriver processes.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if the semaphore is closed, the session cannot be created,
+    /// navigation fails, `parse` fails, or `parse` exceeds the 60 second limit.
+    pub async fn execute<T, F, Fut>(&self, url: String, parse: F) -> Result<T>
+    where
+        T: Send + 'static,
+        F: FnOnce(Client) -> Fut + Send + 'static,
+        Fut: std::future::Future<Output = Result<T>> + Send + 'static,
+    {
+        // Held until the end of the function so at most `pool_size` tasks run at once.
+        let _permit = self.semaphore.acquire().await
+            .map_err(|_| anyhow!("Semaphore closed"))?;
+
+        // Round-robin selection; `new` guarantees the pool is non-empty.
+        let ticket = self
+            .next_instance
+            .fetch_add(1, std::sync::atomic::Ordering::Relaxed);
+        let instance = &self.instances[ticket % self.instances.len()];
+
+        // The lock is only needed while the session is created, not while scraping.
+        let client = {
+            let guard = instance.lock().await;
+            guard.new_session().await?
+        };
+
+        // `parse` consumes its client, so hand it a clone and keep one handle for cleanup.
+        let session = client.clone();
+        let outcome: Result<T> = async {
+            session.goto(&url).await.context("Failed to navigate")?;
+            timeout(Duration::from_secs(60), parse(session))
+                .await
+                .context("Parse function timed out after 60s")?
+        }
+        .await;
+
+        // Best-effort cleanup: the task's own result matters more than a close error,
+        // and `parse` may already have closed the session itself.
+        let _ = client.close().await;
+
+        outcome
+    }
+
+    /// Returns how many ChromeDriver instances this pool holds.
+    pub fn get_number_of_instances(&self) -> usize {
+        self.instances.len()
+    }
+}
+
 /// Represents a single instance of chromedriver process.
-/// 
-/// This struct manages the lifecycle of a chromedriver process, starting it on a random available port
-/// and providing a way to connect to it via fantoccini Client. Each instance is independent, allowing
-/// for isolated scraping sessions without interference.
-/// 
-/// # Examples
-/// 
-/// ```no_run
-/// use crate::scraper::webdriver::ChromeInstance;
-/// 
-/// #[tokio::main]
-/// async fn main() -> anyhow::Result<()> {
-///     let instance = ChromeInstance::new().await?;
-///     let client = instance.new_client().await?;
-///     // Use client for scraping
-///     client.close().await?;
-///     Ok(())
-/// }
-/// ```
 pub struct ChromeInstance {
+    /// Handle to the spawned chromedriver child process.
     process: Child,
-    url: String,
+    /// WebDriver endpoint sessions connect to, in the form `http://localhost:<port>`.
+    base_url: String,
 }

 impl ChromeInstance {
-    /// Creates a new ChromeInstance by spawning a chromedriver process on a random port.
-    /// 
-    /// This function spawns chromedriver with `--port=0` to let it choose an available port,
-    /// reads the stdout to extract the listening URL, and returns the instance if successful.
-    /// 
+    /// Creates a new ChromeInstance by spawning chromedriver on a random port.
+    ///
+    /// This spawns `chromedriver --port=0` to avoid port conflicts, reads stdout to extract
+    /// the listening address, and waits for the success message. If timeout occurs or
+    /// spawning fails, returns an error with context.
+    ///
    /// # Errors
-    /// 
-    /// Returns an error if:
-    /// - chromedriver cannot be spawned (e.g., not found in PATH).
-    /// - Failed to read stdout or parse the listening URL within a reasonable time.
+    ///
+    /// Returns an error if chromedriver fails to spawn (e.g., not in PATH, version mismatch),
+    /// if the process exits early, or if the address/success message isn't found within 30s.
    pub async fn new() -> Result<Self> {
-        let mut child = Command::new("chromedriver")
-            .arg("--port=0")
+        let mut command = Command::new("chromedriver-win64/chromedriver.exe");
+        command
+            .arg("--port=0")  // Use random available port to support pooling
            .stdout(Stdio::piped())
-            .stderr(Stdio::piped())
-            .spawn()
-            .context("Failed to spawn chromedriver process")?;
+            .stderr(Stdio::piped());

-        let stdout = child.stdout.take().context("Failed to take stdout")?;
-        let mut reader = BufReader::new(stdout);
-        let mut line = String::new();
+        let mut process = command
+            .spawn()
+            .context("Failed to spawn chromedriver. Ensure it's installed and in PATH.")?;
+
+        let mut stdout = BufReader::new(
+            process.stdout.take().context("Failed to capture stdout")?
+        ).lines();
+
+        let mut stderr = BufReader::new(
+            process.stderr.take().context("Failed to capture stderr")?
+        ).lines();

        let start_time = std::time::Instant::now();
-        let timeout = Duration::from_secs(10);
+        let mut address: Option<String> = None;
+        let mut success = false;

-        loop {
-            if start_time.elapsed() > timeout {
-                let _ = child.kill().await;
-                return Err(anyhow!("Timeout waiting for chromedriver to start"));
+        // Log stderr in background for debugging
+        tokio::spawn(async move {
+            while let Ok(Some(line)) = stderr.next_line().await {
+                eprintln!("ChromeDriver stderr: {}", line);
            }
+        });

-            line.clear();
-            if reader.read_line(&mut line).await.context("Failed to read line from stdout")? == 0 {
-                // EOF reached unexpectedly
-                let mut stderr_output = String::new();
-                if let Some(mut stderr) = child.stderr.take() {
-                    let mut stderr_reader = BufReader::new(&mut stderr);
-                    let mut stderr_line = String::new();
-                    while stderr_reader.read_line(&mut stderr_line).await? > 0 {
-                        stderr_output.push_str(&stderr_line);
-                        stderr_line.clear();
-                    }
+        // Wait for address and success (up to 30s)
+        while start_time.elapsed() < Duration::from_secs(30) {
+            if let Ok(Ok(Some(line))) =
+                timeout(Duration::from_secs(1), stdout.next_line()).await
+            {
+                if let Some(addr) = parse_chromedriver_address(&line) {
+                    address = Some(addr.to_string());
                }
-                let _ = child.kill().await;
-                return Err(anyhow!("Chromedriver exited unexpectedly. Stderr: {}", stderr_output));
-            }

-            if let Some(url) = Self::extract_url(&line) {
-                return Ok(Self {
-                    process: child,
-                    url,
-                });
+                if line.contains("ChromeDriver was started successfully") {
+                    success = true;
+                }
+
+                if let (Some(addr), true) = (&address, success) {
+                    return Ok(Self {
+                        process,
+                        base_url: addr.clone(),
+                    });
+                }
            }

            sleep(Duration::from_millis(100)).await;
        }
+
+        // Cleanup on failure
+        let _ = process.kill().await;
+        Err(anyhow!("Timeout: ChromeDriver did not start within 30 seconds. Check version match with Chrome browser and system resources."))
    }

-    /// Extracts the listening URL from chromedriver's output line.
-    /// 
-    /// Looks for lines like "Starting ChromeDriver ... port=XXXX" or "Listening on 127.0.0.1:XXXX".
-    /// Returns the full URL like "http://127.0.0.1:XXXX" if found.
-    fn extract_url(line: &str) -> Option<String> {
-        if line.contains("Listening on") || line.contains("port=") {
-            // Simple regex-like parsing; adjust based on actual output
-            let parts: Vec<&str> = line.split_whitespace().collect();
-            for part in parts {
-                if part.starts_with("127.0.0.1:") || part.starts_with("localhost:") {
-                    return Some(format!("http://{}", part));
-                } else if part.starts_with("port=") {
-                    let port = part.split('=').nth(1)?;
+    /// Opens a fresh browser session (client) on this ChromeDriver instance.
+    ///
+    /// The session is requested with the capabilities from `chrome_args` and connects
+    /// to this instance's `base_url`. Each session is independent and can be closed
+    /// without affecting the driver.
+    pub async fn new_session(&self) -> Result<Client> {
+        let capabilities = Self::chrome_args();
+
+        let mut builder = ClientBuilder::native();
+        builder.capabilities(capabilities);
+
+        let session = builder.connect(&self.base_url).await;
+        session.context("Failed to create new session")
+    }
+
+    /// Builds the capability map sent when a session is opened.
+    ///
+    /// Only the `goog:chromeOptions` entry is set: command-line switches, excluded
+    /// switches, the automation-extension flag and a notification preference.
+    fn chrome_args() -> Map<String, Value> {
+        let capabilities = serde_json::json!({
+            "goog:chromeOptions": {
+                "args": [
+                    "--headless=new",
+                    "--disable-gpu",
+                    "--no-sandbox",
+                    "--disable-dev-shm-usage",
+                    "--disable-infobars",
+                    "--disable-extensions",
+                    "--disable-popup-blocking",
+                    "--disable-notifications",
+                    "--disable-logging",
+                    "--disable-autofill",
+                    "--disable-features=TranslateUI,OptimizationGuideModelDownloading",
+                    "--window-size=1920,1080",
+                    "--disable-blink-features=AutomationControlled",
+                    "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
+                ],
+                "excludeSwitches": ["enable-logging", "enable-automation"],
+                "useAutomationExtension": false,
+                "prefs": {
+                    "profile.default_content_setting_values.notifications": 2
+                }
+            }
+        });
+
+        // The literal above is a JSON object, so the fallback arm is never expected to run.
+        match capabilities {
+            Value::Object(map) => map,
+            _ => panic!("Capabilities should be a JSON object"),
+        }
+    }
+}
+
+/// Parses the ChromeDriver address from a log line.
+///
+/// First looks for the "Starting ChromeDriver ... on port XXXX" line and extracts the
+/// port. Otherwise, if the line mentions "port" (case-insensitive), the first
+/// whitespace-separated word that holds a port number in `1024..=65535`, ignoring
+/// surrounding non-digit characters, is used.
+///
+/// Returns `Some("http://localhost:XXXX")` if found, else `None`.
+fn parse_chromedriver_address(line: &str) -> Option<String> {
+    if line.contains("Starting ChromeDriver") {
+        if let Some(port_str) = line.split("on port ").nth(1) {
+            if let Some(port) = port_str.split_whitespace().next() {
+                if port.parse::<u16>().is_ok() {
                     return Some(format!("http://localhost:{}", port));
                 }
             }
         }
-        None
     }
-
-    /// Creates a new fantoccini Client connected to this chromedriver instance.
-    /// 
-    /// # Errors
-    /// 
-    /// Returns an error if connection to the WebDriver URL fails.
-    pub async fn new_client(&self) -> Result<Client> {
-        ClientBuilder::rustls()
-            .connect(&self.url)
-            .await
-            .context("Failed to connect to chromedriver instance")
+    // Fallback for other formats (e.g., explicit port mentions)
+    for word in line.split_whitespace() {
+        // Strip punctuation such as a trailing '.'; only ASCII digits can parse as u16.
+        if let Ok(port) = word.trim_matches(|c: char| !c.is_ascii_digit()).parse::<u16>() {
+            // Accept every non-privileged port. The upper bound is implied by `u16`, so
+            // 65535 - which `--port=0` may legitimately be assigned - is not rejected.
+            if port >= 1024 && line.to_lowercase().contains("port") {
+                return Some(format!("http://localhost:{}", port));
+            }
+        }
     }
+    None
 }

 impl Drop for ChromeInstance {
     fn drop(&mut self) {
-        // Attempt to kill the process synchronously; for async, caller should handle if needed
-        if let Ok(status) = self.process.try_wait() {
-            if status.is_none() {
-                let _ = self.process.start_kill();
-            }
-        }
+        // Best-effort termination: `start_kill` only sends the kill request and returns
+        // immediately. An error (e.g. the process has already exited) is deliberately
+        // ignored. No sleep here: `drop` can run on an async runtime worker, and a
+        // blocking pause would stall every task scheduled on that thread.
+        let _ = self.process.start_kill();
     }
 }

-/// Represents a scrape task that can be executed asynchronously.
+/// Simplified task execution - now uses the pool pattern.
 /// 
-/// This struct encapsulates the URL to scrape and a parse function that processes the page
-/// using the provided Client. The parse function is async and returns a user-defined type T.
-/// 
-/// # Type Parameters
-/// 
-/// * `T` - The type of data returned by the parse function.
-/// 
-/// # Examples
-/// 
-/// ```no_run
-/// use crate::scraper::webdriver::ScrapeTask;
-/// use fantoccini::Client;
-/// use anyhow::Result;
-/// use std::pin::Pin;
-/// 
-/// async fn example_parse(_client: &Client) -> Result<String> {
-///     Ok("Parsed data".to_string())
-/// }
-/// 
-/// #[tokio::main]
-/// async fn main() -> Result<()> {
-///     let task: ScrapeTask<String> = ScrapeTask::new(
-///         "https://example.com".to_string(),
-///         |client| Box::pin(example_parse(client)),
-///     );
-///     let result = task.execute().await?;
-///     println!("{}", result);
-///     Ok(())
-/// }
-/// ```
+/// For backwards compatibility with existing code.
 pub struct ScrapeTask<T> {
+    /// Address passed to the pool together with `parse` when the task is executed.
     url: String,
-    parse: Box<dyn FnOnce(&Client) -> Pin<Box<dyn std::future::Future<Output = Result<T>> + Send + 'static>> + Send + 'static>,
+    /// Type-erased parse callback: takes ownership of the session `Client` and
+    /// returns a boxed future that resolves to `Result<T>`.
+    parse: Box<dyn FnOnce(Client) -> Pin<Box<dyn std::future::Future<Output = Result<T>> + Send>> + Send>,
 }

 impl<T: Send + 'static> ScrapeTask<T> {
-    /// Creates a new ScrapeTask with the given URL and parse function.
-    /// 
-    /// The parse function takes a &Client and returns a future resolving to Result<T>.
-    pub fn new(
-        url: String,
-        parse: impl FnOnce(&Client) -> Pin<Box<dyn std::future::Future<Output = Result<T>> + Send>> + Send + 'static,
-    ) -> Self {
+    /// Bundles a URL with the parse callback to run against it.
+    ///
+    /// `parse` receives the session `Client` by value; its future is boxed so the task
+    /// can be stored without naming the concrete closure or future types.
+    pub fn new<F, Fut>(url: String, parse: F) -> Self
+    where
+        F: FnOnce(Client) -> Fut + Send + 'static,
+        Fut: std::future::Future<Output = Result<T>> + Send + 'static,
+    {
         Self {
             url,
-            parse: Box::new(parse),
+            parse: Box::new(move |session| Box::pin(parse(session))),
         }
     }

-    /// Executes the scrape task by starting a new ChromeInstance, connecting a client,
-    /// navigating to the URL, running the parse function, and cleaning up.
-    /// 
-    /// This method ensures isolation by using a dedicated chromedriver instance per task.
-    /// 
-    /// # Errors
-    /// 
-    /// Returns an error if:
-    /// - Failed to start chromedriver instance.
-    /// - Failed to connect client or navigate to URL.
-    /// - Parse function returns an error.
-    /// - Failed to close the client or kill the process.
-    pub async fn execute(self) -> Result<T> {
-        let instance = ChromeInstance::new().await.context("Failed to create ChromeInstance")?;
-        let client = instance.new_client().await.context("Failed to create client")?;
-
-        client.goto(&self.url).await.context("Failed to navigate to URL")?;
-
-        // Optional: Add common prep like rejecting cookies, waiting for elements, etc.
-        // This can be customized per task if needed.
-
-        let result = (self.parse)(&client).await;
-
-        client.close().await.context("Failed to close client")?;
-
-        // Instance drops here, killing the process
-
-        result
+    /// Runs this task on `pool`, consuming the task.
+    ///
+    /// The stored URL and parse callback are forwarded to `ChromeDriverPool::execute`,
+    /// whose result is returned unchanged.
+    pub async fn execute_with_pool(self, pool: &ChromeDriverPool) -> Result<T> {
+        let Self { url, parse } = self;
+
+        // Wrapped in an async block so the stored callback is invoked only once the
+        // pool starts polling the returned future.
+        let run = move |session: Client| async move { parse(session).await };
+
+        pool.execute(url, run).await
     }
 }