added creating CompanyInfo mapping
This commit is contained in:
@@ -2,218 +2,268 @@
|
||||
|
||||
use anyhow::{anyhow, Context, Result};
|
||||
use fantoccini::{Client, ClientBuilder};
|
||||
use std::process::{Stdio};
|
||||
use serde_json::{Map, Value};
|
||||
use std::process::Stdio;
|
||||
use std::sync::Arc;
|
||||
use tokio::io::{AsyncBufReadExt, BufReader};
|
||||
use tokio::process::{Child, Command};
|
||||
use tokio::time::{sleep, Duration};
|
||||
use tokio::sync::{Mutex, Semaphore};
|
||||
use tokio::time::{Duration, sleep, timeout};
|
||||
use std::pin::Pin;
|
||||
|
||||
/// Manages a pool of ChromeDriver instances for parallel scraping.
|
||||
///
|
||||
/// This struct maintains multiple ChromeDriver processes and allows controlled
|
||||
/// concurrent access via a semaphore. Instances are reused across tasks to avoid
|
||||
/// the overhead of spawning new processes.
|
||||
pub struct ChromeDriverPool {
|
||||
instances: Vec<Arc<Mutex<ChromeInstance>>>,
|
||||
semaphore: Arc<Semaphore>,
|
||||
}
|
||||
|
||||
impl ChromeDriverPool {
|
||||
/// Creates a new pool with the specified number of ChromeDriver instances.
|
||||
///
|
||||
/// # Arguments
|
||||
/// * `pool_size` - Number of concurrent ChromeDriver instances to maintain
|
||||
pub async fn new(pool_size: usize) -> Result<Self> {
|
||||
let mut instances = Vec::with_capacity(pool_size);
|
||||
|
||||
println!("Initializing ChromeDriver pool with {} instances...", pool_size);
|
||||
|
||||
for i in 0..pool_size {
|
||||
match ChromeInstance::new().await {
|
||||
Ok(instance) => {
|
||||
println!(" ✓ Instance {} ready", i + 1);
|
||||
instances.push(Arc::new(Mutex::new(instance)));
|
||||
}
|
||||
Err(e) => {
|
||||
eprintln!(" ✗ Failed to create instance {}: {}", i + 1, e);
|
||||
// Clean up already created instances
|
||||
drop(instances);
|
||||
return Err(e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(Self {
|
||||
instances,
|
||||
semaphore: Arc::new(Semaphore::new(pool_size)),
|
||||
})
|
||||
}
|
||||
|
||||
/// Executes a scrape task using an available instance from the pool.
|
||||
pub async fn execute<T, F, Fut>(&self, url: String, parse: F) -> Result<T>
|
||||
where
|
||||
T: Send + 'static,
|
||||
F: FnOnce(Client) -> Fut + Send + 'static,
|
||||
Fut: std::future::Future<Output = Result<T>> + Send + 'static,
|
||||
{
|
||||
// Acquire semaphore permit
|
||||
let _permit = self.semaphore.acquire().await
|
||||
.map_err(|_| anyhow!("Semaphore closed"))?;
|
||||
|
||||
// Find an available instance (round-robin or first available)
|
||||
let instance = self.instances[0].clone(); // Simple: use first, could be round-robin
|
||||
let mut guard = instance.lock().await;
|
||||
|
||||
// Create a new session for this task
|
||||
let client = guard.new_session().await?;
|
||||
|
||||
// Release lock while we do the actual scraping
|
||||
drop(guard);
|
||||
|
||||
// Navigate and parse
|
||||
client.goto(&url).await.context("Failed to navigate")?;
|
||||
let result = timeout(Duration::from_secs(60), parse(client))
|
||||
.await
|
||||
.context("Parse function timed out after 60s")??;
|
||||
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
pub fn get_number_of_instances (&self) -> usize {
|
||||
self.instances.len()
|
||||
}
|
||||
}
|
||||
|
||||
/// Represents a single instance of chromedriver process.
|
||||
///
|
||||
/// This struct manages the lifecycle of a chromedriver process, starting it on a random available port
|
||||
/// and providing a way to connect to it via fantoccini Client. Each instance is independent, allowing
|
||||
/// for isolated scraping sessions without interference.
|
||||
///
|
||||
/// # Examples
|
||||
///
|
||||
/// ```no_run
|
||||
/// use crate::scraper::webdriver::ChromeInstance;
|
||||
///
|
||||
/// #[tokio::main]
|
||||
/// async fn main() -> anyhow::Result<()> {
|
||||
/// let instance = ChromeInstance::new().await?;
|
||||
/// let client = instance.new_client().await?;
|
||||
/// // Use client for scraping
|
||||
/// client.close().await?;
|
||||
/// Ok(())
|
||||
/// }
|
||||
/// ```
|
||||
pub struct ChromeInstance {
|
||||
process: Child,
|
||||
url: String,
|
||||
base_url: String,
|
||||
}
|
||||
|
||||
impl ChromeInstance {
|
||||
/// Creates a new ChromeInstance by spawning a chromedriver process on a random port.
|
||||
///
|
||||
/// This function spawns chromedriver with `--port=0` to let it choose an available port,
|
||||
/// reads the stdout to extract the listening URL, and returns the instance if successful.
|
||||
///
|
||||
/// Creates a new ChromeInstance by spawning chromedriver with random port.
|
||||
///
|
||||
/// This spawns `chromedriver --port=0` to avoid port conflicts, reads stdout to extract
|
||||
/// the listening address, and waits for the success message. If timeout occurs or
|
||||
/// spawning fails, returns an error with context.
|
||||
///
|
||||
/// # Errors
|
||||
///
|
||||
/// Returns an error if:
|
||||
/// - chromedriver cannot be spawned (e.g., not found in PATH).
|
||||
/// - Failed to read stdout or parse the listening URL within a reasonable time.
|
||||
///
|
||||
/// Returns an error if chromedriver fails to spawn (e.g., not in PATH, version mismatch),
|
||||
/// if the process exits early, or if the address/success message isn't found within 30s.
|
||||
pub async fn new() -> Result<Self> {
|
||||
let mut child = Command::new("chromedriver")
|
||||
.arg("--port=0")
|
||||
let mut command = Command::new("chromedriver-win64/chromedriver.exe");
|
||||
command
|
||||
.arg("--port=0") // Use random available port to support pooling
|
||||
.stdout(Stdio::piped())
|
||||
.stderr(Stdio::piped())
|
||||
.spawn()
|
||||
.context("Failed to spawn chromedriver process")?;
|
||||
.stderr(Stdio::piped());
|
||||
|
||||
let stdout = child.stdout.take().context("Failed to take stdout")?;
|
||||
let mut reader = BufReader::new(stdout);
|
||||
let mut line = String::new();
|
||||
let mut process = command
|
||||
.spawn()
|
||||
.context("Failed to spawn chromedriver. Ensure it's installed and in PATH.")?;
|
||||
|
||||
let mut stdout = BufReader::new(
|
||||
process.stdout.take().context("Failed to capture stdout")?
|
||||
).lines();
|
||||
|
||||
let mut stderr = BufReader::new(
|
||||
process.stderr.take().context("Failed to capture stderr")?
|
||||
).lines();
|
||||
|
||||
let start_time = std::time::Instant::now();
|
||||
let timeout = Duration::from_secs(10);
|
||||
let mut address: Option<String> = None;
|
||||
let mut success = false;
|
||||
|
||||
loop {
|
||||
if start_time.elapsed() > timeout {
|
||||
let _ = child.kill().await;
|
||||
return Err(anyhow!("Timeout waiting for chromedriver to start"));
|
||||
// Log stderr in background for debugging
|
||||
tokio::spawn(async move {
|
||||
while let Ok(Some(line)) = stderr.next_line().await {
|
||||
eprintln!("ChromeDriver stderr: {}", line);
|
||||
}
|
||||
});
|
||||
|
||||
line.clear();
|
||||
if reader.read_line(&mut line).await.context("Failed to read line from stdout")? == 0 {
|
||||
// EOF reached unexpectedly
|
||||
let mut stderr_output = String::new();
|
||||
if let Some(mut stderr) = child.stderr.take() {
|
||||
let mut stderr_reader = BufReader::new(&mut stderr);
|
||||
let mut stderr_line = String::new();
|
||||
while stderr_reader.read_line(&mut stderr_line).await? > 0 {
|
||||
stderr_output.push_str(&stderr_line);
|
||||
stderr_line.clear();
|
||||
}
|
||||
// Wait for address and success (up to 30s)
|
||||
while start_time.elapsed() < Duration::from_secs(30) {
|
||||
if let Ok(Ok(Some(line))) =
|
||||
timeout(Duration::from_secs(1), stdout.next_line()).await
|
||||
{
|
||||
if let Some(addr) = parse_chromedriver_address(&line) {
|
||||
address = Some(addr.to_string());
|
||||
}
|
||||
let _ = child.kill().await;
|
||||
return Err(anyhow!("Chromedriver exited unexpectedly. Stderr: {}", stderr_output));
|
||||
}
|
||||
|
||||
if let Some(url) = Self::extract_url(&line) {
|
||||
return Ok(Self {
|
||||
process: child,
|
||||
url,
|
||||
});
|
||||
if line.contains("ChromeDriver was started successfully") {
|
||||
success = true;
|
||||
}
|
||||
|
||||
if let (Some(addr), true) = (&address, success) {
|
||||
return Ok(Self {
|
||||
process,
|
||||
base_url: addr.clone(),
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
sleep(Duration::from_millis(100)).await;
|
||||
}
|
||||
|
||||
// Cleanup on failure
|
||||
let _ = process.kill().await;
|
||||
Err(anyhow!("Timeout: ChromeDriver did not start within 30 seconds. Check version match with Chrome browser and system resources."))
|
||||
}
|
||||
|
||||
/// Extracts the listening URL from chromedriver's output line.
|
||||
///
|
||||
/// Looks for lines like "Starting ChromeDriver ... port=XXXX" or "Listening on 127.0.0.1:XXXX".
|
||||
/// Returns the full URL like "http://127.0.0.1:XXXX" if found.
|
||||
fn extract_url(line: &str) -> Option<String> {
|
||||
if line.contains("Listening on") || line.contains("port=") {
|
||||
// Simple regex-like parsing; adjust based on actual output
|
||||
let parts: Vec<&str> = line.split_whitespace().collect();
|
||||
for part in parts {
|
||||
if part.starts_with("127.0.0.1:") || part.starts_with("localhost:") {
|
||||
return Some(format!("http://{}", part));
|
||||
} else if part.starts_with("port=") {
|
||||
let port = part.split('=').nth(1)?;
|
||||
/// Creates a new browser session (client) from this ChromeDriver instance.
|
||||
/// Each session is independent and can be closed without affecting the driver.
|
||||
pub async fn new_session(&self) -> Result<Client> {
|
||||
ClientBuilder::native()
|
||||
.capabilities(Self::chrome_args())
|
||||
.connect(&self.base_url)
|
||||
.await
|
||||
.context("Failed to create new session")
|
||||
}
|
||||
|
||||
fn chrome_args() -> Map<String, Value> {
|
||||
let args = serde_json::json!({
|
||||
"goog:chromeOptions": {
|
||||
"args": [
|
||||
"--headless=new",
|
||||
"--disable-gpu",
|
||||
"--no-sandbox",
|
||||
"--disable-dev-shm-usage",
|
||||
"--disable-infobars",
|
||||
"--disable-extensions",
|
||||
"--disable-popup-blocking",
|
||||
"--disable-notifications",
|
||||
"--disable-logging",
|
||||
"--disable-autofill",
|
||||
"--disable-features=TranslateUI,OptimizationGuideModelDownloading",
|
||||
"--window-size=1920,1080",
|
||||
"--disable-blink-features=AutomationControlled",
|
||||
"--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
|
||||
],
|
||||
"excludeSwitches": ["enable-logging", "enable-automation"],
|
||||
"useAutomationExtension": false,
|
||||
"prefs": {
|
||||
"profile.default_content_setting_values.notifications": 2
|
||||
}
|
||||
}
|
||||
});
|
||||
args.as_object()
|
||||
.expect("Capabilities should be a JSON object")
|
||||
.clone()
|
||||
}
|
||||
}
|
||||
|
||||
/// Parses the ChromeDriver address from a log line.
///
/// Two formats are recognised:
///
/// 1. The startup banner, `Starting ChromeDriver ... on port XXXX`, where the
///    token right after `on port ` must be a valid `u16`.
/// 2. As a fallback, any line that mentions "port" (case-insensitive) and contains
///    a whitespace-separated word which, after trimming surrounding non-digit
///    characters, is a port number in `1024..=65535`.
///
/// Returns `Some("http://localhost:XXXX")` if a port is found, else `None`.
fn parse_chromedriver_address(line: &str) -> Option<String> {
    if line.contains("Starting ChromeDriver") {
        let banner_port = line
            .split("on port ")
            .nth(1)
            .and_then(|rest| rest.split_whitespace().next())
            .and_then(|token| token.parse::<u16>().ok());
        if let Some(port) = banner_port {
            // Format the parsed number (not the raw token) so inputs such as
            // "+9515" cannot leak into the URL.
            return Some(format!("http://localhost:{}", port));
        }
    }

    // Fallback for other formats (e.g., explicit port mentions). The "port"
    // check is loop-invariant, so it is done once up front.
    if !line.to_lowercase().contains("port") {
        return None;
    }

    line.split_whitespace()
        .filter_map(|word| {
            word.trim_matches(|c: char| !c.is_ascii_digit())
                .parse::<u16>()
                .ok()
        })
        // Skip the well-known range (< 1024): small numbers in a log line are far
        // more likely to be version components than a listening port. The upper
        // bound is already enforced by `u16`.
        .find(|port| *port >= 1024)
        .map(|port| format!("http://localhost:{}", port))
}
|
||||
|
||||
impl Drop for ChromeInstance {
|
||||
fn drop(&mut self) {
|
||||
// Attempt to kill the process synchronously; for async, caller should handle if needed
|
||||
if let Ok(status) = self.process.try_wait() {
|
||||
if status.is_none() {
|
||||
let _ = self.process.start_kill();
|
||||
}
|
||||
}
|
||||
let _ = self.process.start_kill();
|
||||
std::thread::sleep(std::time::Duration::from_millis(100));
|
||||
}
|
||||
}
|
||||
|
||||
/// Represents a scrape task that can be executed asynchronously.
|
||||
/// Simplified task execution - now uses the pool pattern.
|
||||
///
|
||||
/// This struct encapsulates the URL to scrape and a parse function that processes the page
|
||||
/// using the provided Client. The parse function is async and returns a user-defined type T.
|
||||
///
|
||||
/// # Type Parameters
|
||||
///
|
||||
/// * `T` - The type of data returned by the parse function.
|
||||
///
|
||||
/// # Examples
|
||||
///
|
||||
/// ```no_run
|
||||
/// use crate::scraper::webdriver::ScrapeTask;
|
||||
/// use fantoccini::Client;
|
||||
/// use anyhow::Result;
|
||||
/// use std::pin::Pin;
|
||||
///
|
||||
/// async fn example_parse(_client: &Client) -> Result<String> {
|
||||
/// Ok("Parsed data".to_string())
|
||||
/// }
|
||||
///
|
||||
/// #[tokio::main]
|
||||
/// async fn main() -> Result<()> {
|
||||
/// let task: ScrapeTask<String> = ScrapeTask::new(
|
||||
/// "https://example.com".to_string(),
|
||||
/// |client| Box::pin(example_parse(client)),
|
||||
/// );
|
||||
/// let result = task.execute().await?;
|
||||
/// println!("{}", result);
|
||||
/// Ok(())
|
||||
/// }
|
||||
/// ```
|
||||
/// For backwards compatibility with existing code.
|
||||
pub struct ScrapeTask<T> {
|
||||
url: String,
|
||||
parse: Box<dyn FnOnce(&Client) -> Pin<Box<dyn std::future::Future<Output = Result<T>> + Send + 'static>> + Send + 'static>,
|
||||
parse: Box<dyn FnOnce(Client) -> Pin<Box<dyn std::future::Future<Output = Result<T>> + Send>> + Send>,
|
||||
}
|
||||
|
||||
impl<T: Send + 'static> ScrapeTask<T> {
|
||||
/// Creates a new ScrapeTask with the given URL and parse function.
|
||||
///
|
||||
/// The parse function takes a &Client and returns a future resolving to Result<T>.
|
||||
pub fn new(
|
||||
url: String,
|
||||
parse: impl FnOnce(&Client) -> Pin<Box<dyn std::future::Future<Output = Result<T>> + Send>> + Send + 'static,
|
||||
) -> Self {
|
||||
pub fn new<F, Fut>(url: String, parse: F) -> Self
|
||||
where
|
||||
F: FnOnce(Client) -> Fut + Send + 'static,
|
||||
Fut: std::future::Future<Output = Result<T>> + Send + 'static,
|
||||
{
|
||||
Self {
|
||||
url,
|
||||
parse: Box::new(parse),
|
||||
parse: Box::new(move |client| Box::pin(parse(client))),
|
||||
}
|
||||
}
|
||||
|
||||
/// Executes the scrape task by starting a new ChromeInstance, connecting a client,
|
||||
/// navigating to the URL, running the parse function, and cleaning up.
|
||||
///
|
||||
/// This method ensures isolation by using a dedicated chromedriver instance per task.
|
||||
///
|
||||
/// # Errors
|
||||
///
|
||||
/// Returns an error if:
|
||||
/// - Failed to start chromedriver instance.
|
||||
/// - Failed to connect client or navigate to URL.
|
||||
/// - Parse function returns an error.
|
||||
/// - Failed to close the client or kill the process.
|
||||
pub async fn execute(self) -> Result<T> {
|
||||
let instance = ChromeInstance::new().await.context("Failed to create ChromeInstance")?;
|
||||
let client = instance.new_client().await.context("Failed to create client")?;
|
||||
|
||||
client.goto(&self.url).await.context("Failed to navigate to URL")?;
|
||||
|
||||
// Optional: Add common prep like rejecting cookies, waiting for elements, etc.
|
||||
// This can be customized per task if needed.
|
||||
|
||||
let result = (self.parse)(&client).await;
|
||||
|
||||
client.close().await.context("Failed to close client")?;
|
||||
|
||||
// Instance drops here, killing the process
|
||||
|
||||
result
|
||||
/// Executes using a provided pool (more efficient for multiple tasks).
|
||||
pub async fn execute_with_pool(self, pool: &ChromeDriverPool) -> Result<T> {
|
||||
let url = self.url;
|
||||
let parse = self.parse;
|
||||
|
||||
pool.execute(url, move |client| async move {
|
||||
(parse)(client).await
|
||||
}).await
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user