Added parallelized scraping instances for company Yahoo ticker seeding

This commit is contained in:
2025-12-18 13:05:23 +01:00
parent d26e833d93
commit 9c66f0d361
7 changed files with 842 additions and 68 deletions

View File

@@ -2,6 +2,9 @@
use anyhow::{anyhow, Context, Result};
use fantoccini::{Client, ClientBuilder};
use rand::seq::{IndexedRandom, SliceRandom};
use rand::rngs::ThreadRng;
use rand::Rng; // for the RNG trait
use serde_json::{Map, Value};
use std::pin::Pin;
use std::process::Stdio;
@@ -363,6 +366,7 @@ impl ChromeInstance {
}
fn chrome_args(&self) -> Map<String, Value> {
let user_agent = Self::chrome_user_agent();
let mut args = vec![
"--headless=new".to_string(),
"--disable-gpu".to_string(),
@@ -372,14 +376,14 @@ impl ChromeInstance {
"--disable-extensions".to_string(),
"--disable-popup-blocking".to_string(),
"--disable-notifications".to_string(),
"--disable-logging".to_string(),
//"--disable-logging".to_string(),
"--disable-autofill".to_string(),
"--disable-sync".to_string(),
"--disable-default-apps".to_string(),
"--disable-translate".to_string(),
"--window-size=1920,1080".to_string(),
//"--window-size=1920,1080".to_string(),
"--disable-blink-features=AutomationControlled".to_string(),
"--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36".to_string()
format!("--user-agent={}", user_agent),
];
if let Some(ref proxy) = self.proxy_url {
let proxy = proxy.clone();
@@ -397,6 +401,18 @@ impl ChromeInstance {
});
caps.as_object().cloned().unwrap()
}
/// Returns a randomly selected desktop Chrome user-agent string.
///
/// The candidates are a fixed list of three Windows 10 / x64 Chrome
/// user agents (Chrome 122, 123 and 124). Each call draws one entry from
/// that list using the thread-local random number generator, so separate
/// calls may return different strings.
///
/// # Panics
///
/// Panics only if the candidate list yields no element, which cannot occur
/// while the list below stays non-empty.
pub fn chrome_user_agent() -> &'static str {
    // Candidate pool; extend this list to rotate through more browser versions.
    static USER_AGENT_POOL: &[&str] = &[
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.91 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.6312.122 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.6261.129 Safari/537.36",
    ];

    // `ThreadRng::default()` hands back the thread-local generator without
    // going through the deprecated `thread_rng()` free function.
    let mut generator = ThreadRng::default();
    let picked = USER_AGENT_POOL.choose(&mut generator);

    // `choose` yields `Option<&&str>`; strip one reference layer and unwrap.
    picked.copied().unwrap()
}
}
fn parse_chromedriver_address(line: &str) -> Option<String> {