moved data capturing into cache folder

This commit is contained in:
2025-12-05 22:32:42 +01:00
parent 58a498e694
commit a6823dc938
6 changed files with 230 additions and 80 deletions

View File

@@ -1,7 +1,7 @@
// src/corporate/scraper.rs
use super::{types::*, helpers::*, openfigi::*};
//use crate::corporate::openfigi::OpenFigiClient;
use crate::{webdriver::webdriver::*};
use crate::{webdriver::webdriver::*, util::directories::DataPaths, util::logger};
use fantoccini::{Client, Locator};
use scraper::{Html, Selector};
use chrono::{DateTime, Duration, NaiveDate, Utc};
@@ -490,11 +490,19 @@ pub async fn _fetch_latest_gleif_isin_lei_mapping_url(client: &Client) -> anyhow
pub async fn download_isin_lei_csv() -> anyhow::Result<Option<String>> {
let url = "https://mapping.gleif.org/api/v2/isin-lei/9315e3e3-305a-4e71-b062-46714740fa8d/download";
if let Err(e) = std::fs::create_dir_all("data/gleif") {
println!("Failed to create data directory: {e}");
// Initialize DataPaths and create cache/gleif directory
let paths = DataPaths::new(".")?;
let gleif_cache_dir = paths.cache_gleif_dir();
if let Err(e) = std::fs::create_dir_all(&gleif_cache_dir) {
let msg = format!("Failed to create cache/gleif directory: {}", e);
logger::log_error(&msg).await;
println!("{}", msg);
return Ok(None);
}
logger::log_info("Corporate Scraper: Downloading ISIN/LEI mapping from GLEIF...").await;
// Download ZIP and get the filename from Content-Disposition header
let client = match reqwest::Client::builder()
.user_agent(USER_AGENT)
@@ -503,7 +511,9 @@ pub async fn download_isin_lei_csv() -> anyhow::Result<Option<String>> {
{
Ok(c) => c,
Err(e) => {
println!("Failed to create HTTP client: {e}");
let msg = format!("Failed to create HTTP client: {}", e);
logger::log_error(&msg).await;
println!("{}", msg);
return Ok(None);
}
};
@@ -511,11 +521,15 @@ pub async fn download_isin_lei_csv() -> anyhow::Result<Option<String>> {
let resp = match client.get(url).send().await {
Ok(r) if r.status().is_success() => r,
Ok(resp) => {
println!("Server returned HTTP {}", resp.status());
let msg = format!("Server returned HTTP {}", resp.status());
logger::log_error(&msg).await;
println!("{}", msg);
return Ok(None);
}
Err(e) => {
println!("Failed to download ISIN/LEI ZIP: {e}");
let msg = format!("Failed to download ISIN/LEI ZIP: {}", e);
logger::log_error(&msg).await;
println!("{}", msg);
return Ok(None);
}
};
@@ -528,21 +542,30 @@ pub async fn download_isin_lei_csv() -> anyhow::Result<Option<String>> {
.and_then(|s| s.split("filename=").nth(1).map(|f| f.trim_matches('"').to_string()))
.unwrap_or_else(|| "isin_lei.zip".to_string());
// Parse timestamp from filename and convert to DDMMYYYY format
let parsed_filename = parse_gleif_filename(&filename);
logger::log_info(&format!("Corporate Scraper: Downloaded file: {} -> {}", filename, parsed_filename)).await;
let bytes = match resp.bytes().await {
Ok(b) => b,
Err(e) => {
println!("Failed to read ZIP bytes: {e}");
let msg = format!("Failed to read ZIP bytes: {}", e);
logger::log_error(&msg).await;
println!("{}", msg);
return Ok(None);
}
};
let zip_path = format!("data/gleif/{}", filename);
let csv_path = format!("data/gleif/{}", filename.replace(".zip", ".csv"));
let zip_path = gleif_cache_dir.join(&parsed_filename);
let csv_path = gleif_cache_dir.join(parsed_filename.replace(".zip", ".csv"));
if let Err(e) = tokio::fs::write(&zip_path, &bytes).await {
println!("Failed to write ZIP file: {e}");
let msg = format!("Failed to write ZIP file: {}", e);
logger::log_error(&msg).await;
println!("{}", msg);
return Ok(None);
}
logger::log_info(&format!("Corporate Scraper: Saved ZIP to {:?}", zip_path)).await;
// Extract CSV
let archive = match std::fs::File::open(&zip_path)
@@ -550,11 +573,15 @@ pub async fn download_isin_lei_csv() -> anyhow::Result<Option<String>> {
{
Ok(Ok(a)) => a,
Ok(Err(e)) => {
println!("Invalid ZIP: {e}");
let msg = format!("Invalid ZIP: {}", e);
logger::log_error(&msg).await;
println!("{}", msg);
return Ok(None);
}
Err(e) => {
println!("Cannot open ZIP file: {e}");
let msg = format!("Cannot open ZIP file: {}", e);
logger::log_error(&msg).await;
println!("{}", msg);
return Ok(None);
}
};
@@ -568,7 +595,9 @@ pub async fn download_isin_lei_csv() -> anyhow::Result<Option<String>> {
}) {
Some(i) => i,
None => {
println!("ZIP did not contain a CSV file");
let msg = "ZIP did not contain a CSV file";
logger::log_error(msg).await;
println!("{}", msg);
return Ok(None);
}
};
@@ -576,23 +605,58 @@ pub async fn download_isin_lei_csv() -> anyhow::Result<Option<String>> {
let mut csv_file = match archive.by_index(idx) {
Ok(f) => f,
Err(e) => {
println!("Failed to read CSV entry: {e}");
let msg = format!("Failed to read CSV entry: {}", e);
logger::log_error(&msg).await;
println!("{}", msg);
return Ok(None);
}
};
let mut csv_bytes = Vec::new();
if let Err(e) = csv_file.read_to_end(&mut csv_bytes) {
println!("Failed to extract CSV: {e}");
let msg = format!("Failed to extract CSV: {}", e);
logger::log_error(&msg).await;
println!("{}", msg);
return Ok(None);
}
if let Err(e) = tokio::fs::write(&csv_path, &csv_bytes).await {
println!("Failed to save CSV file: {e}");
let msg = format!("Failed to save CSV file: {}", e);
logger::log_error(&msg).await;
println!("{}", msg);
return Ok(None);
}
Ok(Some(csv_path))
let msg = format!("✓ ISIN/LEI CSV extracted: {:?}", csv_path);
println!("{}", msg);
logger::log_info(&msg).await;
Ok(Some(csv_path.to_string_lossy().to_string()))
}
/// Rewrites the date stamp of a GLEIF ISIN/LEI file name from `YYYYMMDD` to `DDMMYYYY`.
///
/// The first `isin-lei-` marker is located and the eight bytes following it must all be
/// ASCII digits. On a match the result is `isin-lei-<DD><MM><YYYY><ext>`, where `<ext>`
/// is `.zip` when `filename` ends with `.zip` and `.csv` otherwise; any text before the
/// marker and the time component after the date are dropped.
///
/// Example: "isin-lei-20251124T080254.csv" -> "isin-lei-24112025.csv"
///
/// Any name that does not match the pattern is returned unchanged. The function never
/// panics, including on non-ASCII input.
fn parse_gleif_filename(filename: &str) -> String {
    const PREFIX: &str = "isin-lei-";

    let date = filename
        .find(PREFIX)
        // `find` yields a char boundary and PREFIX is ASCII, so this slice is safe.
        .map(|start| &filename[start + PREFIX.len()..])
        // `get` returns `None` (instead of panicking like `rest[0..8]`) when the name is
        // too short or byte 8 falls inside a multi-byte character.
        .and_then(|rest| rest.get(..8))
        // ASCII digits only: `char::is_numeric` would also accept multi-byte numerals,
        // which the fixed byte offsets below cannot split into year/month/day.
        .filter(|digits| digits.bytes().all(|b| b.is_ascii_digit()));

    match date {
        Some(date) => {
            let (year, month_day) = date.split_at(4);
            let (month, day) = month_day.split_at(2);
            let extension = if filename.ends_with(".zip") { ".zip" } else { ".csv" };
            format!("{PREFIX}{day}{month}{year}{extension}")
        }
        // Fallback: keep the original file name when the pattern is not recognised.
        None => filename.to_string(),
    }
}