//! # GPU Hardware Module
//!
//! This module provides GPU information collection for WatcherAgent, including load, temperature, and VRAM statistics.
//!
//! ## Responsibilities
//! - **GPU Detection:** Identifies GPU model and capabilities.
//! - **Metric Collection:** Queries GPU load, temperature, and VRAM usage using NVML (NVIDIA only).
//! - **Error Handling:** Graceful fallback if GPU or NVML is unavailable.
//!
//! ## Units
//! - `current_load`: GPU usage as a percentage (**0.0–100.0**)
//! - `current_temp`: GPU temperature in **degrees Celsius (°C)**
//! - `vram_total`: Total VRAM in **bytes**
//! - `vram_used`: Used VRAM in **bytes**

use anyhow::Result;
use nvml_wrapper::Nvml;
use std::error::Error;

/// GPU statistics for the host system.
///
/// # Fields
/// - `name`: GPU model name (string)
/// - `current_load`: GPU usage as a percentage (**0.0–100.0**)
/// - `current_temp`: GPU temperature in **degrees Celsius (°C)**
/// - `vram_total`: Total VRAM in **bytes**
/// - `vram_used`: Used VRAM in **bytes**
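///
/// # Example
///
/// A minimal construction sketch with illustrative placeholder values; real data
/// comes from [`get_gpu_info`]. Every field is an `Option`, so partially available
/// data can be represented. (Marked `ignore` because the values are placeholders.)
///
/// ```ignore
/// let info = GpuInfo {
///     name: Some("NVIDIA GeForce RTX 3080".to_string()),
///     current_load: Some(12.5),           // percent, 0.0–100.0
///     current_temp: Some(48.0),           // °C
///     vram_total: Some(10_737_418_240.0), // bytes (10 GiB)
///     vram_used: Some(2_147_483_648.0),   // bytes (2 GiB)
/// };
/// assert!(info.vram_used <= info.vram_total);
/// ```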
#[derive(Debug)]
pub struct GpuInfo {
    pub name: Option<String>,
    pub current_load: Option<f64>,
    pub current_temp: Option<f64>,
    pub vram_total: Option<f64>,
    pub vram_used: Option<f64>,
}

/// Collects GPU information (load, temperature, VRAM) using NVML.
///
/// This function attempts to query the first NVIDIA GPU using NVML. If unavailable, it returns a fallback with only the detected GPU name.
///
/// # Returns
/// * `Result<GpuInfo, Box<dyn Error + Send + Sync>>` - GPU statistics or fallback if unavailable.
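///
/// # Example
///
/// A minimal usage sketch. The surrounding async runtime and the error-propagation
/// context for `?` are assumed, so the block is not compiled as a doctest:
///
/// ```ignore
/// let gpu = get_gpu_info().await?;
/// if let (Some(name), Some(load)) = (&gpu.name, gpu.current_load) {
///     println!("{name}: {load:.1}% load");
/// }
/// ```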
pub async fn get_gpu_info() -> Result<GpuInfo, Box<dyn Error + Send + Sync>> {
    match get_gpu_metrics() {
        Ok((gpu_temp, gpu_load, vram_used, vram_total)) => {
            let gpu_name = detect_gpu_name();
            Ok(GpuInfo {
                name: Some(gpu_name),
                current_load: Some(gpu_load),
                current_temp: Some(gpu_temp),
                vram_total: Some(vram_total),
                vram_used: Some(vram_used),
            })
        }
        Err(e) => {
            // Graceful fallback: log error, return empty/None values
            eprintln!("GPU info unavailable: {e}");
            Ok(GpuInfo {
                name: Some(detect_gpu_name()),
                current_load: None,
                current_temp: None,
                vram_total: None,
                vram_used: None,
            })
        }
    }
}

/// Queries NVML for GPU metrics: temperature, load, VRAM used/total.
///
/// # Returns
/// * `Result<(f64, f64, f64, f64), Box<dyn Error + Send + Sync>>` - Tuple of (temperature °C, load %, VRAM used bytes, VRAM total bytes).
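///
/// # Example
///
/// A synchronous usage sketch; the tuple order matches the return description above.
/// Not compiled as a doctest, since success requires an NVIDIA GPU and the NVML
/// runtime library:
///
/// ```ignore
/// match get_gpu_metrics() {
///     Ok((temp_c, load_pct, vram_used, vram_total)) => {
///         println!("GPU: {temp_c} °C, {load_pct} % load, {vram_used}/{vram_total} bytes VRAM");
///     }
///     Err(e) => eprintln!("NVML metrics unavailable: {e}"),
/// }
/// ```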
pub fn get_gpu_metrics() -> Result<(f64, f64, f64, f64), Box<dyn Error + Send + Sync>> {
    let nvml = Nvml::init();
    if let Ok(nvml) = nvml {
        if let Ok(device) = nvml.device_by_index(0) {
            let temp = device
                .temperature(nvml_wrapper::enum_wrappers::device::TemperatureSensor::Gpu)
                .unwrap_or(0) as f64;
            let load = device
                .utilization_rates()
                .map(|u| u.gpu as f64)
                .unwrap_or(0.0);
            let mem = device.memory_info().ok();
            let used = mem.clone().map(|m| m.used as f64).unwrap_or(0.0);
            let total = mem.map(|m| m.total as f64).unwrap_or(0.0);
            Ok((temp, load, used, total))
        } else {
            Err(anyhow::anyhow!("No NVIDIA GPU found").into())
        }
    } else {
        Err(anyhow::anyhow!("Failed to initialize NVML").into())
    }
}

/// Resolves a human-readable GPU name, preferring NVML and falling back to OS tools.
fn detect_gpu_name() -> String {
    try_nvml_gpu_name()
        .or_else(fallback_gpu_name)
        .unwrap_or_else(|| "Unknown GPU".to_string())
}

/// Asks NVML for the name of the first GPU, if NVML and a device are available.
fn try_nvml_gpu_name() -> Option<String> {
    let nvml = Nvml::init().ok()?;
    let device = nvml.device_by_index(0).ok()?;
    device.name().ok().map(|s| s.to_string())
}

/// Platform-specific fallback: parses `lshw` output on Linux and `wmic` output on Windows.
fn fallback_gpu_name() -> Option<String> {
    #[cfg(target_os = "linux")]
    {
        let output = std::process::Command::new("lshw")
            .args(&["-C", "display"])
            .output()
            .ok()?;
        String::from_utf8_lossy(&output.stdout)
            .lines()
            .find(|l| l.contains("product:"))
            .map(|l| l.trim().replace("product:", "").trim().to_string())
    }

    #[cfg(target_os = "windows")]
    {
        let output = std::process::Command::new("wmic")
            .args(["path", "win32_VideoController", "get", "name"])
            .output()
            .ok()?;
        String::from_utf8_lossy(&output.stdout)
            .lines()
            .skip(1) // Skip header
            .find(|s| !s.trim().is_empty())
            .map(|s| s.trim().to_string())
    }

    #[cfg(not(any(target_os = "linux", target_os = "windows")))]
    {
        None
    }
}
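
// A minimal smoke-test sketch for the name-detection fallback chain: it only checks
// that `detect_gpu_name` runs without panicking, with or without an NVIDIA GPU
// present, since the chain ends in the literal "Unknown GPU" fallback.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn detect_gpu_name_does_not_panic() {
        // NVML may be missing entirely; the helper must still return some String.
        let _name = detect_gpu_name();
    }
}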