Compare commits
9 Commits
v0.1.25
...
e02914516d
| Author | SHA1 | Date | |
|---|---|---|---|
| e02914516d | |||
| bf90d3ceb9 | |||
| a8ccb0521a | |||
| c90a276dca | |||
| dc4c23f9d9 | |||
| 3182d57539 | |||
| 8c1ef7f9f6 | |||
| 16020eea50 | |||
| 432a798210 |
@@ -12,16 +12,16 @@
|
||||
/// These functions are called from the main agent loop and background tasks. All network operations are asynchronous and robust to transient failures.
|
||||
use std::time::Duration;
|
||||
|
||||
use crate::docker::container;
|
||||
use crate::docker::serverclientcomm::handle_server_message;
|
||||
use crate::hardware::HardwareInfo;
|
||||
use crate::models::{
|
||||
Acknowledgment, DockerMetricDto, DockerRegistrationDto, HeartbeatDto, IdResponse, MetricDto,
|
||||
RegistrationDto, ServerMessage,
|
||||
Acknowledgment, DockerContainer, DockerMetricDto, DockerRegistrationDto, HeartbeatDto,
|
||||
IdResponse, MetricDto, RegistrationDto, ServerMessage,
|
||||
};
|
||||
|
||||
use anyhow::Result;
|
||||
use reqwest::{Client, StatusCode};
|
||||
use serde::Serialize;
|
||||
use std::error::Error;
|
||||
use tokio::time::sleep;
|
||||
|
||||
@@ -153,10 +153,45 @@ async fn get_server_id_by_ip(
|
||||
}
|
||||
}
|
||||
|
||||
/// Broadcasts Docker container information to the monitoring server for service discovery.
|
||||
///
|
||||
/// This function sends the current Docker container configuration to the server
|
||||
/// to register available containers and enable service monitoring. It will
|
||||
/// continuously retry until successful, making it suitable for initial
|
||||
/// registration scenarios.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `base_url` - The base URL of the monitoring server API (e.g., "https://monitoring.example.com")
|
||||
/// * `server_id` - The ID of the server to associate the containers with
|
||||
/// * `container_dto` - Mutable reference to Docker container information for broadcast
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// * `Ok(())` - When container information is successfully broadcasted to the server
|
||||
/// * `Err(Box<dyn Error + Send + Sync>)` - If an unrecoverable error occurs (though the function typically retries on transient failures)
|
||||
///
|
||||
/// # Behavior
|
||||
///
|
||||
/// This function operates in a retry loop with the following characteristics:
|
||||
///
|
||||
/// - **Retry Logic**: Attempts broadcast every 10 seconds until successful
|
||||
/// - **Mutation**: Modifies the `container_dto` to set the `server_id` before sending
|
||||
/// - **TLS**: Accepts invalid TLS certificates for development environments
|
||||
/// - **Logging**: Provides detailed console output about broadcast attempts and results
|
||||
///
|
||||
/// # Errors
|
||||
///
|
||||
/// This function may return an error in the following cases:
|
||||
///
|
||||
/// * **HTTP Client Creation**: Failed to create HTTP client with TLS configuration
|
||||
/// * **Network Issues**: Persistent connection failures to the backend server
|
||||
/// * **Server Errors**: Backend returns non-success HTTP status codes repeatedly
|
||||
/// * **JSON Serialization**: Cannot serialize container data (should be rare with proper DTOs)
|
||||
pub async fn broadcast_docker_containers(
|
||||
base_url: &str,
|
||||
server_id: u16,
|
||||
container_dto: &mut DockerRegistrationDto,
|
||||
container_dto: &DockerRegistrationDto,
|
||||
) -> Result<(), Box<dyn Error + Send + Sync>> {
|
||||
// First get local IP
|
||||
println!("Preparing to broadcast docker containers...");
|
||||
@@ -166,8 +201,8 @@ pub async fn broadcast_docker_containers(
|
||||
.build()?;
|
||||
|
||||
// Prepare registration data
|
||||
let container_dto = container_dto;
|
||||
container_dto.server_id = server_id;
|
||||
let mut broadcast_data = container_dto.clone();
|
||||
broadcast_data.server_id = server_id;
|
||||
|
||||
// Try to register (will retry on failure)
|
||||
loop {
|
||||
|
||||
@@ -129,27 +129,84 @@ impl DockerManager {
|
||||
/// Collects Docker metrics for all containers
|
||||
pub async fn collect_metrics(&self) -> Result<DockerMetricDto, Box<dyn Error + Send + Sync>> {
|
||||
let containers = self.get_containers().await?;
|
||||
let (cpu_stats, net_stats, mem_stats) = stats::get_container_stats(&self.docker).await?;
|
||||
if let Some(first_container) = containers.first() {
|
||||
println!("Debug: Testing stats for container {}", first_container.id);
|
||||
let _ = self.debug_container_stats(&first_container.id).await;
|
||||
}
|
||||
|
||||
// Get stats with proper error handling
|
||||
let stats_result = stats::get_container_stats(&self.docker).await;
|
||||
let (cpu_stats, net_stats, mem_stats) = match stats_result {
|
||||
Ok(stats) => stats,
|
||||
Err(e) => {
|
||||
eprintln!("Warning: Failed to get container stats: {}", e);
|
||||
// Return empty stats instead of failing completely
|
||||
(Vec::new(), Vec::new(), Vec::new())
|
||||
}
|
||||
};
|
||||
|
||||
println!(
|
||||
"Debug: Found {} containers, {} CPU stats, {} network stats, {} memory stats",
|
||||
containers.len(),
|
||||
cpu_stats.len(),
|
||||
net_stats.len(),
|
||||
mem_stats.len()
|
||||
);
|
||||
|
||||
let container_infos_total: Vec<_> = containers
|
||||
.into_iter()
|
||||
.map(|container| {
|
||||
// Use short ID for matching (first 12 chars)
|
||||
let container_short_id = if container.id.len() > 12 {
|
||||
&container.id[..12]
|
||||
} else {
|
||||
&container.id
|
||||
};
|
||||
|
||||
let cpu = cpu_stats
|
||||
.iter()
|
||||
.find(|c| c.container_id == Some(container.id.clone()))
|
||||
.find(|c| {
|
||||
c.container_id
|
||||
.as_ref()
|
||||
.map(|id| id.starts_with(container_short_id))
|
||||
.unwrap_or(false)
|
||||
})
|
||||
.cloned();
|
||||
|
||||
let network = net_stats
|
||||
.iter()
|
||||
.find(|n| n.container_id == Some(container.id.clone()))
|
||||
.find(|n| {
|
||||
n.container_id
|
||||
.as_ref()
|
||||
.map(|id| id.starts_with(container_short_id))
|
||||
.unwrap_or(false)
|
||||
})
|
||||
.cloned();
|
||||
|
||||
let ram = mem_stats
|
||||
.iter()
|
||||
.find(|m| m.container_id == Some(container.id.clone()))
|
||||
.find(|m| {
|
||||
m.container_id
|
||||
.as_ref()
|
||||
.map(|id| id.starts_with(container_short_id))
|
||||
.unwrap_or(false)
|
||||
})
|
||||
.cloned();
|
||||
|
||||
// Debug output for this container
|
||||
if cpu.is_none() || network.is_none() || ram.is_none() {
|
||||
println!(
|
||||
"Debug: Container {} - CPU: {:?}, Network: {:?}, RAM: {:?}",
|
||||
container_short_id,
|
||||
cpu.is_some(),
|
||||
network.is_some(),
|
||||
ram.is_some()
|
||||
);
|
||||
}
|
||||
|
||||
DockerContainerInfo {
|
||||
container: Some(container),
|
||||
status: None, // Status can be fetched if needed
|
||||
status: None,
|
||||
cpu,
|
||||
network,
|
||||
ram,
|
||||
@@ -159,43 +216,57 @@ impl DockerManager {
|
||||
|
||||
let container_infos: Vec<DockerCollectMetricDto> = container_infos_total
|
||||
.into_iter()
|
||||
.map(|info| DockerCollectMetricDto {
|
||||
id: Some(info.container.unwrap().id).unwrap_or("".to_string()),
|
||||
cpu: info
|
||||
.cpu
|
||||
.unwrap()
|
||||
.cpu_usage_percent
|
||||
.map(|load| DockerContainerCpuDto {
|
||||
cpu_load: Some(load),
|
||||
})
|
||||
.unwrap_or(DockerContainerCpuDto { cpu_load: None }),
|
||||
ram: info
|
||||
.ram
|
||||
.unwrap()
|
||||
.memory_usage_percent
|
||||
.map(|load| DockerContainerRamDto {
|
||||
cpu_load: Some(load),
|
||||
})
|
||||
.unwrap_or(DockerContainerRamDto { cpu_load: None }),
|
||||
network: DockerContainerNetworkDto {
|
||||
net_in: info
|
||||
.network
|
||||
.as_ref()
|
||||
.unwrap()
|
||||
.rx_bytes
|
||||
.map(|bytes| bytes as f64)
|
||||
.or(Some(0.0)),
|
||||
net_out: info
|
||||
.network
|
||||
.unwrap()
|
||||
.tx_bytes
|
||||
.map(|bytes| bytes as f64)
|
||||
.or(Some(0.0)),
|
||||
},
|
||||
.filter_map(|info| {
|
||||
let container = match info.container {
|
||||
Some(c) => c,
|
||||
None => {
|
||||
eprintln!("Warning: Container info missing container data, skipping");
|
||||
return None;
|
||||
}
|
||||
};
|
||||
|
||||
// Safely handle CPU data with defaults
|
||||
let cpu_dto = if let Some(cpu) = info.cpu {
|
||||
DockerContainerCpuDto {
|
||||
cpu_load: cpu.cpu_usage_percent,
|
||||
}
|
||||
} else {
|
||||
DockerContainerCpuDto { cpu_load: None }
|
||||
};
|
||||
|
||||
// Safely handle RAM data with defaults
|
||||
let ram_dto = if let Some(ram) = info.ram {
|
||||
DockerContainerRamDto {
|
||||
ram_load: ram.memory_usage_percent,
|
||||
}
|
||||
} else {
|
||||
DockerContainerRamDto { ram_load: None }
|
||||
};
|
||||
|
||||
// Safely handle network data with defaults
|
||||
let network_dto = if let Some(net) = info.network {
|
||||
DockerContainerNetworkDto {
|
||||
net_in: net.rx_bytes.map(|bytes| bytes as f64),
|
||||
net_out: net.tx_bytes.map(|bytes| bytes as f64),
|
||||
}
|
||||
} else {
|
||||
DockerContainerNetworkDto {
|
||||
net_in: None,
|
||||
net_out: None,
|
||||
}
|
||||
};
|
||||
|
||||
Some(DockerCollectMetricDto {
|
||||
id: container.id,
|
||||
cpu: cpu_dto,
|
||||
ram: ram_dto,
|
||||
network: network_dto,
|
||||
})
|
||||
})
|
||||
.collect();
|
||||
|
||||
let dto = DockerMetricDto {
|
||||
server_id: 0,
|
||||
server_id: 0, // This should be set by the caller
|
||||
containers: serde_json::to_string(&container_infos)?,
|
||||
};
|
||||
|
||||
@@ -207,13 +278,48 @@ impl DockerManager {
|
||||
) -> Result<DockerRegistrationDto, Box<dyn Error + Send + Sync>> {
|
||||
let containers = self.get_containers().await?;
|
||||
let dto = DockerRegistrationDto {
|
||||
server_id: 0,
|
||||
//container_count,
|
||||
containers: serde_json::to_string(&containers)?,
|
||||
server_id: 0, // This will be set by the caller
|
||||
containers, // Fallback to empty array
|
||||
};
|
||||
|
||||
Ok(dto)
|
||||
}
|
||||
|
||||
/// Debug function to check stats collection for a specific container
|
||||
pub async fn debug_container_stats(
|
||||
&self,
|
||||
container_id: &str,
|
||||
) -> Result<(), Box<dyn Error + Send + Sync>> {
|
||||
println!("=== DEBUG STATS FOR CONTAINER {} ===", container_id);
|
||||
|
||||
let (cpu_info, net_info, mem_info) =
|
||||
stats::get_single_container_stats(&self.docker, container_id).await?;
|
||||
|
||||
println!("CPU Info: {:?}", cpu_info);
|
||||
println!("Network Info: {:?}", net_info);
|
||||
println!("Memory Info: {:?}", mem_info);
|
||||
|
||||
// Also try the individual stats functions
|
||||
println!("--- Individual CPU Stats ---");
|
||||
match stats::cpu::get_single_container_cpu_stats(&self.docker, container_id).await {
|
||||
Ok(cpu) => println!("CPU: {:?}", cpu),
|
||||
Err(e) => println!("CPU Error: {}", e),
|
||||
}
|
||||
|
||||
println!("--- Individual Network Stats ---");
|
||||
match stats::network::get_single_container_network_stats(&self.docker, container_id).await {
|
||||
Ok(net) => println!("Network: {:?}", net),
|
||||
Err(e) => println!("Network Error: {}", e),
|
||||
}
|
||||
|
||||
println!("--- Individual Memory Stats ---");
|
||||
match stats::ram::get_single_container_memory_stats(&self.docker, container_id).await {
|
||||
Ok(mem) => println!("Memory: {:?}", mem),
|
||||
Err(e) => println!("Memory Error: {}", e),
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
// Keep these as utility functions if needed, but they should use DockerManager internally
|
||||
|
||||
@@ -31,10 +31,8 @@ pub mod hardware;
|
||||
pub mod metrics;
|
||||
pub mod models;
|
||||
|
||||
use bollard::Docker;
|
||||
use std::env;
|
||||
use std::error::Error;
|
||||
use std::sync::Arc;
|
||||
use tokio::task::JoinHandle;
|
||||
|
||||
/// Awaits a spawned asynchronous task and flattens its nested `Result` type.
|
||||
@@ -119,7 +117,7 @@ async fn main() -> Result<(), Box<dyn Error + Send + Sync>> {
|
||||
models::DockerRegistrationDto {
|
||||
server_id: 0,
|
||||
//container_count: 0, --- IGNORE ---
|
||||
containers: "[]".to_string(),
|
||||
containers: Vec::new(),
|
||||
}
|
||||
};
|
||||
let _ =
|
||||
@@ -153,7 +151,13 @@ async fn main() -> Result<(), Box<dyn Error + Send + Sync>> {
|
||||
let docker_manager = docker_manager.as_ref().cloned().unwrap();
|
||||
async move {
|
||||
let mut collector = metrics::Collector::new(server_id, ip, docker_manager);
|
||||
collector.run(&server_url).await
|
||||
if let Err(e) = collector.run(&server_url).await {
|
||||
eprintln!("Metrics collection error: {}", e);
|
||||
// Don't panic, just return the error
|
||||
Err(e)
|
||||
} else {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
|
||||
@@ -182,12 +182,10 @@ pub struct Acknowledgment {
|
||||
/// - `image`: Docker image name (string)
|
||||
/// - `Name`: Container name (string)
|
||||
/// - `Status`: Container status ("running", "stopped", etc.)
|
||||
/// - `_net_in`: Network receive rate in **bytes per second (B/s)**
|
||||
/// - `_net_out`: Network transmit rate in **bytes per second (B/s)**
|
||||
/// - `_cpu_load`: CPU usage as a percentage (**0.0–100.0**)
|
||||
#[derive(Debug, Serialize, Clone)]
|
||||
pub struct DockerRegistrationDto {
|
||||
/// Unique server identifier (integer)
|
||||
#[serde(rename = "Server_id")]
|
||||
pub server_id: u16,
|
||||
/// Number of currently running containers
|
||||
// pub container_count: usize, --- IGNORE ---
|
||||
@@ -200,7 +198,8 @@ pub struct DockerRegistrationDto {
|
||||
/// id: unique container ID (first 12 hex digits)
|
||||
/// image: docker image name
|
||||
/// name: container name
|
||||
pub containers: String, // Vec<DockerContainer>,
|
||||
#[serde(rename = "Containers")]
|
||||
pub containers: Vec<DockerContainer>, // Vec<DockerContainer>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize, Clone)]
|
||||
@@ -238,7 +237,7 @@ pub struct DockerContainerCpuDto {
|
||||
|
||||
#[derive(Debug, Serialize, Clone)]
|
||||
pub struct DockerContainerRamDto {
|
||||
pub cpu_load: Option<f64>,
|
||||
pub ram_load: Option<f64>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize, Clone)]
|
||||
@@ -260,6 +259,8 @@ pub struct DockerContainerInfo {
|
||||
#[derive(Debug, Serialize, Clone)]
|
||||
pub struct DockerContainer {
|
||||
pub id: String,
|
||||
#[serde(default)]
|
||||
pub image: Option<String>,
|
||||
#[serde(default)]
|
||||
pub name: Option<String>,
|
||||
}
|
||||
|
||||
44
docker-compose.example.yaml
Normal file
44
docker-compose.example.yaml
Normal file
@@ -0,0 +1,44 @@
|
||||
networks:
|
||||
watcher-network:
|
||||
driver: bridge
|
||||
|
||||
services:
|
||||
watcher:
|
||||
image: git.triggermeelmo.com/watcher/watcher-server:v0.1.11
|
||||
container_name: watcher
|
||||
deploy:
|
||||
resources:
|
||||
limits:
|
||||
memory: 200M
|
||||
restart: unless-stopped
|
||||
env_file: .env
|
||||
ports:
|
||||
- "5000:5000"
|
||||
volumes:
|
||||
- ./watcher-volumes/data:/app/persistence
|
||||
- ./watcher-volumes/dumps:/app/wwwroot/downloads/sqlite
|
||||
- ./watcher-volumes/logs:/app/logs
|
||||
|
||||
watcher-agent:
|
||||
image: git.triggermeelmo.com/donpat1to/watcher-agent:v0.1.28
|
||||
container_name: watcher-agent
|
||||
restart: always
|
||||
privileged: true # Grants full hardware access (use with caution)
|
||||
env_file: .env
|
||||
pid: "host"
|
||||
volumes:
|
||||
# Mount critical system paths for hardware monitoring
|
||||
- /sys:/sys:ro # CPU/GPU temps, sensors
|
||||
- /proc:/proc # Process/CPU stats
|
||||
- /dev:/dev:ro # Disk/GPU device access
|
||||
- /var/run/docker.sock:/var/run/docker.sock # Docker API access
|
||||
- /:/root:ro # Access to for df-command
|
||||
# Application volumes
|
||||
- ./config:/app/config:ro
|
||||
- ./logs:/app/logs
|
||||
network_mode: host # Uses host network (for correct IP/interface detection)
|
||||
healthcheck:
|
||||
test: [ "CMD", "/usr/local/bin/WatcherAgent", "healthcheck" ]
|
||||
interval: 30s
|
||||
timeout: 3s
|
||||
retries: 3
|
||||
@@ -1,23 +1,20 @@
|
||||
watcher-agent:
|
||||
image: git.triggermeelmo.com/donpat1to/watcher-agent:development
|
||||
container_name: watcher-agent
|
||||
restart: always
|
||||
privileged: true # Grants full hardware access (use with caution)
|
||||
networks:
|
||||
watcher-network:
|
||||
driver: bridge
|
||||
|
||||
services:
|
||||
watcher:
|
||||
image: git.triggermeelmo.com/watcher/watcher-server:v0.1.11
|
||||
container_name: watcher
|
||||
deploy:
|
||||
resources:
|
||||
limits:
|
||||
memory: 200M
|
||||
restart: unless-stopped
|
||||
env_file: .env
|
||||
pid: "host"
|
||||
ports:
|
||||
- "5000:5000"
|
||||
volumes:
|
||||
# Mount critical system paths for hardware monitoring
|
||||
- /sys:/sys:ro # CPU/GPU temps, sensors
|
||||
- /proc:/proc # Process/CPU stats
|
||||
- /dev:/dev:ro # Disk/GPU device access
|
||||
- /var/run/docker.sock:/var/run/docker.sock # Docker API access
|
||||
- /:/root:ro # Access to for df-command
|
||||
# Application volumes
|
||||
- ./config:/app/config:ro
|
||||
- ./logs:/app/logs
|
||||
network_mode: host # Uses host network (for correct IP/interface detection)
|
||||
healthcheck:
|
||||
test: ["CMD", "/usr/local/bin/WatcherAgent", "healthcheck"]
|
||||
interval: 30s
|
||||
timeout: 3s
|
||||
retries: 3
|
||||
- ./watcher-volumes/data:/app/persistence
|
||||
- ./watcher-volumes/dumps:/app/wwwroot/downloads/sqlite
|
||||
- ./watcher-volumes/logs:/app/logs
|
||||
|
||||
Reference in New Issue
Block a user