// src/util/integrity.rs //! Content integrity and state lifecycle management module //! //! Features: //! - File and directory hashing (SHA-256) //! - Hash validation against content references //! - State invalidation based on time or validation failures //! - 3-stage data lifecycle: cache → data → storage //! - Inline vs. external hash storage based on size //! - Cascade invalidation when dependencies fail validation use anyhow::{Context, Result}; use chrono::{DateTime, Duration, Utc}; use serde::{Deserialize, Serialize}; use sha2::{Digest, Sha256}; use std::collections::{HashMap, HashSet}; use std::fs; use std::io::{BufReader, Read}; use std::path::{Path, PathBuf}; use tokio::fs as async_fs; use tokio::io::AsyncWriteExt; // ============================================================================ // CONSTANTS & CONFIGURATION // ============================================================================ /// Maximum hash size (in bytes) to store inline in state.jsonl /// Hashes larger than this will be stored in separate files const INLINE_HASH_THRESHOLD: usize = 1024; /// Directory for storing external hash files const HASH_STORAGE_DIR: &str = ".integrity_hashes"; /// File extension for external hash files const HASH_FILE_EXT: &str = ".hash"; // ============================================================================ // DATA STRUCTURES // ============================================================================ /// Represents a content reference that can be hashed #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] #[serde(tag = "type", rename_all = "lowercase")] pub enum ContentReference { /// Single file reference File { path: PathBuf }, /// Directory reference (includes all files recursively) Directory { path: PathBuf, /// Optional: specific files/patterns to include include_patterns: Option>, /// Optional: files/patterns to exclude exclude_patterns: Option>, }, /// Multiple files/directories combined Composite { references: Vec, }, } /// Storage location for hash data #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] #[serde(tag = "storage", rename_all = "lowercase")] pub enum HashStorage { /// Hash stored directly in state.jsonl Inline { hash: String }, /// Hash stored in external file External { hash_file: PathBuf }, } /// Data lifecycle stage #[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Hash)] #[serde(rename_all = "lowercase")] pub enum DataStage { /// Temporary/staged data (fast-changing, short-lived) Cache, /// Processed data (intermediate results, medium-lived) Data, /// Final storage (long-term, stable data) Storage, } impl DataStage { /// Get default TTL (time-to-live) for this stage pub fn default_ttl(&self) -> Duration { match self { DataStage::Cache => Duration::hours(24), // 1 day DataStage::Data => Duration::days(7), // 1 week DataStage::Storage => Duration::days(365), // 1 year } } /// Get suggested revalidation interval for this stage pub fn revalidation_interval(&self) -> Duration { match self { DataStage::Cache => Duration::hours(6), // Every 6 hours DataStage::Data => Duration::days(1), // Daily DataStage::Storage => Duration::days(30), // Monthly } } } /// Enhanced state entry with content integrity tracking #[derive(Debug, Clone, Serialize, Deserialize)] pub struct StateEntry { /// Step/function name pub step_name: String, /// Whether this step is completed pub completed: bool, /// Completion timestamp pub completed_at: Option>, /// Content reference for validation pub content_reference: Option, /// Hash of the content pub content_hash: Option, /// Data lifecycle stage pub data_stage: Option, /// Custom TTL override (if None, uses stage default) pub ttl_override: Option, /// Last validation timestamp pub last_validated_at: Option>, /// Validation status pub validation_status: ValidationStatus, /// Dependencies (other steps that must be valid for this to remain valid) pub dependencies: Vec, } /// Validation status of a state entry #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] #[serde(rename_all = "lowercase")] pub enum ValidationStatus { /// Not yet validated Unknown, /// Validated and content matches hash Valid, /// Validation failed (hash mismatch or content missing) Invalid { reason: String }, /// Expired (beyond TTL) Expired, /// Invalidated due to dependency failure DependencyFailed { failed_dependency: String }, } // ============================================================================ // HASH COMPUTATION // ============================================================================ /// Hash a single file using SHA-256 pub fn hash_file>(path: P) -> Result { let path = path.as_ref(); let file = fs::File::open(path) .with_context(|| format!("Failed to open file: {}", path.display()))?; let mut reader = BufReader::new(file); let mut hasher = Sha256::new(); let mut buffer = [0u8; 8192]; loop { let bytes_read = reader.read(&mut buffer)?; if bytes_read == 0 { break; } hasher.update(&buffer[..bytes_read]); } Ok(format!("{:x}", hasher.finalize())) } /// Hash a directory recursively /// Returns a combined hash of all files in sorted order pub fn hash_directory>( path: P, include_patterns: Option<&[String]>, exclude_patterns: Option<&[String]>, ) -> Result { let path = path.as_ref(); if !path.is_dir() { anyhow::bail!("Path is not a directory: {}", path.display()); } // Collect all files recursively let mut files = Vec::new(); collect_files_recursive(path, &mut files, include_patterns, exclude_patterns)?; // Sort for deterministic hashing files.sort(); if files.is_empty() { return Ok(String::from("d41d8cd98f00b204e9800998ecf8427e")); // MD5 of empty string } // Hash all files and combine let mut combined_hasher = Sha256::new(); for file_path in files { // Include relative path in hash for structure awareness let rel_path = file_path.strip_prefix(path) .unwrap_or(&file_path) .to_string_lossy(); combined_hasher.update(rel_path.as_bytes()); // Hash file content let file_hash = hash_file(&file_path)?; combined_hasher.update(file_hash.as_bytes()); } Ok(format!("{:x}", combined_hasher.finalize())) } /// Collect files recursively with pattern filtering fn collect_files_recursive( dir: &Path, files: &mut Vec, include_patterns: Option<&[String]>, exclude_patterns: Option<&[String]>, ) -> Result<()> { if !dir.is_dir() { return Ok(()); } for entry in fs::read_dir(dir)? { let entry = entry?; let path = entry.path(); // Skip hidden files and directories if let Some(name) = path.file_name() { if name.to_string_lossy().starts_with('.') { continue; } } if path.is_dir() { collect_files_recursive(&path, files, include_patterns, exclude_patterns)?; } else if path.is_file() { // Apply pattern filters if should_include_file(&path, include_patterns, exclude_patterns) { files.push(path); } } } Ok(()) } /// Check if a file should be included based on patterns fn should_include_file( path: &Path, include_patterns: Option<&[String]>, exclude_patterns: Option<&[String]>, ) -> bool { let path_str = path.to_string_lossy(); // Check exclude patterns first if let Some(excludes) = exclude_patterns { for pattern in excludes { if path_str.contains(pattern) || matches_glob(path, pattern) { return false; } } } // Check include patterns if let Some(includes) = include_patterns { for pattern in includes { if path_str.contains(pattern) || matches_glob(path, pattern) { return true; } } return false; // If includes specified but no match } true // Include by default } /// Simple glob pattern matching (supports * and ?) fn matches_glob(path: &Path, pattern: &str) -> bool { let path_str = path.to_string_lossy(); // Convert glob to regex let regex_pattern = pattern .replace(".", "\\.") .replace("*", ".*") .replace("?", "."); if let Ok(re) = regex::Regex::new(&format!("^{}$", regex_pattern)) { re.is_match(&path_str) } else { false } } /// Hash a content reference pub fn hash_content_reference(reference: &ContentReference) -> Result { match reference { ContentReference::File { path } => { hash_file(path) } ContentReference::Directory { path, include_patterns, exclude_patterns } => { hash_directory( path, include_patterns.as_deref(), exclude_patterns.as_deref(), ) } ContentReference::Composite { references } => { let mut combined_hasher = Sha256::new(); for reference in references { let hash = hash_content_reference(reference)?; combined_hasher.update(hash.as_bytes()); } Ok(format!("{:x}", combined_hasher.finalize())) } } } // ============================================================================ // HASH STORAGE MANAGEMENT // ============================================================================ /// Determine storage method based on hash size pub fn determine_hash_storage(hash: &str, base_dir: &Path) -> HashStorage { if hash.len() <= INLINE_HASH_THRESHOLD { HashStorage::Inline { hash: hash.to_string(), } } else { let hash_id = Sha256::digest(hash.as_bytes()); let hash_filename = format!("{:x}{}", hash_id, HASH_FILE_EXT); HashStorage::External { hash_file: base_dir .join(HASH_STORAGE_DIR) .join(hash_filename), } } } /// Store hash externally if needed pub async fn store_hash( hash: &str, storage: &HashStorage, ) -> Result<()> { match storage { HashStorage::Inline { .. } => { // Nothing to do, hash is inline Ok(()) } HashStorage::External { hash_file } => { // Create directory if needed if let Some(parent) = hash_file.parent() { async_fs::create_dir_all(parent).await?; } // Write hash to file let mut file = async_fs::File::create(hash_file).await?; file.write_all(hash.as_bytes()).await?; file.flush().await?; Ok(()) } } } /// Retrieve hash from storage pub async fn retrieve_hash(storage: &HashStorage) -> Result { match storage { HashStorage::Inline { hash } => { Ok(hash.clone()) } HashStorage::External { hash_file } => { async_fs::read_to_string(hash_file) .await .with_context(|| format!("Failed to read hash file: {}", hash_file.display())) } } } // ============================================================================ // VALIDATION // ============================================================================ /// Validate a state entry's content against its hash pub async fn validate_entry(entry: &StateEntry) -> Result { // Check if completed if !entry.completed { return Ok(ValidationStatus::Unknown); } // Check TTL expiration if let Some(completed_at) = entry.completed_at { let ttl = entry.ttl_override .or_else(|| entry.data_stage.map(|s| s.default_ttl())) .unwrap_or_else(|| Duration::days(7)); let expiration = completed_at + ttl; if Utc::now() > expiration { return Ok(ValidationStatus::Expired); } } // Validate content hash if available if let (Some(reference), Some(storage)) = (&entry.content_reference, &entry.content_hash) { // Compute current hash let current_hash = match hash_content_reference(reference) { Ok(hash) => hash, Err(e) => { return Ok(ValidationStatus::Invalid { reason: format!("Failed to compute hash: {}", e), }); } }; // Retrieve stored hash let stored_hash = match retrieve_hash(storage).await { Ok(hash) => hash, Err(e) => { return Ok(ValidationStatus::Invalid { reason: format!("Failed to retrieve stored hash: {}", e), }); } }; // Compare hashes if current_hash != stored_hash { return Ok(ValidationStatus::Invalid { reason: "Hash mismatch".to_string(), }); } } Ok(ValidationStatus::Valid) } /// Validate all state entries and handle cascade invalidation pub async fn validate_all_entries( entries: &mut HashMap, ) -> Result { let mut report = ValidationReport::default(); // First pass: validate each entry independently for (name, entry) in entries.iter_mut() { let status = validate_entry(entry).await?; entry.validation_status = status.clone(); entry.last_validated_at = Some(Utc::now()); match status { ValidationStatus::Valid => report.valid_count += 1, ValidationStatus::Invalid { .. } => { report.invalid_count += 1; report.invalid_entries.push(name.clone()); } ValidationStatus::Expired => { report.expired_count += 1; report.expired_entries.push(name.clone()); } ValidationStatus::Unknown => report.unknown_count += 1, ValidationStatus::DependencyFailed { .. } => {} } } // Second pass: cascade invalidation based on dependencies let mut invalidated = HashSet::new(); for name in &report.invalid_entries { invalidated.insert(name.clone()); } loop { let mut newly_invalidated = Vec::new(); for (name, entry) in entries.iter() { if invalidated.contains(name) { continue; } // Check if any dependency is invalidated for dep in &entry.dependencies { if invalidated.contains(dep) { newly_invalidated.push((name.clone(), dep.clone())); break; } } } if newly_invalidated.is_empty() { break; } for (name, failed_dep) in newly_invalidated { invalidated.insert(name.clone()); report.cascaded_invalidations.push(name.clone()); if let Some(entry) = entries.get_mut(&name) { entry.validation_status = ValidationStatus::DependencyFailed { failed_dependency: failed_dep, }; } } } Ok(report) } /// Validation report #[derive(Debug, Default)] pub struct ValidationReport { pub valid_count: usize, pub invalid_count: usize, pub expired_count: usize, pub unknown_count: usize, pub invalid_entries: Vec, pub expired_entries: Vec, pub cascaded_invalidations: Vec, } impl ValidationReport { pub fn print_summary(&self) { println!("=== Validation Report ==="); println!("Valid: {}", self.valid_count); println!("Invalid: {}", self.invalid_count); println!("Expired: {}", self.expired_count); println!("Unknown: {}", self.unknown_count); if !self.invalid_entries.is_empty() { println!("\nInvalid entries:"); for entry in &self.invalid_entries { println!(" - {}", entry); } } if !self.expired_entries.is_empty() { println!("\nExpired entries:"); for entry in &self.expired_entries { println!(" - {}", entry); } } if !self.cascaded_invalidations.is_empty() { println!("\nCascaded invalidations:"); for entry in &self.cascaded_invalidations { println!(" - {}", entry); } } } } // ============================================================================ // STATE MANAGEMENT // ============================================================================ /// State manager for reading/writing state entries pub struct StateManager { state_path: PathBuf, base_dir: PathBuf, } impl StateManager { pub fn new>(state_path: P, base_dir: P) -> Self { Self { state_path: state_path.as_ref().to_path_buf(), base_dir: base_dir.as_ref().to_path_buf(), } } /// Load all state entries from state.jsonl pub async fn load_entries(&self) -> Result> { let mut entries = HashMap::new(); if !self.state_path.exists() { return Ok(entries); } let content = async_fs::read_to_string(&self.state_path).await?; for line in content.lines() { if line.trim().is_empty() { continue; } if let Ok(entry) = serde_json::from_str::(line) { entries.insert(entry.step_name.clone(), entry); } } Ok(entries) } /// Save all state entries to state.jsonl pub async fn save_entries(&self, entries: &HashMap) -> Result<()> { let mut lines = Vec::new(); for entry in entries.values() { let json = serde_json::to_string(entry)?; lines.push(json); } let content = lines.join("\n") + "\n"; async_fs::write(&self.state_path, content).await?; Ok(()) } /// Create or update a state entry with integrity tracking pub async fn update_entry( &self, step_name: String, content_reference: ContentReference, data_stage: DataStage, dependencies: Vec, ttl_override: Option, ) -> Result { // Compute hash let hash = hash_content_reference(&content_reference)?; // Determine storage let storage = determine_hash_storage(&hash, &self.base_dir); // Store hash if external store_hash(&hash, &storage).await?; // Create entry let entry = StateEntry { step_name: step_name.clone(), completed: true, completed_at: Some(Utc::now()), content_reference: Some(content_reference), content_hash: Some(storage), data_stage: Some(data_stage), ttl_override, last_validated_at: Some(Utc::now()), validation_status: ValidationStatus::Valid, dependencies, }; // Load existing entries let mut entries = self.load_entries().await?; // Update entry entries.insert(step_name, entry.clone()); // Save self.save_entries(&entries).await?; Ok(entry) } /// Check if a step is valid and completed pub async fn is_step_valid(&self, step_name: &str) -> Result { let entries = self.load_entries().await?; if let Some(entry) = entries.get(step_name) { let status = validate_entry(entry).await?; Ok(matches!(status, ValidationStatus::Valid)) } else { Ok(false) } } /// Invalidate a specific entry pub async fn invalidate_entry(&self, step_name: &str, reason: String) -> Result<()> { let mut entries = self.load_entries().await?; if let Some(entry) = entries.get_mut(step_name) { entry.validation_status = ValidationStatus::Invalid { reason }; entry.last_validated_at = Some(Utc::now()); } self.save_entries(&entries).await?; Ok(()) } /// Run full validation on all entries pub async fn validate_all(&self) -> Result { let mut entries = self.load_entries().await?; let report = validate_all_entries(&mut entries).await?; self.save_entries(&entries).await?; Ok(report) } } // ============================================================================ // HELPER FUNCTIONS // ============================================================================ /// Create a simple file reference pub fn file_reference>(path: P) -> ContentReference { ContentReference::File { path: path.as_ref().to_path_buf(), } } /// Create a directory reference pub fn directory_reference>( path: P, include_patterns: Option>, exclude_patterns: Option>, ) -> ContentReference { ContentReference::Directory { path: path.as_ref().to_path_buf(), include_patterns, exclude_patterns, } } /// Create a composite reference pub fn composite_reference(references: Vec) -> ContentReference { ContentReference::Composite { references } }