// src/util/integrity.rs
//! Content integrity and state lifecycle management module
//!
//! Features:
//! - File and directory hashing (SHA-256)
//! - Hash validation against content references
//! - State invalidation based on time or validation failures
//! - 3-stage data lifecycle: cache → data → storage
//! - Inline vs. external hash storage based on size
//! - Cascade invalidation when dependencies fail validation
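//!
//! # Example
//!
//! A minimal usage sketch (step names and paths are illustrative):
//!
//! ```ignore
//! let mgr = StateManager::new("state.jsonl", ".");
//! mgr.update_entry(
//!     "fetch_index".to_string(),
//!     file_reference("data/index.html"),
//!     DataStage::Cache,
//!     vec![],  // no dependencies
//!     None,    // use the stage's default TTL
//! ).await?;
//! assert!(mgr.is_step_valid("fetch_index").await?);
//! ```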
use anyhow::{Context, Result};
use chrono::{DateTime, Duration, Utc};
use serde::{Deserialize, Serialize};
use sha2::{Digest, Sha256};
use std::collections::{HashMap, HashSet};
use std::fs;
use std::io::{BufReader, Read};
use std::path::{Path, PathBuf};
use tokio::fs as async_fs;
use tokio::io::AsyncWriteExt;
// ============================================================================
// CONSTANTS & CONFIGURATION
// ============================================================================
/// Maximum hash size (in bytes) to store inline in state.jsonl.
/// Hashes larger than this are stored in separate files. Note that a single
/// SHA-256 hex digest is 64 bytes, so every hash produced by this module
/// currently fits inline; the external path is future-proofing.
const INLINE_HASH_THRESHOLD: usize = 1024;
/// Directory for storing external hash files
const HASH_STORAGE_DIR: &str = ".integrity_hashes";
/// File extension for external hash files
const HASH_FILE_EXT: &str = ".hash";
// ============================================================================
// DATA STRUCTURES
// ============================================================================
/// Represents a content reference that can be hashed
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
#[serde(tag = "type", rename_all = "lowercase")]
pub enum ContentReference {
/// Single file reference
File { path: PathBuf },
/// Directory reference (includes all files recursively)
Directory {
path: PathBuf,
/// Optional: specific files/patterns to include
include_patterns: Option<Vec<String>>,
/// Optional: files/patterns to exclude
exclude_patterns: Option<Vec<String>>,
},
/// Multiple files/directories combined
Composite {
references: Vec<ContentReference>,
},
}
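// With the internally tagged representation above, a file reference
// serializes as, e.g., {"type":"file","path":"data/index.html"}
// (path illustrative).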
/// Storage location for hash data
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
#[serde(tag = "storage", rename_all = "lowercase")]
pub enum HashStorage {
/// Hash stored directly in state.jsonl
Inline { hash: String },
/// Hash stored in external file
External { hash_file: PathBuf },
}
/// Data lifecycle stage
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Hash)]
#[serde(rename_all = "lowercase")]
pub enum DataStage {
/// Temporary/staged data (fast-changing, short-lived)
Cache,
/// Processed data (intermediate results, medium-lived)
Data,
/// Final storage (long-term, stable data)
Storage,
}
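// The lowercase rename means stages serialize as "cache", "data", or "storage".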
impl DataStage {
/// Get default TTL (time-to-live) for this stage
pub fn default_ttl(&self) -> Duration {
match self {
DataStage::Cache => Duration::hours(24), // 1 day
DataStage::Data => Duration::days(7), // 1 week
DataStage::Storage => Duration::days(365), // 1 year
}
}
/// Get suggested revalidation interval for this stage
pub fn revalidation_interval(&self) -> Duration {
match self {
DataStage::Cache => Duration::hours(6), // Every 6 hours
DataStage::Data => Duration::days(1), // Daily
DataStage::Storage => Duration::days(30), // Monthly
}
}
}
/// Enhanced state entry with content integrity tracking
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StateEntry {
/// Step/function name
pub step_name: String,
/// Whether this step is completed
pub completed: bool,
/// Completion timestamp
pub completed_at: Option<DateTime<Utc>>,
/// Content reference for validation
pub content_reference: Option<ContentReference>,
/// Hash of the content
pub content_hash: Option<HashStorage>,
/// Data lifecycle stage
pub data_stage: Option<DataStage>,
    /// Custom TTL override in whole seconds (if None, uses stage default).
    /// Stored as seconds because chrono::Duration has no serde support.
    pub ttl_override_secs: Option<i64>,
/// Last validation timestamp
pub last_validated_at: Option<DateTime<Utc>>,
/// Validation status
pub validation_status: ValidationStatus,
/// Dependencies (other steps that must be valid for this to remain valid)
pub dependencies: Vec<String>,
}
/// Validation status of a state entry
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
#[serde(rename_all = "lowercase")]
pub enum ValidationStatus {
/// Not yet validated
Unknown,
/// Validated and content matches hash
Valid,
/// Validation failed (hash mismatch or content missing)
Invalid { reason: String },
/// Expired (beyond TTL)
Expired,
/// Invalidated due to dependency failure
DependencyFailed { failed_dependency: String },
}
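// Externally tagged serialization: unit variants serialize as plain strings
// ("unknown", "valid", "expired"), while struct variants nest their fields,
// e.g. {"invalid":{"reason":"Hash mismatch"}}.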
// ============================================================================
// HASH COMPUTATION
// ============================================================================
/// Hash a single file using SHA-256
pub fn hash_file<P: AsRef<Path>>(path: P) -> Result<String> {
let path = path.as_ref();
let file = fs::File::open(path)
.with_context(|| format!("Failed to open file: {}", path.display()))?;
let mut reader = BufReader::new(file);
let mut hasher = Sha256::new();
let mut buffer = [0u8; 8192];
loop {
let bytes_read = reader.read(&mut buffer)?;
if bytes_read == 0 {
break;
}
hasher.update(&buffer[..bytes_read]);
}
Ok(format!("{:x}", hasher.finalize()))
}
/// Hash a directory recursively
/// Returns a combined hash of all files in sorted order
pub fn hash_directory<P: AsRef<Path>>(
path: P,
include_patterns: Option<&[String]>,
exclude_patterns: Option<&[String]>,
) -> Result<String> {
let path = path.as_ref();
if !path.is_dir() {
anyhow::bail!("Path is not a directory: {}", path.display());
}
// Collect all files recursively
let mut files = Vec::new();
collect_files_recursive(path, &mut files, include_patterns, exclude_patterns)?;
// Sort for deterministic hashing
files.sort();
    if files.is_empty() {
        // Deterministic digest for an empty directory: SHA-256 of empty input
        return Ok(format!("{:x}", Sha256::new().finalize()));
    }
// Hash all files and combine
let mut combined_hasher = Sha256::new();
for file_path in files {
// Include relative path in hash for structure awareness
let rel_path = file_path.strip_prefix(path)
.unwrap_or(&file_path)
.to_string_lossy();
combined_hasher.update(rel_path.as_bytes());
// Hash file content
let file_hash = hash_file(&file_path)?;
combined_hasher.update(file_hash.as_bytes());
}
Ok(format!("{:x}", combined_hasher.finalize()))
}
/// Collect files recursively with pattern filtering
fn collect_files_recursive(
dir: &Path,
files: &mut Vec<PathBuf>,
include_patterns: Option<&[String]>,
exclude_patterns: Option<&[String]>,
) -> Result<()> {
if !dir.is_dir() {
return Ok(());
}
for entry in fs::read_dir(dir)? {
let entry = entry?;
let path = entry.path();
        // Skip hidden files and directories (this also keeps the
        // .integrity_hashes store out of directory hashes)
if let Some(name) = path.file_name() {
if name.to_string_lossy().starts_with('.') {
continue;
}
}
if path.is_dir() {
collect_files_recursive(&path, files, include_patterns, exclude_patterns)?;
} else if path.is_file() {
// Apply pattern filters
if should_include_file(&path, include_patterns, exclude_patterns) {
files.push(path);
}
}
}
Ok(())
}
/// Check if a file should be included based on patterns
fn should_include_file(
path: &Path,
include_patterns: Option<&[String]>,
exclude_patterns: Option<&[String]>,
) -> bool {
let path_str = path.to_string_lossy();
// Check exclude patterns first
if let Some(excludes) = exclude_patterns {
for pattern in excludes {
if path_str.contains(pattern) || matches_glob(path, pattern) {
return false;
}
}
}
// Check include patterns
if let Some(includes) = include_patterns {
for pattern in includes {
if path_str.contains(pattern) || matches_glob(path, pattern) {
return true;
}
}
return false; // If includes specified but no match
}
true // Include by default
}
/// Simple glob pattern matching (supports `*` and `?`).
/// Other regex metacharacters in the pattern are not escaped.
fn matches_glob(path: &Path, pattern: &str) -> bool {
let path_str = path.to_string_lossy();
// Convert glob to regex
let regex_pattern = pattern
.replace(".", "\\.")
.replace("*", ".*")
.replace("?", ".");
if let Ok(re) = regex::Regex::new(&format!("^{}$", regex_pattern)) {
re.is_match(&path_str)
} else {
false
}
}
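// e.g. matches_glob(Path::new("src/main.rs"), "*.rs") is true (patterns are
// anchored against the full path string), while the pattern "main.rs" is
// false here because "^main\.rs$" does not match "src/main.rs".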
/// Hash a content reference
pub fn hash_content_reference(reference: &ContentReference) -> Result<String> {
match reference {
ContentReference::File { path } => {
hash_file(path)
}
ContentReference::Directory { path, include_patterns, exclude_patterns } => {
hash_directory(
path,
include_patterns.as_deref(),
exclude_patterns.as_deref(),
)
}
ContentReference::Composite { references } => {
let mut combined_hasher = Sha256::new();
for reference in references {
let hash = hash_content_reference(reference)?;
combined_hasher.update(hash.as_bytes());
}
Ok(format!("{:x}", combined_hasher.finalize()))
}
}
}
// ============================================================================
// HASH STORAGE MANAGEMENT
// ============================================================================
/// Determine storage method based on hash size
pub fn determine_hash_storage(hash: &str, base_dir: &Path) -> HashStorage {
if hash.len() <= INLINE_HASH_THRESHOLD {
HashStorage::Inline {
hash: hash.to_string(),
}
} else {
let hash_id = Sha256::digest(hash.as_bytes());
let hash_filename = format!("{:x}{}", hash_id, HASH_FILE_EXT);
HashStorage::External {
hash_file: base_dir
.join(HASH_STORAGE_DIR)
.join(hash_filename),
}
}
}
/// Store hash externally if needed
pub async fn store_hash(
hash: &str,
storage: &HashStorage,
) -> Result<()> {
match storage {
HashStorage::Inline { .. } => {
// Nothing to do, hash is inline
Ok(())
}
HashStorage::External { hash_file } => {
// Create directory if needed
if let Some(parent) = hash_file.parent() {
async_fs::create_dir_all(parent).await?;
}
// Write hash to file
let mut file = async_fs::File::create(hash_file).await?;
file.write_all(hash.as_bytes()).await?;
file.flush().await?;
Ok(())
}
}
}
/// Retrieve hash from storage
pub async fn retrieve_hash(storage: &HashStorage) -> Result<String> {
match storage {
HashStorage::Inline { hash } => {
Ok(hash.clone())
}
HashStorage::External { hash_file } => {
async_fs::read_to_string(hash_file)
.await
.with_context(|| format!("Failed to read hash file: {}", hash_file.display()))
}
}
}
// ============================================================================
// VALIDATION
// ============================================================================
/// Validate a state entry's content against its hash
pub async fn validate_entry(entry: &StateEntry) -> Result<ValidationStatus> {
// Check if completed
if !entry.completed {
return Ok(ValidationStatus::Unknown);
}
    // Check TTL expiration (resolution order: explicit override, then the
    // stage default, then a 7-day fallback)
    if let Some(completed_at) = entry.completed_at {
        let ttl = entry.ttl_override_secs
            .map(Duration::seconds)
            .or_else(|| entry.data_stage.map(|s| s.default_ttl()))
            .unwrap_or_else(|| Duration::days(7));
let expiration = completed_at + ttl;
if Utc::now() > expiration {
return Ok(ValidationStatus::Expired);
}
}
// Validate content hash if available
if let (Some(reference), Some(storage)) = (&entry.content_reference, &entry.content_hash) {
        // Compute current hash (synchronous; may briefly block the async
        // executor on large directory trees)
let current_hash = match hash_content_reference(reference) {
Ok(hash) => hash,
Err(e) => {
return Ok(ValidationStatus::Invalid {
reason: format!("Failed to compute hash: {}", e),
});
}
};
// Retrieve stored hash
let stored_hash = match retrieve_hash(storage).await {
Ok(hash) => hash,
Err(e) => {
return Ok(ValidationStatus::Invalid {
reason: format!("Failed to retrieve stored hash: {}", e),
});
}
};
// Compare hashes
if current_hash != stored_hash {
return Ok(ValidationStatus::Invalid {
reason: "Hash mismatch".to_string(),
});
}
}
Ok(ValidationStatus::Valid)
}
/// Validate all state entries and handle cascade invalidation
pub async fn validate_all_entries(
entries: &mut HashMap<String, StateEntry>,
) -> Result<ValidationReport> {
let mut report = ValidationReport::default();
// First pass: validate each entry independently
for (name, entry) in entries.iter_mut() {
let status = validate_entry(entry).await?;
entry.validation_status = status.clone();
entry.last_validated_at = Some(Utc::now());
match status {
ValidationStatus::Valid => report.valid_count += 1,
ValidationStatus::Invalid { .. } => {
report.invalid_count += 1;
report.invalid_entries.push(name.clone());
}
ValidationStatus::Expired => {
report.expired_count += 1;
report.expired_entries.push(name.clone());
}
ValidationStatus::Unknown => report.unknown_count += 1,
ValidationStatus::DependencyFailed { .. } => {}
}
}
    // Second pass: cascade invalidation based on dependencies.
    // Note: only Invalid entries seed the cascade; Expired entries do not
    // currently propagate to their dependents.
    let mut invalidated = HashSet::new();
for name in &report.invalid_entries {
invalidated.insert(name.clone());
}
loop {
let mut newly_invalidated = Vec::new();
for (name, entry) in entries.iter() {
if invalidated.contains(name) {
continue;
}
// Check if any dependency is invalidated
for dep in &entry.dependencies {
if invalidated.contains(dep) {
newly_invalidated.push((name.clone(), dep.clone()));
break;
}
}
}
if newly_invalidated.is_empty() {
break;
}
for (name, failed_dep) in newly_invalidated {
invalidated.insert(name.clone());
report.cascaded_invalidations.push(name.clone());
if let Some(entry) = entries.get_mut(&name) {
entry.validation_status = ValidationStatus::DependencyFailed {
failed_dependency: failed_dep,
};
}
}
}
Ok(report)
}
/// Validation report
#[derive(Debug, Default)]
pub struct ValidationReport {
pub valid_count: usize,
pub invalid_count: usize,
pub expired_count: usize,
pub unknown_count: usize,
pub invalid_entries: Vec<String>,
pub expired_entries: Vec<String>,
pub cascaded_invalidations: Vec<String>,
}
impl ValidationReport {
pub fn print_summary(&self) {
println!("=== Validation Report ===");
println!("Valid: {}", self.valid_count);
println!("Invalid: {}", self.invalid_count);
println!("Expired: {}", self.expired_count);
println!("Unknown: {}", self.unknown_count);
if !self.invalid_entries.is_empty() {
println!("\nInvalid entries:");
for entry in &self.invalid_entries {
println!(" - {}", entry);
}
}
if !self.expired_entries.is_empty() {
println!("\nExpired entries:");
for entry in &self.expired_entries {
println!(" - {}", entry);
}
}
if !self.cascaded_invalidations.is_empty() {
println!("\nCascaded invalidations:");
for entry in &self.cascaded_invalidations {
println!(" - {}", entry);
}
}
}
}
// ============================================================================
// STATE MANAGEMENT
// ============================================================================
/// State manager for reading/writing state entries
pub struct StateManager {
state_path: PathBuf,
base_dir: PathBuf,
}
impl StateManager {
    pub fn new<P: AsRef<Path>, B: AsRef<Path>>(state_path: P, base_dir: B) -> Self {
Self {
state_path: state_path.as_ref().to_path_buf(),
base_dir: base_dir.as_ref().to_path_buf(),
}
}
/// Load all state entries from state.jsonl
pub async fn load_entries(&self) -> Result<HashMap<String, StateEntry>> {
let mut entries = HashMap::new();
if !self.state_path.exists() {
return Ok(entries);
}
let content = async_fs::read_to_string(&self.state_path).await?;
for line in content.lines() {
if line.trim().is_empty() {
continue;
}
            // Skip malformed lines rather than failing the whole load
            if let Ok(entry) = serde_json::from_str::<StateEntry>(line) {
entries.insert(entry.step_name.clone(), entry);
}
}
Ok(entries)
}
/// Save all state entries to state.jsonl
pub async fn save_entries(&self, entries: &HashMap<String, StateEntry>) -> Result<()> {
let mut lines = Vec::new();
for entry in entries.values() {
let json = serde_json::to_string(entry)?;
lines.push(json);
}
let content = lines.join("\n") + "\n";
async_fs::write(&self.state_path, content).await?;
Ok(())
}
/// Create or update a state entry with integrity tracking
pub async fn update_entry(
&self,
step_name: String,
content_reference: ContentReference,
data_stage: DataStage,
dependencies: Vec<String>,
ttl_override: Option<Duration>,
) -> Result<StateEntry> {
// Compute hash
let hash = hash_content_reference(&content_reference)?;
// Determine storage
let storage = determine_hash_storage(&hash, &self.base_dir);
// Store hash if external
store_hash(&hash, &storage).await?;
// Create entry
let entry = StateEntry {
step_name: step_name.clone(),
completed: true,
completed_at: Some(Utc::now()),
content_reference: Some(content_reference),
content_hash: Some(storage),
data_stage: Some(data_stage),
            ttl_override_secs: ttl_override.map(|d| d.num_seconds()),
last_validated_at: Some(Utc::now()),
validation_status: ValidationStatus::Valid,
dependencies,
};
// Load existing entries
let mut entries = self.load_entries().await?;
// Update entry
entries.insert(step_name, entry.clone());
// Save
self.save_entries(&entries).await?;
Ok(entry)
}
/// Check if a step is valid and completed
pub async fn is_step_valid(&self, step_name: &str) -> Result<bool> {
let entries = self.load_entries().await?;
if let Some(entry) = entries.get(step_name) {
let status = validate_entry(entry).await?;
Ok(matches!(status, ValidationStatus::Valid))
} else {
Ok(false)
}
}
/// Invalidate a specific entry
pub async fn invalidate_entry(&self, step_name: &str, reason: String) -> Result<()> {
let mut entries = self.load_entries().await?;
if let Some(entry) = entries.get_mut(step_name) {
entry.validation_status = ValidationStatus::Invalid { reason };
entry.last_validated_at = Some(Utc::now());
}
self.save_entries(&entries).await?;
Ok(())
}
/// Run full validation on all entries
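    ///
    /// Typical maintenance pass (sketch, assuming an existing `mgr: StateManager`):
    ///
    /// ```ignore
    /// let report = mgr.validate_all().await?;
    /// report.print_summary();
    /// ```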
pub async fn validate_all(&self) -> Result<ValidationReport> {
let mut entries = self.load_entries().await?;
let report = validate_all_entries(&mut entries).await?;
self.save_entries(&entries).await?;
Ok(report)
}
}
// ============================================================================
// HELPER FUNCTIONS
// ============================================================================
/// Create a simple file reference
pub fn file_reference<P: AsRef<Path>>(path: P) -> ContentReference {
ContentReference::File {
path: path.as_ref().to_path_buf(),
}
}
/// Create a directory reference
pub fn directory_reference<P: AsRef<Path>>(
path: P,
include_patterns: Option<Vec<String>>,
exclude_patterns: Option<Vec<String>>,
) -> ContentReference {
ContentReference::Directory {
path: path.as_ref().to_path_buf(),
include_patterns,
exclude_patterns,
}
}
/// Create a composite reference
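///
/// ```ignore
/// // Combined digest of config.toml plus everything under data/,
/// // excluding paths containing "tmp" (paths illustrative)
/// let combined = composite_reference(vec![
///     file_reference("config.toml"),
///     directory_reference("data", None, Some(vec!["tmp".to_string()])),
/// ]);
/// ```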
pub fn composite_reference(references: Vec<ContentReference>) -> ContentReference {
ContentReference::Composite { references }
}