mirror of
https://github.com/marcoallegretti/karapace.git
synced 2026-03-27 14:03:09 +00:00
392 lines
12 KiB
Rust
392 lines
12 KiB
Rust
|
|
#![allow(unsafe_code, clippy::undocumented_unsafe_blocks)]
|
||
|
|
//! Real crash recovery tests using fork + SIGKILL.
|
||
|
|
//!
|
||
|
|
//! These tests fork a child process that runs Karapace operations in a tight
|
||
|
|
//! loop, kill it mid-flight with SIGKILL, then verify the store is recoverable
|
||
|
|
//! and consistent in the parent.
|
||
|
|
//!
|
||
|
|
//! This validates that:
|
||
|
|
//! - WAL recovery cleans up incomplete operations
|
||
|
|
//! - No partially created environment directories survive
|
||
|
|
//! - No corrupted metadata remains
|
||
|
|
//! - Store integrity check passes after recovery
|
||
|
|
//! - Lock state is released (flock auto-released on process death)
|
||
|
|
|
||
|
|
use karapace_core::{Engine, StoreLock};
|
||
|
|
use karapace_store::StoreLayout;
|
||
|
|
use std::fs;
|
||
|
|
use std::path::Path;
|
||
|
|
|
||
|
|
/// Write `content` to a `karapace.toml` file inside `dir` and return the
/// path of the file that was written.
///
/// # Panics
/// Panics if the file cannot be written.
fn write_manifest(dir: &Path, content: &str) -> std::path::PathBuf {
    let manifest_path = dir.join("karapace.toml");
    fs::write(&manifest_path, content).unwrap();
    manifest_path
}
|
||
|
|
|
||
|
|
/// Render a manifest that uses the `mock` runtime backend and the `rolling`
/// base image, with `packages` listed under `[system]`.
///
/// Each package name is wrapped in double quotes and the names are joined
/// with `", "`; an empty slice yields `packages = []`.
fn mock_manifest(packages: &[&str]) -> String {
    let quoted: Vec<String> = packages.iter().map(|name| format!("\"{name}\"")).collect();
    let package_list = quoted.join(", ");
    format!(
        r#"
manifest_version = 1
[base]
image = "rolling"
[system]
packages = [{package_list}]
[runtime]
backend = "mock"
"#
    )
}
|
||
|
|
|
||
|
|
/// Verify that the store is in a consistent state after crash recovery.
|
||
|
|
fn verify_store_healthy(store_path: &Path) {
|
||
|
|
// Creating a new Engine triggers WAL recovery
|
||
|
|
let engine = Engine::new(store_path);
|
||
|
|
let layout = StoreLayout::new(store_path);
|
||
|
|
|
||
|
|
// WAL must be empty after recovery
|
||
|
|
let wal = karapace_store::WriteAheadLog::new(&layout);
|
||
|
|
let incomplete = wal.list_incomplete().unwrap();
|
||
|
|
assert!(
|
||
|
|
incomplete.is_empty(),
|
||
|
|
"WAL must be clean after recovery, found {} incomplete entries",
|
||
|
|
incomplete.len()
|
||
|
|
);
|
||
|
|
|
||
|
|
// Store integrity check must pass
|
||
|
|
let report = karapace_store::verify_store_integrity(&layout).unwrap();
|
||
|
|
assert!(
|
||
|
|
report.failed.is_empty(),
|
||
|
|
"store integrity check found {} failures: {:?}",
|
||
|
|
report.failed.len(),
|
||
|
|
report.failed
|
||
|
|
);
|
||
|
|
|
||
|
|
// All listed environments must be inspectable
|
||
|
|
let envs = engine.list().unwrap();
|
||
|
|
for env in &envs {
|
||
|
|
let meta = engine.inspect(&env.env_id).unwrap();
|
||
|
|
// No environment should be stuck in Running state after crash recovery
|
||
|
|
// (WAL ResetState rollback should have fixed it)
|
||
|
|
assert_ne!(
|
||
|
|
meta.state,
|
||
|
|
karapace_store::EnvState::Running,
|
||
|
|
"env {} stuck in Running after crash recovery",
|
||
|
|
env.env_id
|
||
|
|
);
|
||
|
|
}
|
||
|
|
|
||
|
|
// Lock must be acquirable (proves the dead child released it)
|
||
|
|
let lock = StoreLock::try_acquire(&layout.lock_file()).unwrap();
|
||
|
|
assert!(
|
||
|
|
lock.is_some(),
|
||
|
|
"store lock must be acquirable after child death"
|
||
|
|
);
|
||
|
|
|
||
|
|
// No orphaned env directories (dirs in env/ without matching metadata)
|
||
|
|
let env_base = layout.env_dir();
|
||
|
|
if env_base.exists() {
|
||
|
|
if let Ok(entries) = fs::read_dir(&env_base) {
|
||
|
|
let meta_store = karapace_store::MetadataStore::new(layout.clone());
|
||
|
|
for entry in entries.flatten() {
|
||
|
|
let dir_name = entry.file_name();
|
||
|
|
let dir_name_str = dir_name.to_string_lossy();
|
||
|
|
// Skip dotfiles
|
||
|
|
if dir_name_str.starts_with('.') {
|
||
|
|
continue;
|
||
|
|
}
|
||
|
|
// Every env dir should have matching metadata (or be cleaned up by WAL)
|
||
|
|
// We don't assert this is always true because the build might have
|
||
|
|
// completed successfully before the kill. But if metadata exists,
|
||
|
|
// it should be readable.
|
||
|
|
if meta_store.get(&dir_name_str).is_ok() {
|
||
|
|
// Metadata exists and is valid — good
|
||
|
|
} else {
|
||
|
|
// Orphaned env dir — WAL should have cleaned it, but if the
|
||
|
|
// build completed and was killed before metadata write,
|
||
|
|
// this is acceptable as long as it doesn't cause errors
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
// No stale .running markers
|
||
|
|
if env_base.exists() {
|
||
|
|
if let Ok(entries) = fs::read_dir(&env_base) {
|
||
|
|
for entry in entries.flatten() {
|
||
|
|
let running = entry.path().join(".running");
|
||
|
|
assert!(
|
||
|
|
!running.exists(),
|
||
|
|
"stale .running marker at {}",
|
||
|
|
running.display()
|
||
|
|
);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Fork a child that runs `child_fn` in a loop, kill it after `delay`,
|
||
|
|
/// then verify store health in the parent.
|
||
|
|
///
|
||
|
|
/// # Safety
|
||
|
|
/// Uses `libc::fork()` which is inherently unsafe. The child must not
|
||
|
|
/// return — it either loops forever or exits.
|
||
|
|
unsafe fn crash_test(store_path: &Path, delay: std::time::Duration, child_fn: fn(&Path)) {
|
||
|
|
let pid = libc::fork();
|
||
|
|
assert!(pid >= 0, "fork() failed");
|
||
|
|
|
||
|
|
if pid == 0 {
|
||
|
|
// === CHILD PROCESS ===
|
||
|
|
// Run the operation in a tight loop until killed
|
||
|
|
child_fn(store_path);
|
||
|
|
// If child_fn returns, exit immediately
|
||
|
|
libc::_exit(0);
|
||
|
|
}
|
||
|
|
|
||
|
|
// === PARENT PROCESS ===
|
||
|
|
std::thread::sleep(delay);
|
||
|
|
|
||
|
|
// Send SIGKILL — no chance for cleanup
|
||
|
|
let ret = libc::kill(pid, libc::SIGKILL);
|
||
|
|
assert_eq!(ret, 0, "kill() failed");
|
||
|
|
|
||
|
|
// Wait for child to die
|
||
|
|
let mut status: i32 = 0;
|
||
|
|
let waited = libc::waitpid(pid, &raw mut status, 0);
|
||
|
|
assert_eq!(waited, pid, "waitpid() failed");
|
||
|
|
|
||
|
|
// Now verify the store
|
||
|
|
verify_store_healthy(store_path);
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Child function: build environments in a tight loop
|
||
|
|
fn child_build_loop(store_path: &Path) {
|
||
|
|
let project = tempfile::tempdir().unwrap();
|
||
|
|
let engine = Engine::new(store_path);
|
||
|
|
|
||
|
|
for i in 0u64.. {
|
||
|
|
let pkgs: Vec<String> = (0..=(i % 4)).map(|j| format!("pkg{j}")).collect();
|
||
|
|
let pkg_refs: Vec<&str> = pkgs.iter().map(String::as_str).collect();
|
||
|
|
let manifest = write_manifest(project.path(), &mock_manifest(&pkg_refs));
|
||
|
|
let _ = engine.build(&manifest);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Child function: build + destroy in a tight loop
|
||
|
|
fn child_build_destroy_loop(store_path: &Path) {
|
||
|
|
let project = tempfile::tempdir().unwrap();
|
||
|
|
let engine = Engine::new(store_path);
|
||
|
|
|
||
|
|
for i in 0u64.. {
|
||
|
|
let pkgs: Vec<String> = (0..=(i % 2)).map(|j| format!("pkg{j}")).collect();
|
||
|
|
let pkg_refs: Vec<&str> = pkgs.iter().map(String::as_str).collect();
|
||
|
|
let manifest = write_manifest(project.path(), &mock_manifest(&pkg_refs));
|
||
|
|
if let Ok(r) = engine.build(&manifest) {
|
||
|
|
let _ = engine.destroy(&r.identity.env_id);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Child function: build + commit in a tight loop
|
||
|
|
fn child_build_commit_loop(store_path: &Path) {
|
||
|
|
let project = tempfile::tempdir().unwrap();
|
||
|
|
let engine = Engine::new(store_path);
|
||
|
|
|
||
|
|
let manifest = write_manifest(project.path(), &mock_manifest(&["git"]));
|
||
|
|
if let Ok(r) = engine.build(&manifest) {
|
||
|
|
let env_id = r.identity.env_id.to_string();
|
||
|
|
let upper = store_path.join("env").join(&env_id).join("upper");
|
||
|
|
let _ = fs::create_dir_all(&upper);
|
||
|
|
|
||
|
|
for i in 0u64.. {
|
||
|
|
let _ = fs::write(upper.join(format!("file_{i}.txt")), format!("data {i}"));
|
||
|
|
let _ = engine.commit(&env_id);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Child function: build + commit + restore in a tight loop
|
||
|
|
fn child_commit_restore_loop(store_path: &Path) {
|
||
|
|
let project = tempfile::tempdir().unwrap();
|
||
|
|
let engine = Engine::new(store_path);
|
||
|
|
|
||
|
|
let manifest = write_manifest(project.path(), &mock_manifest(&["git"]));
|
||
|
|
if let Ok(r) = engine.build(&manifest) {
|
||
|
|
let env_id = r.identity.env_id.to_string();
|
||
|
|
let upper = store_path.join("env").join(&env_id).join("upper");
|
||
|
|
let _ = fs::create_dir_all(&upper);
|
||
|
|
|
||
|
|
// Create initial snapshot
|
||
|
|
let _ = fs::write(upper.join("base.txt"), "base content");
|
||
|
|
if let Ok(snap_hash) = engine.commit(&env_id) {
|
||
|
|
for i in 0u64.. {
|
||
|
|
let _ = fs::write(upper.join(format!("file_{i}.txt")), format!("data {i}"));
|
||
|
|
let _ = engine.commit(&env_id);
|
||
|
|
let _ = engine.restore(&env_id, &snap_hash);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Child function: build + GC in a tight loop
|
||
|
|
fn child_gc_loop(store_path: &Path) {
|
||
|
|
let project = tempfile::tempdir().unwrap();
|
||
|
|
let engine = Engine::new(store_path);
|
||
|
|
|
||
|
|
// Build several environments
|
||
|
|
let mut env_ids = Vec::new();
|
||
|
|
for i in 0..5 {
|
||
|
|
let pkgs: Vec<String> = (0..=i).map(|j| format!("pkg{j}")).collect();
|
||
|
|
let pkg_refs: Vec<&str> = pkgs.iter().map(String::as_str).collect();
|
||
|
|
let manifest = write_manifest(project.path(), &mock_manifest(&pkg_refs));
|
||
|
|
if let Ok(r) = engine.build(&manifest) {
|
||
|
|
env_ids.push(r.identity.env_id.to_string());
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
let layout = StoreLayout::new(store_path);
|
||
|
|
for i in 0u64.. {
|
||
|
|
// Destroy one environment per cycle
|
||
|
|
let idx = (i as usize) % env_ids.len();
|
||
|
|
let _ = engine.destroy(&env_ids[idx]);
|
||
|
|
|
||
|
|
// Run GC
|
||
|
|
if let Ok(Some(lock)) = StoreLock::try_acquire(&layout.lock_file()) {
|
||
|
|
let _ = engine.gc(&lock, false);
|
||
|
|
}
|
||
|
|
|
||
|
|
// Rebuild
|
||
|
|
let pkgs: Vec<String> = (0..=idx).map(|j| format!("pkg{j}")).collect();
|
||
|
|
let pkg_refs: Vec<&str> = pkgs.iter().map(String::as_str).collect();
|
||
|
|
let manifest = write_manifest(project.path(), &mock_manifest(&pkg_refs));
|
||
|
|
if let Ok(r) = engine.build(&manifest) {
|
||
|
|
env_ids[idx] = r.identity.env_id.to_string();
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Child function: build + enter in a tight loop (tests ResetState WAL)
|
||
|
|
fn child_enter_loop(store_path: &Path) {
|
||
|
|
let project = tempfile::tempdir().unwrap();
|
||
|
|
let engine = Engine::new(store_path);
|
||
|
|
|
||
|
|
let manifest = write_manifest(project.path(), &mock_manifest(&["git"]));
|
||
|
|
if let Ok(r) = engine.build(&manifest) {
|
||
|
|
let env_id = r.identity.env_id.to_string();
|
||
|
|
for _ in 0u64.. {
|
||
|
|
let _ = engine.enter(&env_id);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
// --- Crash tests ---
|
||
|
|
// Each test runs with multiple delay values to increase the chance of hitting
|
||
|
|
// different points in the operation lifecycle.
|
||
|
|
|
||
|
|
/// Kill a child that is building environments, at several delays, and check
/// the store after each kill.
#[test]
fn crash_during_build() {
    let delays = [1, 5, 10, 20, 50].map(std::time::Duration::from_millis);
    for delay in delays {
        // Fresh store per delay so runs do not influence each other.
        let store = tempfile::tempdir().unwrap();
        // Pre-initialize the store
        let store_layout = StoreLayout::new(store.path());
        store_layout.initialize().unwrap();

        // `crash_test` is `unsafe` because it forks the test process.
        unsafe { crash_test(store.path(), delay, child_build_loop) };
    }
}
|
||
|
|
|
||
|
|
/// Kill a child that alternates build and destroy, at several delays, and
/// check the store after each kill.
#[test]
fn crash_during_build_destroy() {
    let delays = [1, 5, 10, 20, 50].map(std::time::Duration::from_millis);
    for delay in delays {
        // Fresh, pre-initialized store per delay.
        let store = tempfile::tempdir().unwrap();
        let store_layout = StoreLayout::new(store.path());
        store_layout.initialize().unwrap();

        // `crash_test` is `unsafe` because it forks the test process.
        unsafe { crash_test(store.path(), delay, child_build_destroy_loop) };
    }
}
|
||
|
|
|
||
|
|
/// Kill a child that repeatedly commits an environment, at several delays,
/// and check the store after each kill.
#[test]
fn crash_during_commit() {
    let delays = [1, 5, 10, 20, 50].map(std::time::Duration::from_millis);
    for delay in delays {
        // Fresh, pre-initialized store per delay.
        let store = tempfile::tempdir().unwrap();
        let store_layout = StoreLayout::new(store.path());
        store_layout.initialize().unwrap();

        // `crash_test` is `unsafe` because it forks the test process.
        unsafe { crash_test(store.path(), delay, child_build_commit_loop) };
    }
}
|
||
|
|
|
||
|
|
/// Kill a child that repeatedly commits and restores an environment, at
/// several delays, and check the store after each kill.
#[test]
fn crash_during_restore() {
    let delays = [1, 5, 10, 20, 50].map(std::time::Duration::from_millis);
    for delay in delays {
        // Fresh, pre-initialized store per delay.
        let store = tempfile::tempdir().unwrap();
        let store_layout = StoreLayout::new(store.path());
        store_layout.initialize().unwrap();

        // `crash_test` is `unsafe` because it forks the test process.
        unsafe { crash_test(store.path(), delay, child_commit_restore_loop) };
    }
}
|
||
|
|
|
||
|
|
/// Kill a child that cycles destroy, GC and rebuild, at several delays, and
/// check the store after each kill.
#[test]
fn crash_during_gc() {
    let delays = [5, 10, 20, 50, 100].map(std::time::Duration::from_millis);
    for delay in delays {
        // Fresh, pre-initialized store per delay.
        let store = tempfile::tempdir().unwrap();
        let store_layout = StoreLayout::new(store.path());
        store_layout.initialize().unwrap();

        // `crash_test` is `unsafe` because it forks the test process.
        unsafe { crash_test(store.path(), delay, child_gc_loop) };
    }
}
|
||
|
|
|
||
|
|
/// Kill a child that repeatedly enters an environment, at several delays,
/// and check the store after each kill.
#[test]
fn crash_during_enter() {
    let delays = [1, 5, 10, 20, 50].map(std::time::Duration::from_millis);
    for delay in delays {
        // Fresh, pre-initialized store per delay.
        let store = tempfile::tempdir().unwrap();
        let store_layout = StoreLayout::new(store.path());
        store_layout.initialize().unwrap();

        // `crash_test` is `unsafe` because it forks the test process.
        unsafe { crash_test(store.path(), delay, child_enter_loop) };
    }
}
|