fix: harden enter/stop and WAL recovery

- Guard WAL recovery and stale .running cleanup behind a try-acquired store lock\n- Persist rollback ResetState via MetadataStore to recompute checksums\n- Track a killable host PID for namespace enter/stop and treat SIGTERM/SIGKILL as clean exit\n- Derive OCI status PID via runtime state output\n- Make sandbox chroot script quoting robust for exec/enter
This commit is contained in:
Marco Allegretti 2026-02-25 11:48:58 +01:00
parent 5ac58ba575
commit 961209ef0a
8 changed files with 266 additions and 95 deletions

View file

@ -4,7 +4,7 @@ use karapace_schema::manifest::{
parse_manifest_str, BaseSection, GuiSection, HardwareSection, ManifestV1, MountsSection,
RuntimeSection, SystemSection,
};
use std::io::{stdin, IsTerminal};
use std::io::{stderr, stdin, IsTerminal};
use std::path::{Path, PathBuf};
use tempfile::NamedTempFile;
@ -87,12 +87,14 @@ fn print_result(name: &str, template: Option<&str>, json: bool) -> Result<(), St
pub fn run(name: &str, template: Option<&str>, force: bool, json: bool) -> Result<u8, String> {
let dest = Path::new(DEST_MANIFEST);
let is_tty = stdin().is_terminal();
ensure_can_write(dest, force, is_tty)?;
let is_tty = stdin().is_terminal() && stderr().is_terminal();
let mut manifest = if let Some(tpl) = template {
load_template(tpl)?
let m = load_template(tpl)?;
ensure_can_write(dest, force, is_tty)?;
m
} else {
ensure_can_write(dest, force, is_tty)?;
if !is_tty {
return Err("no --template provided and stdin is not a TTY".to_owned());
}

View file

@ -1,12 +1,8 @@
use super::{resolve_env_id_pretty, EXIT_SUCCESS};
use karapace_core::{Engine, StoreLock};
use karapace_store::StoreLayout;
use karapace_core::Engine;
use std::path::Path;
pub fn run(engine: &Engine, store_path: &Path, env_id: &str) -> Result<u8, String> {
let layout = StoreLayout::new(store_path);
let _lock = StoreLock::acquire(&layout.lock_file()).map_err(|e| format!("store lock: {e}"))?;
pub fn run(engine: &Engine, _store_path: &Path, env_id: &str) -> Result<u8, String> {
let resolved = resolve_env_id_pretty(engine, env_id)?;
engine.stop(&resolved).map_err(|e| e.to_string())?;
println!("stopped environment {env_id}");

View file

@ -56,27 +56,39 @@ impl Engine {
let layer_store = LayerStore::new(layout.clone());
let wal = WriteAheadLog::new(&layout);
// Recover any incomplete operations from a previous crash
if let Err(e) = wal.recover() {
warn!("WAL recovery failed: {e}");
}
// Recovery mutates the store and must not run concurrently with a live
// operation holding the store lock (e.g. an interactive `enter`).
match StoreLock::try_acquire(&layout.lock_file()) {
Ok(Some(_lock)) => {
// Recover any incomplete operations from a previous crash
if let Err(e) = wal.recover() {
warn!("WAL recovery failed: {e}");
}
// Clean up stale .running markers left by a crash during enter/exec.
// After WAL recovery, any env still marked Running was mid-operation.
let env_base = layout.env_dir();
if env_base.exists() {
if let Ok(entries) = std::fs::read_dir(&env_base) {
for entry in entries.flatten() {
let running_marker = entry.path().join(".running");
if running_marker.exists() {
debug!(
"removing stale .running marker: {}",
running_marker.display()
);
let _ = std::fs::remove_file(&running_marker);
// Clean up stale .running markers left by a crash during enter/exec.
// After WAL recovery, any env still marked Running was mid-operation.
let env_base = layout.env_dir();
if env_base.exists() {
if let Ok(entries) = std::fs::read_dir(&env_base) {
for entry in entries.flatten() {
let running_marker = entry.path().join(".running");
if running_marker.exists() {
debug!(
"removing stale .running marker: {}",
running_marker.display()
);
let _ = std::fs::remove_file(&running_marker);
}
}
}
}
}
Ok(None) => {
debug!("store lock held; skipping WAL recovery and stale marker cleanup");
}
Err(e) => {
warn!("store lock check failed; skipping WAL recovery: {e}");
}
}
let store_root_str = root.to_string_lossy().into_owned();

View file

@ -1,4 +1,5 @@
use crate::{BlobKind, RemoteBackend, RemoteConfig, RemoteError};
use std::io::Read;
/// HTTP-based remote store backend.
///
@ -56,9 +57,10 @@ impl HttpBackend {
req = req.header("Authorization", &format!("Bearer {token}"));
}
let resp = req.call().map_err(|e| RemoteError::Http(e.to_string()))?;
let body = resp
.into_body()
.read_to_vec()
let mut reader = resp.into_body().into_reader();
let mut body = Vec::new();
reader
.read_to_end(&mut body)
.map_err(|e| RemoteError::Http(e.to_string()))?;
Ok(body)
}
@ -195,7 +197,7 @@ mod tests {
let mut body = vec![0u8; content_length];
if content_length > 0 {
let _ = std::io::Read::read_exact(&mut reader, &mut body);
let _ = reader.read_exact(&mut body);
}
let mut data = store.lock().unwrap();

View file

@ -5,12 +5,14 @@ use crate::image::{
parse_version_output, query_versions_command, resolve_image, ImageCache,
};
use crate::sandbox::{
enter_interactive, exec_in_container, install_packages_in_container, mount_overlay,
setup_container_rootfs, unmount_overlay, SandboxConfig,
exec_in_container, install_packages_in_container, mount_overlay, setup_container_rootfs,
spawn_enter_interactive, unmount_overlay, SandboxConfig,
};
use crate::terminal;
use crate::RuntimeError;
use karapace_schema::{ResolutionResult, ResolvedPackage};
use libc::{SIGKILL, SIGTERM};
use std::os::unix::process::ExitStatusExt;
use std::path::{Path, PathBuf};
pub struct NamespaceBackend {
@ -237,13 +239,8 @@ impl RuntimeBackend for NamespaceBackend {
// Mount overlay
mount_overlay(&sandbox)?;
// Set up rootfs
setup_container_rootfs(&sandbox)?;
// Mark as running
std::fs::write(env_dir.join(".running"), format!("{}", std::process::id()))?;
// Emit terminal markers
terminal::emit_container_push(&spec.env_id, &sandbox.hostname);
terminal::print_container_banner(
@ -252,8 +249,38 @@ impl RuntimeBackend for NamespaceBackend {
&sandbox.hostname,
);
// Enter the container interactively
let exit_code = enter_interactive(&sandbox);
// Spawn the sandbox so we can record the host PID for `stop`.
let mut child = match spawn_enter_interactive(&sandbox) {
Ok(c) => c,
Err(e) => {
terminal::emit_container_pop();
terminal::print_container_exit(&spec.env_id);
let _ = unmount_overlay(&sandbox);
return Err(e);
}
};
if let Err(e) = std::fs::write(env_dir.join(".running"), format!("{}", child.id())) {
let _ = child.kill();
terminal::emit_container_pop();
terminal::print_container_exit(&spec.env_id);
let _ = unmount_overlay(&sandbox);
return Err(e.into());
}
// Wait for the interactive session to complete.
let exit_code = match child.wait() {
Ok(status) => {
let code = status.code().unwrap_or_else(|| match status.signal() {
Some(sig) if sig == SIGTERM || sig == SIGKILL => 0,
_ => 1,
});
Ok(code)
}
Err(e) => Err(RuntimeError::ExecFailed(format!(
"failed to wait for sandbox: {e}"
))),
};
// Cleanup
terminal::emit_container_pop();

View file

@ -11,7 +11,7 @@ use crate::sandbox::{
use crate::terminal;
use crate::RuntimeError;
use karapace_schema::{ResolutionResult, ResolvedPackage};
use std::path::{Path, PathBuf};
use std::path::PathBuf;
use std::process::Command;
pub struct OciBackend {
@ -451,35 +451,45 @@ impl RuntimeBackend for OciBackend {
}
fn status(&self, env_id: &str) -> Result<RuntimeStatus, RuntimeError> {
let env_dir = self.env_dir(env_id);
let running_file = env_dir.join(".running");
let runtime = Self::find_runtime().ok_or_else(|| {
RuntimeError::BackendUnavailable("no OCI runtime found (crun/runc/youki)".to_owned())
})?;
if running_file.exists() {
let pid_str = std::fs::read_to_string(&running_file).unwrap_or_default();
let pid = pid_str.trim().parse::<u32>().ok();
if pid.is_none() && !pid_str.trim().is_empty() {
tracing::warn!(
"corrupt .running file for {}: could not parse PID from '{}'",
&env_id[..12.min(env_id.len())],
pid_str.trim()
);
}
if let Some(p) = pid {
if Path::new(&format!("/proc/{p}")).exists() {
return Ok(RuntimeStatus {
env_id: env_id.to_owned(),
running: true,
pid: Some(p),
});
}
let _ = std::fs::remove_file(&running_file);
let container_id = format!("karapace-{}", &env_id[..12.min(env_id.len())]);
let output = Command::new(&runtime)
.args(["state", &container_id])
.output()?;
if !output.status.success() {
let stderr = String::from_utf8_lossy(&output.stderr);
let msg = stderr.to_lowercase();
if msg.contains("does not exist") || msg.contains("not found") {
return Ok(RuntimeStatus {
env_id: env_id.to_owned(),
running: false,
pid: None,
});
}
return Err(RuntimeError::ExecFailed(format!(
"{runtime} state failed: {}",
stderr.trim()
)));
}
let state: serde_json::Value = serde_json::from_slice(&output.stdout).map_err(|e| {
RuntimeError::ExecFailed(format!("failed to parse {runtime} state output: {e}"))
})?;
let pid = state
.get("pid")
.and_then(serde_json::Value::as_u64)
.and_then(|p| u32::try_from(p).ok())
.filter(|p| *p != 0);
Ok(RuntimeStatus {
env_id: env_id.to_owned(),
running: false,
pid: None,
running: pid.is_some(),
pid,
})
}
}

View file

@ -248,7 +248,14 @@ fn ensure_user_in_container(config: &SandboxConfig, merged: &Path) -> Result<(),
fn build_unshare_command(config: &SandboxConfig) -> Command {
let mut cmd = Command::new("unshare");
cmd.args(["--user", "--map-root-user", "--mount", "--pid", "--fork"]);
cmd.args([
"--user",
"--map-root-user",
"--mount",
"--pid",
"--fork",
"--kill-child=SIGTERM",
]);
if config.isolate_network {
cmd.arg("--net");
@ -343,7 +350,7 @@ fn build_setup_script(config: &SandboxConfig) -> String {
}
// Chroot and exec
let _ = write!(script, "exec chroot {qm} /bin/sh -c '");
let _ = writeln!(script, "exec chroot {qm} /bin/sh -s <<'__KARAPACE_EOF__'");
script
}
@ -409,7 +416,10 @@ pub fn enter_interactive(config: &SandboxConfig) -> Result<i32, RuntimeError> {
"/bin/sh"
};
let _ = write!(setup, "{env_exports}cd ~; exec {shell} -l'");
let _ = write!(
setup,
"{env_exports}cd ~; exec {shell} -l </dev/tty >/dev/tty 2>/dev/tty\n__KARAPACE_EOF__\n"
);
let mut cmd = build_unshare_command(config);
cmd.arg("/bin/sh").arg("-c").arg(&setup);
@ -426,6 +436,86 @@ pub fn enter_interactive(config: &SandboxConfig) -> Result<i32, RuntimeError> {
Ok(status.code().unwrap_or(1))
}
pub fn spawn_enter_interactive(
config: &SandboxConfig,
) -> Result<std::process::Child, RuntimeError> {
let merged = &config.overlay_merged;
let mut setup = build_setup_script(config);
// Build environment variable exports (all values shell-quoted, keys validated)
let mut env_exports = String::new();
for (key, val) in &config.env_vars {
if !key.bytes().all(|b| b.is_ascii_alphanumeric() || b == b'_') {
continue; // Skip keys with unsafe characters
}
let _ = write!(env_exports, "export {}={}; ", key, shell_quote(val));
}
// Set standard env vars (all values shell-quoted)
let _ = write!(
env_exports,
"export HOME={}; ",
shell_quote_path(&config.home_dir)
);
let _ = write!(
env_exports,
"export USER={}; ",
shell_quote(&config.username)
);
let _ = write!(
env_exports,
"export HOSTNAME={}; ",
shell_quote(&config.hostname)
);
if let Ok(xdg) = std::env::var("XDG_RUNTIME_DIR") {
let _ = write!(
env_exports,
"export XDG_RUNTIME_DIR={}; ",
shell_quote(&xdg)
);
}
if let Ok(display) = std::env::var("DISPLAY") {
let _ = write!(env_exports, "export DISPLAY={}; ", shell_quote(&display));
}
if let Ok(wayland) = std::env::var("WAYLAND_DISPLAY") {
let _ = write!(
env_exports,
"export WAYLAND_DISPLAY={}; ",
shell_quote(&wayland)
);
}
env_exports.push_str("export TERM=${TERM:-xterm-256color}; ");
let _ = write!(
env_exports,
"export KARAPACE_ENV=1; export KARAPACE_HOSTNAME={}; ",
shell_quote(&config.hostname)
);
// Determine shell
let shell = if merged.join("bin/bash").exists() || merged.join("usr/bin/bash").exists() {
"/bin/bash"
} else {
"/bin/sh"
};
let _ = write!(
setup,
"{env_exports}cd ~; exec {shell} -l </dev/tty >/dev/tty 2>/dev/tty\n__KARAPACE_EOF__\n"
);
let mut cmd = build_unshare_command(config);
cmd.arg("/bin/sh").arg("-c").arg(&setup);
// Pass through stdin/stdout/stderr for interactive use
cmd.stdin(std::process::Stdio::inherit());
cmd.stdout(std::process::Stdio::inherit());
cmd.stderr(std::process::Stdio::inherit());
cmd.spawn()
.map_err(|e| RuntimeError::ExecFailed(format!("failed to spawn sandbox: {e}")))
}
pub fn exec_in_container(
config: &SandboxConfig,
command: &[String],
@ -453,7 +543,11 @@ pub fn exec_in_container(
env_exports.push_str("export KARAPACE_ENV=1; ");
let escaped_cmd: Vec<String> = command.iter().map(|a| shell_quote(a)).collect();
let _ = write!(setup, "{env_exports}{}'", escaped_cmd.join(" "));
let _ = write!(
setup,
"{env_exports}{}\n__KARAPACE_EOF__\n",
escaped_cmd.join(" ")
);
let mut cmd = build_unshare_command(config);
cmd.arg("/bin/sh").arg("-c").arg(&setup);

View file

@ -1,4 +1,5 @@
use crate::layout::StoreLayout;
use crate::metadata::{EnvMetadata, EnvState, MetadataStore};
use crate::StoreError;
use serde::{Deserialize, Serialize};
use std::fs;
@ -7,6 +8,17 @@ use std::path::PathBuf;
use tempfile::NamedTempFile;
use tracing::{debug, info, warn};
fn parse_env_state(s: &str) -> Option<EnvState> {
match s {
"Defined" | "defined" => Some(EnvState::Defined),
"Built" | "built" => Some(EnvState::Built),
"Running" | "running" => Some(EnvState::Running),
"Frozen" | "frozen" => Some(EnvState::Frozen),
"Archived" | "archived" => Some(EnvState::Archived),
_ => None,
}
}
/// A single rollback step that can undo part of an operation.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum RollbackStep {
@ -196,34 +208,50 @@ impl WriteAheadLog {
env_id,
target_state,
} => {
// Resolve metadata dir from wal_dir (wal_dir = root/store/wal)
if let Some(store_dir) = self.wal_dir.parent() {
let metadata_dir = store_dir.join("metadata");
let meta_path = metadata_dir.join(env_id);
if meta_path.exists() {
match fs::read_to_string(&meta_path) {
Ok(content) => {
if let Ok(mut meta) =
serde_json::from_str::<serde_json::Value>(&content)
{
meta["state"] =
serde_json::Value::String(target_state.clone());
if let Ok(updated) = serde_json::to_string_pretty(&meta) {
if let Err(e) = fs::write(&meta_path, updated) {
warn!("WAL rollback: failed to reset state for {env_id}: {e}");
} else {
debug!("WAL rollback: reset {env_id} state to {target_state}");
}
}
}
}
Err(e) => {
warn!(
"WAL rollback: failed to read metadata for {env_id}: {e}"
);
}
}
let Some(new_state) = parse_env_state(target_state) else {
warn!("WAL rollback: unknown target state '{target_state}' for {env_id}");
continue;
};
// wal_dir = <root>/store/wal
let Some(store_dir) = self.wal_dir.parent() else {
continue;
};
let Some(root_dir) = store_dir.parent() else {
continue;
};
let meta_path = store_dir.join("metadata").join(env_id);
if !meta_path.exists() {
continue;
}
let content = match fs::read_to_string(&meta_path) {
Ok(c) => c,
Err(e) => {
warn!("WAL rollback: failed to read metadata for {env_id}: {e}");
continue;
}
};
let mut meta: EnvMetadata = match serde_json::from_str(&content) {
Ok(m) => m,
Err(e) => {
warn!("WAL rollback: failed to parse metadata for {env_id}: {e}");
continue;
}
};
meta.state = new_state;
meta.updated_at = chrono::Utc::now().to_rfc3339();
meta.checksum = None;
let layout = StoreLayout::new(root_dir);
let meta_store = MetadataStore::new(layout);
if let Err(e) = meta_store.put(&meta) {
warn!("WAL rollback: failed to persist metadata for {env_id}: {e}");
} else {
debug!("WAL rollback: reset {env_id} state to {target_state}");
}
}
}