mirror of
https://github.com/marcoallegretti/karapace.git
synced 2026-03-26 21:43:09 +00:00
fix: harden enter/stop and WAL recovery
- Guard WAL recovery and stale .running cleanup behind a try-acquired store lock\n- Persist rollback ResetState via MetadataStore to recompute checksums\n- Track a killable host PID for namespace enter/stop and treat SIGTERM/SIGKILL as clean exit\n- Derive OCI status PID via runtime state output\n- Make sandbox chroot script quoting robust for exec/enter
This commit is contained in:
parent
5ac58ba575
commit
961209ef0a
8 changed files with 266 additions and 95 deletions
|
|
@ -4,7 +4,7 @@ use karapace_schema::manifest::{
|
|||
parse_manifest_str, BaseSection, GuiSection, HardwareSection, ManifestV1, MountsSection,
|
||||
RuntimeSection, SystemSection,
|
||||
};
|
||||
use std::io::{stdin, IsTerminal};
|
||||
use std::io::{stderr, stdin, IsTerminal};
|
||||
use std::path::{Path, PathBuf};
|
||||
use tempfile::NamedTempFile;
|
||||
|
||||
|
|
@ -87,12 +87,14 @@ fn print_result(name: &str, template: Option<&str>, json: bool) -> Result<(), St
|
|||
|
||||
pub fn run(name: &str, template: Option<&str>, force: bool, json: bool) -> Result<u8, String> {
|
||||
let dest = Path::new(DEST_MANIFEST);
|
||||
let is_tty = stdin().is_terminal();
|
||||
ensure_can_write(dest, force, is_tty)?;
|
||||
let is_tty = stdin().is_terminal() && stderr().is_terminal();
|
||||
|
||||
let mut manifest = if let Some(tpl) = template {
|
||||
load_template(tpl)?
|
||||
let m = load_template(tpl)?;
|
||||
ensure_can_write(dest, force, is_tty)?;
|
||||
m
|
||||
} else {
|
||||
ensure_can_write(dest, force, is_tty)?;
|
||||
if !is_tty {
|
||||
return Err("no --template provided and stdin is not a TTY".to_owned());
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,12 +1,8 @@
|
|||
use super::{resolve_env_id_pretty, EXIT_SUCCESS};
|
||||
use karapace_core::{Engine, StoreLock};
|
||||
use karapace_store::StoreLayout;
|
||||
use karapace_core::Engine;
|
||||
use std::path::Path;
|
||||
|
||||
pub fn run(engine: &Engine, store_path: &Path, env_id: &str) -> Result<u8, String> {
|
||||
let layout = StoreLayout::new(store_path);
|
||||
let _lock = StoreLock::acquire(&layout.lock_file()).map_err(|e| format!("store lock: {e}"))?;
|
||||
|
||||
pub fn run(engine: &Engine, _store_path: &Path, env_id: &str) -> Result<u8, String> {
|
||||
let resolved = resolve_env_id_pretty(engine, env_id)?;
|
||||
engine.stop(&resolved).map_err(|e| e.to_string())?;
|
||||
println!("stopped environment {env_id}");
|
||||
|
|
|
|||
|
|
@ -56,27 +56,39 @@ impl Engine {
|
|||
let layer_store = LayerStore::new(layout.clone());
|
||||
let wal = WriteAheadLog::new(&layout);
|
||||
|
||||
// Recover any incomplete operations from a previous crash
|
||||
if let Err(e) = wal.recover() {
|
||||
warn!("WAL recovery failed: {e}");
|
||||
}
|
||||
// Recovery mutates the store and must not run concurrently with a live
|
||||
// operation holding the store lock (e.g. an interactive `enter`).
|
||||
match StoreLock::try_acquire(&layout.lock_file()) {
|
||||
Ok(Some(_lock)) => {
|
||||
// Recover any incomplete operations from a previous crash
|
||||
if let Err(e) = wal.recover() {
|
||||
warn!("WAL recovery failed: {e}");
|
||||
}
|
||||
|
||||
// Clean up stale .running markers left by a crash during enter/exec.
|
||||
// After WAL recovery, any env still marked Running was mid-operation.
|
||||
let env_base = layout.env_dir();
|
||||
if env_base.exists() {
|
||||
if let Ok(entries) = std::fs::read_dir(&env_base) {
|
||||
for entry in entries.flatten() {
|
||||
let running_marker = entry.path().join(".running");
|
||||
if running_marker.exists() {
|
||||
debug!(
|
||||
"removing stale .running marker: {}",
|
||||
running_marker.display()
|
||||
);
|
||||
let _ = std::fs::remove_file(&running_marker);
|
||||
// Clean up stale .running markers left by a crash during enter/exec.
|
||||
// After WAL recovery, any env still marked Running was mid-operation.
|
||||
let env_base = layout.env_dir();
|
||||
if env_base.exists() {
|
||||
if let Ok(entries) = std::fs::read_dir(&env_base) {
|
||||
for entry in entries.flatten() {
|
||||
let running_marker = entry.path().join(".running");
|
||||
if running_marker.exists() {
|
||||
debug!(
|
||||
"removing stale .running marker: {}",
|
||||
running_marker.display()
|
||||
);
|
||||
let _ = std::fs::remove_file(&running_marker);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(None) => {
|
||||
debug!("store lock held; skipping WAL recovery and stale marker cleanup");
|
||||
}
|
||||
Err(e) => {
|
||||
warn!("store lock check failed; skipping WAL recovery: {e}");
|
||||
}
|
||||
}
|
||||
|
||||
let store_root_str = root.to_string_lossy().into_owned();
|
||||
|
|
|
|||
|
|
@ -1,4 +1,5 @@
|
|||
use crate::{BlobKind, RemoteBackend, RemoteConfig, RemoteError};
|
||||
use std::io::Read;
|
||||
|
||||
/// HTTP-based remote store backend.
|
||||
///
|
||||
|
|
@ -56,9 +57,10 @@ impl HttpBackend {
|
|||
req = req.header("Authorization", &format!("Bearer {token}"));
|
||||
}
|
||||
let resp = req.call().map_err(|e| RemoteError::Http(e.to_string()))?;
|
||||
let body = resp
|
||||
.into_body()
|
||||
.read_to_vec()
|
||||
let mut reader = resp.into_body().into_reader();
|
||||
let mut body = Vec::new();
|
||||
reader
|
||||
.read_to_end(&mut body)
|
||||
.map_err(|e| RemoteError::Http(e.to_string()))?;
|
||||
Ok(body)
|
||||
}
|
||||
|
|
@ -195,7 +197,7 @@ mod tests {
|
|||
|
||||
let mut body = vec![0u8; content_length];
|
||||
if content_length > 0 {
|
||||
let _ = std::io::Read::read_exact(&mut reader, &mut body);
|
||||
let _ = reader.read_exact(&mut body);
|
||||
}
|
||||
|
||||
let mut data = store.lock().unwrap();
|
||||
|
|
|
|||
|
|
@ -5,12 +5,14 @@ use crate::image::{
|
|||
parse_version_output, query_versions_command, resolve_image, ImageCache,
|
||||
};
|
||||
use crate::sandbox::{
|
||||
enter_interactive, exec_in_container, install_packages_in_container, mount_overlay,
|
||||
setup_container_rootfs, unmount_overlay, SandboxConfig,
|
||||
exec_in_container, install_packages_in_container, mount_overlay, setup_container_rootfs,
|
||||
spawn_enter_interactive, unmount_overlay, SandboxConfig,
|
||||
};
|
||||
use crate::terminal;
|
||||
use crate::RuntimeError;
|
||||
use karapace_schema::{ResolutionResult, ResolvedPackage};
|
||||
use libc::{SIGKILL, SIGTERM};
|
||||
use std::os::unix::process::ExitStatusExt;
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
pub struct NamespaceBackend {
|
||||
|
|
@ -237,13 +239,8 @@ impl RuntimeBackend for NamespaceBackend {
|
|||
|
||||
// Mount overlay
|
||||
mount_overlay(&sandbox)?;
|
||||
|
||||
// Set up rootfs
|
||||
setup_container_rootfs(&sandbox)?;
|
||||
|
||||
// Mark as running
|
||||
std::fs::write(env_dir.join(".running"), format!("{}", std::process::id()))?;
|
||||
|
||||
// Emit terminal markers
|
||||
terminal::emit_container_push(&spec.env_id, &sandbox.hostname);
|
||||
terminal::print_container_banner(
|
||||
|
|
@ -252,8 +249,38 @@ impl RuntimeBackend for NamespaceBackend {
|
|||
&sandbox.hostname,
|
||||
);
|
||||
|
||||
// Enter the container interactively
|
||||
let exit_code = enter_interactive(&sandbox);
|
||||
// Spawn the sandbox so we can record the host PID for `stop`.
|
||||
let mut child = match spawn_enter_interactive(&sandbox) {
|
||||
Ok(c) => c,
|
||||
Err(e) => {
|
||||
terminal::emit_container_pop();
|
||||
terminal::print_container_exit(&spec.env_id);
|
||||
let _ = unmount_overlay(&sandbox);
|
||||
return Err(e);
|
||||
}
|
||||
};
|
||||
|
||||
if let Err(e) = std::fs::write(env_dir.join(".running"), format!("{}", child.id())) {
|
||||
let _ = child.kill();
|
||||
terminal::emit_container_pop();
|
||||
terminal::print_container_exit(&spec.env_id);
|
||||
let _ = unmount_overlay(&sandbox);
|
||||
return Err(e.into());
|
||||
}
|
||||
|
||||
// Wait for the interactive session to complete.
|
||||
let exit_code = match child.wait() {
|
||||
Ok(status) => {
|
||||
let code = status.code().unwrap_or_else(|| match status.signal() {
|
||||
Some(sig) if sig == SIGTERM || sig == SIGKILL => 0,
|
||||
_ => 1,
|
||||
});
|
||||
Ok(code)
|
||||
}
|
||||
Err(e) => Err(RuntimeError::ExecFailed(format!(
|
||||
"failed to wait for sandbox: {e}"
|
||||
))),
|
||||
};
|
||||
|
||||
// Cleanup
|
||||
terminal::emit_container_pop();
|
||||
|
|
|
|||
|
|
@ -11,7 +11,7 @@ use crate::sandbox::{
|
|||
use crate::terminal;
|
||||
use crate::RuntimeError;
|
||||
use karapace_schema::{ResolutionResult, ResolvedPackage};
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::path::PathBuf;
|
||||
use std::process::Command;
|
||||
|
||||
pub struct OciBackend {
|
||||
|
|
@ -451,35 +451,45 @@ impl RuntimeBackend for OciBackend {
|
|||
}
|
||||
|
||||
fn status(&self, env_id: &str) -> Result<RuntimeStatus, RuntimeError> {
|
||||
let env_dir = self.env_dir(env_id);
|
||||
let running_file = env_dir.join(".running");
|
||||
let runtime = Self::find_runtime().ok_or_else(|| {
|
||||
RuntimeError::BackendUnavailable("no OCI runtime found (crun/runc/youki)".to_owned())
|
||||
})?;
|
||||
|
||||
if running_file.exists() {
|
||||
let pid_str = std::fs::read_to_string(&running_file).unwrap_or_default();
|
||||
let pid = pid_str.trim().parse::<u32>().ok();
|
||||
if pid.is_none() && !pid_str.trim().is_empty() {
|
||||
tracing::warn!(
|
||||
"corrupt .running file for {}: could not parse PID from '{}'",
|
||||
&env_id[..12.min(env_id.len())],
|
||||
pid_str.trim()
|
||||
);
|
||||
}
|
||||
if let Some(p) = pid {
|
||||
if Path::new(&format!("/proc/{p}")).exists() {
|
||||
return Ok(RuntimeStatus {
|
||||
env_id: env_id.to_owned(),
|
||||
running: true,
|
||||
pid: Some(p),
|
||||
});
|
||||
}
|
||||
let _ = std::fs::remove_file(&running_file);
|
||||
let container_id = format!("karapace-{}", &env_id[..12.min(env_id.len())]);
|
||||
let output = Command::new(&runtime)
|
||||
.args(["state", &container_id])
|
||||
.output()?;
|
||||
|
||||
if !output.status.success() {
|
||||
let stderr = String::from_utf8_lossy(&output.stderr);
|
||||
let msg = stderr.to_lowercase();
|
||||
if msg.contains("does not exist") || msg.contains("not found") {
|
||||
return Ok(RuntimeStatus {
|
||||
env_id: env_id.to_owned(),
|
||||
running: false,
|
||||
pid: None,
|
||||
});
|
||||
}
|
||||
return Err(RuntimeError::ExecFailed(format!(
|
||||
"{runtime} state failed: {}",
|
||||
stderr.trim()
|
||||
)));
|
||||
}
|
||||
|
||||
let state: serde_json::Value = serde_json::from_slice(&output.stdout).map_err(|e| {
|
||||
RuntimeError::ExecFailed(format!("failed to parse {runtime} state output: {e}"))
|
||||
})?;
|
||||
|
||||
let pid = state
|
||||
.get("pid")
|
||||
.and_then(serde_json::Value::as_u64)
|
||||
.and_then(|p| u32::try_from(p).ok())
|
||||
.filter(|p| *p != 0);
|
||||
|
||||
Ok(RuntimeStatus {
|
||||
env_id: env_id.to_owned(),
|
||||
running: false,
|
||||
pid: None,
|
||||
running: pid.is_some(),
|
||||
pid,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -248,7 +248,14 @@ fn ensure_user_in_container(config: &SandboxConfig, merged: &Path) -> Result<(),
|
|||
|
||||
fn build_unshare_command(config: &SandboxConfig) -> Command {
|
||||
let mut cmd = Command::new("unshare");
|
||||
cmd.args(["--user", "--map-root-user", "--mount", "--pid", "--fork"]);
|
||||
cmd.args([
|
||||
"--user",
|
||||
"--map-root-user",
|
||||
"--mount",
|
||||
"--pid",
|
||||
"--fork",
|
||||
"--kill-child=SIGTERM",
|
||||
]);
|
||||
|
||||
if config.isolate_network {
|
||||
cmd.arg("--net");
|
||||
|
|
@ -343,7 +350,7 @@ fn build_setup_script(config: &SandboxConfig) -> String {
|
|||
}
|
||||
|
||||
// Chroot and exec
|
||||
let _ = write!(script, "exec chroot {qm} /bin/sh -c '");
|
||||
let _ = writeln!(script, "exec chroot {qm} /bin/sh -s <<'__KARAPACE_EOF__'");
|
||||
|
||||
script
|
||||
}
|
||||
|
|
@ -409,7 +416,10 @@ pub fn enter_interactive(config: &SandboxConfig) -> Result<i32, RuntimeError> {
|
|||
"/bin/sh"
|
||||
};
|
||||
|
||||
let _ = write!(setup, "{env_exports}cd ~; exec {shell} -l'");
|
||||
let _ = write!(
|
||||
setup,
|
||||
"{env_exports}cd ~; exec {shell} -l </dev/tty >/dev/tty 2>/dev/tty\n__KARAPACE_EOF__\n"
|
||||
);
|
||||
|
||||
let mut cmd = build_unshare_command(config);
|
||||
cmd.arg("/bin/sh").arg("-c").arg(&setup);
|
||||
|
|
@ -426,6 +436,86 @@ pub fn enter_interactive(config: &SandboxConfig) -> Result<i32, RuntimeError> {
|
|||
Ok(status.code().unwrap_or(1))
|
||||
}
|
||||
|
||||
pub fn spawn_enter_interactive(
|
||||
config: &SandboxConfig,
|
||||
) -> Result<std::process::Child, RuntimeError> {
|
||||
let merged = &config.overlay_merged;
|
||||
|
||||
let mut setup = build_setup_script(config);
|
||||
|
||||
// Build environment variable exports (all values shell-quoted, keys validated)
|
||||
let mut env_exports = String::new();
|
||||
for (key, val) in &config.env_vars {
|
||||
if !key.bytes().all(|b| b.is_ascii_alphanumeric() || b == b'_') {
|
||||
continue; // Skip keys with unsafe characters
|
||||
}
|
||||
let _ = write!(env_exports, "export {}={}; ", key, shell_quote(val));
|
||||
}
|
||||
|
||||
// Set standard env vars (all values shell-quoted)
|
||||
let _ = write!(
|
||||
env_exports,
|
||||
"export HOME={}; ",
|
||||
shell_quote_path(&config.home_dir)
|
||||
);
|
||||
let _ = write!(
|
||||
env_exports,
|
||||
"export USER={}; ",
|
||||
shell_quote(&config.username)
|
||||
);
|
||||
let _ = write!(
|
||||
env_exports,
|
||||
"export HOSTNAME={}; ",
|
||||
shell_quote(&config.hostname)
|
||||
);
|
||||
if let Ok(xdg) = std::env::var("XDG_RUNTIME_DIR") {
|
||||
let _ = write!(
|
||||
env_exports,
|
||||
"export XDG_RUNTIME_DIR={}; ",
|
||||
shell_quote(&xdg)
|
||||
);
|
||||
}
|
||||
if let Ok(display) = std::env::var("DISPLAY") {
|
||||
let _ = write!(env_exports, "export DISPLAY={}; ", shell_quote(&display));
|
||||
}
|
||||
if let Ok(wayland) = std::env::var("WAYLAND_DISPLAY") {
|
||||
let _ = write!(
|
||||
env_exports,
|
||||
"export WAYLAND_DISPLAY={}; ",
|
||||
shell_quote(&wayland)
|
||||
);
|
||||
}
|
||||
env_exports.push_str("export TERM=${TERM:-xterm-256color}; ");
|
||||
let _ = write!(
|
||||
env_exports,
|
||||
"export KARAPACE_ENV=1; export KARAPACE_HOSTNAME={}; ",
|
||||
shell_quote(&config.hostname)
|
||||
);
|
||||
|
||||
// Determine shell
|
||||
let shell = if merged.join("bin/bash").exists() || merged.join("usr/bin/bash").exists() {
|
||||
"/bin/bash"
|
||||
} else {
|
||||
"/bin/sh"
|
||||
};
|
||||
|
||||
let _ = write!(
|
||||
setup,
|
||||
"{env_exports}cd ~; exec {shell} -l </dev/tty >/dev/tty 2>/dev/tty\n__KARAPACE_EOF__\n"
|
||||
);
|
||||
|
||||
let mut cmd = build_unshare_command(config);
|
||||
cmd.arg("/bin/sh").arg("-c").arg(&setup);
|
||||
|
||||
// Pass through stdin/stdout/stderr for interactive use
|
||||
cmd.stdin(std::process::Stdio::inherit());
|
||||
cmd.stdout(std::process::Stdio::inherit());
|
||||
cmd.stderr(std::process::Stdio::inherit());
|
||||
|
||||
cmd.spawn()
|
||||
.map_err(|e| RuntimeError::ExecFailed(format!("failed to spawn sandbox: {e}")))
|
||||
}
|
||||
|
||||
pub fn exec_in_container(
|
||||
config: &SandboxConfig,
|
||||
command: &[String],
|
||||
|
|
@ -453,7 +543,11 @@ pub fn exec_in_container(
|
|||
env_exports.push_str("export KARAPACE_ENV=1; ");
|
||||
|
||||
let escaped_cmd: Vec<String> = command.iter().map(|a| shell_quote(a)).collect();
|
||||
let _ = write!(setup, "{env_exports}{}'", escaped_cmd.join(" "));
|
||||
let _ = write!(
|
||||
setup,
|
||||
"{env_exports}{}\n__KARAPACE_EOF__\n",
|
||||
escaped_cmd.join(" ")
|
||||
);
|
||||
|
||||
let mut cmd = build_unshare_command(config);
|
||||
cmd.arg("/bin/sh").arg("-c").arg(&setup);
|
||||
|
|
|
|||
|
|
@ -1,4 +1,5 @@
|
|||
use crate::layout::StoreLayout;
|
||||
use crate::metadata::{EnvMetadata, EnvState, MetadataStore};
|
||||
use crate::StoreError;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::fs;
|
||||
|
|
@ -7,6 +8,17 @@ use std::path::PathBuf;
|
|||
use tempfile::NamedTempFile;
|
||||
use tracing::{debug, info, warn};
|
||||
|
||||
fn parse_env_state(s: &str) -> Option<EnvState> {
|
||||
match s {
|
||||
"Defined" | "defined" => Some(EnvState::Defined),
|
||||
"Built" | "built" => Some(EnvState::Built),
|
||||
"Running" | "running" => Some(EnvState::Running),
|
||||
"Frozen" | "frozen" => Some(EnvState::Frozen),
|
||||
"Archived" | "archived" => Some(EnvState::Archived),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
/// A single rollback step that can undo part of an operation.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub enum RollbackStep {
|
||||
|
|
@ -196,34 +208,50 @@ impl WriteAheadLog {
|
|||
env_id,
|
||||
target_state,
|
||||
} => {
|
||||
// Resolve metadata dir from wal_dir (wal_dir = root/store/wal)
|
||||
if let Some(store_dir) = self.wal_dir.parent() {
|
||||
let metadata_dir = store_dir.join("metadata");
|
||||
let meta_path = metadata_dir.join(env_id);
|
||||
if meta_path.exists() {
|
||||
match fs::read_to_string(&meta_path) {
|
||||
Ok(content) => {
|
||||
if let Ok(mut meta) =
|
||||
serde_json::from_str::<serde_json::Value>(&content)
|
||||
{
|
||||
meta["state"] =
|
||||
serde_json::Value::String(target_state.clone());
|
||||
if let Ok(updated) = serde_json::to_string_pretty(&meta) {
|
||||
if let Err(e) = fs::write(&meta_path, updated) {
|
||||
warn!("WAL rollback: failed to reset state for {env_id}: {e}");
|
||||
} else {
|
||||
debug!("WAL rollback: reset {env_id} state to {target_state}");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
warn!(
|
||||
"WAL rollback: failed to read metadata for {env_id}: {e}"
|
||||
);
|
||||
}
|
||||
}
|
||||
let Some(new_state) = parse_env_state(target_state) else {
|
||||
warn!("WAL rollback: unknown target state '{target_state}' for {env_id}");
|
||||
continue;
|
||||
};
|
||||
|
||||
// wal_dir = <root>/store/wal
|
||||
let Some(store_dir) = self.wal_dir.parent() else {
|
||||
continue;
|
||||
};
|
||||
let Some(root_dir) = store_dir.parent() else {
|
||||
continue;
|
||||
};
|
||||
|
||||
let meta_path = store_dir.join("metadata").join(env_id);
|
||||
if !meta_path.exists() {
|
||||
continue;
|
||||
}
|
||||
|
||||
let content = match fs::read_to_string(&meta_path) {
|
||||
Ok(c) => c,
|
||||
Err(e) => {
|
||||
warn!("WAL rollback: failed to read metadata for {env_id}: {e}");
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
let mut meta: EnvMetadata = match serde_json::from_str(&content) {
|
||||
Ok(m) => m,
|
||||
Err(e) => {
|
||||
warn!("WAL rollback: failed to parse metadata for {env_id}: {e}");
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
meta.state = new_state;
|
||||
meta.updated_at = chrono::Utc::now().to_rfc3339();
|
||||
meta.checksum = None;
|
||||
|
||||
let layout = StoreLayout::new(root_dir);
|
||||
let meta_store = MetadataStore::new(layout);
|
||||
if let Err(e) = meta_store.put(&meta) {
|
||||
warn!("WAL rollback: failed to persist metadata for {env_id}: {e}");
|
||||
} else {
|
||||
debug!("WAL rollback: reset {env_id} state to {target_state}");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in a new issue