fix(core): make stop robust to stale state

- Stop can now succeed when metadata is stale but runtime indicates running\n- Only uses .running PID fallback for namespace/mock; avoids OCI PID misuse\n- Preserves non-Running states (e.g. Frozen) after stopping\n- Adds integration tests for stale metadata and Frozen stop
This commit is contained in:
Marco Allegretti 2026-02-25 19:26:22 +01:00
parent 598d07239e
commit d78a770502
2 changed files with 142 additions and 7 deletions

View file

@ -498,7 +498,58 @@ impl Engine {
.get(env_id)
.map_err(|_| CoreError::EnvNotFound(env_id.to_owned()))?;
if meta.state != EnvState::Running {
let target_state = if meta.state == EnvState::Running {
EnvState::Built
} else {
meta.state
};
let normalized = self.load_manifest(&meta.manifest_hash)?;
let store_str = self.store_root_str.clone();
let backend = select_backend(&normalized.runtime_backend, &store_str)?;
let running_file = self.layout.env_path(env_id).join(".running");
let has_marker = running_file.exists();
let allow_marker_pid_fallback =
normalized.runtime_backend == "namespace" || normalized.runtime_backend == "mock";
let marker_pid = if has_marker && allow_marker_pid_fallback {
match std::fs::read_to_string(&running_file) {
Ok(s) => {
let pid_str = s.trim();
let pid = pid_str.parse::<u32>().ok();
if let Some(p) = pid {
if Path::new(&format!("/proc/{p}")).exists() {
Some(p)
} else {
let _ = std::fs::remove_file(&running_file);
None
}
} else {
if !pid_str.is_empty() {
let _ = std::fs::remove_file(&running_file);
}
None
}
}
Err(_) => None,
}
} else {
None
};
let status = backend.status(env_id);
let (runtime_running, runtime_pid) = match status {
Ok(s) => (s.running, s.pid),
Err(e) => {
if meta.state == EnvState::Running || has_marker {
return Err(e.into());
}
(false, None)
}
};
if meta.state != EnvState::Running && !runtime_running && marker_pid.is_none() {
return Err(CoreError::Runtime(
karapace_runtime::RuntimeError::NotRunning(format!(
"{} (state: {})",
@ -507,12 +558,9 @@ impl Engine {
));
}
let normalized = self.load_manifest(&meta.manifest_hash)?;
let store_str = self.store_root_str.clone();
let backend = select_backend(&normalized.runtime_backend, &store_str)?;
let status = backend.status(env_id)?;
let pid_to_kill = runtime_pid.or(marker_pid);
if let Some(pid) = status.pid {
if let Some(pid) = pid_to_kill {
let pid_i32 = i32::try_from(pid).map_err(|_| {
CoreError::Runtime(karapace_runtime::RuntimeError::ExecFailed(format!(
"invalid pid {pid}: exceeds i32 range"
@ -556,7 +604,7 @@ impl Engine {
let running_file = self.layout.env_path(env_id).join(".running");
let _ = std::fs::remove_file(running_file);
self.meta_store.update_state(env_id, EnvState::Built)?;
self.meta_store.update_state(env_id, target_state)?;
Ok(())
}

View file

@ -1578,6 +1578,93 @@ fn stop_non_running_env_returns_error() {
);
}
#[test]
fn stop_succeeds_when_metadata_stale_but_running_marker_exists() {
let store = tempfile::tempdir().unwrap();
let project = tempfile::tempdir().unwrap();
let engine = Engine::new(store.path());
let manifest = write_manifest(project.path(), &mock_manifest(&["git"]));
let r = engine.build(&manifest).unwrap();
let env_id = r.identity.env_id.to_string();
// Spawn a real sleep process and wait on it later to avoid zombie
let mut child = std::process::Command::new("sleep")
.arg("60")
.spawn()
.expect("spawn sleep");
let pid = child.id();
let pid_i32 = i32::try_from(pid).expect("pid fits in i32");
// Leave metadata state as Built but write .running marker with real PID.
let env_dir = store.path().join("env").join(&env_id);
fs::create_dir_all(&env_dir).unwrap();
fs::write(env_dir.join(".running"), pid.to_string()).unwrap();
let result = engine.stop(&env_id);
// Clean up the real process regardless of result
unsafe {
libc::kill(pid_i32, libc::SIGKILL);
}
let _ = child.wait();
assert!(
result.is_ok(),
"stop must succeed when .running exists even if metadata is stale: {:?}",
result.err()
);
// Verify state remains Built
let meta = engine.inspect(&env_id).unwrap();
assert_eq!(meta.state, EnvState::Built);
}
#[test]
fn stop_preserves_frozen_state() {
let store = tempfile::tempdir().unwrap();
let project = tempfile::tempdir().unwrap();
let engine = Engine::new(store.path());
let manifest = write_manifest(project.path(), &mock_manifest(&["git"]));
let r = engine.build(&manifest).unwrap();
let env_id = r.identity.env_id.to_string();
// Spawn a real sleep process and wait on it later to avoid zombie
let mut child = std::process::Command::new("sleep")
.arg("60")
.spawn()
.expect("spawn sleep");
let pid = child.id();
let pid_i32 = i32::try_from(pid).expect("pid fits in i32");
// Set metadata to Frozen and write .running marker with real PID.
let layout = StoreLayout::new(store.path());
let meta_store = karapace_store::MetadataStore::new(layout.clone());
meta_store.update_state(&env_id, EnvState::Frozen).unwrap();
let env_dir = store.path().join("env").join(&env_id);
fs::create_dir_all(&env_dir).unwrap();
fs::write(env_dir.join(".running"), pid.to_string()).unwrap();
let result = engine.stop(&env_id);
// Clean up the real process regardless of result
unsafe {
libc::kill(pid_i32, libc::SIGKILL);
}
let _ = child.wait();
assert!(
result.is_ok(),
"stop must succeed even if env is frozen but still running: {:?}",
result.err()
);
let meta = engine.inspect(&env_id).unwrap();
assert_eq!(meta.state, EnvState::Frozen);
}
#[test]
fn stale_running_marker_cleaned_on_engine_new() {
let store = tempfile::tempdir().unwrap();