mirror of
https://github.com/marcoallegretti/karapace.git
synced 2026-03-27 05:53:10 +00:00
fix(core): make stop robust to stale state
- Stop can now succeed when metadata is stale but runtime indicates running\n- Only uses .running PID fallback for namespace/mock; avoids OCI PID misuse\n- Preserves non-Running states (e.g. Frozen) after stopping\n- Adds integration tests for stale metadata and Frozen stop
This commit is contained in:
parent
598d07239e
commit
d78a770502
2 changed files with 142 additions and 7 deletions
|
|
@ -498,7 +498,58 @@ impl Engine {
|
|||
.get(env_id)
|
||||
.map_err(|_| CoreError::EnvNotFound(env_id.to_owned()))?;
|
||||
|
||||
if meta.state != EnvState::Running {
|
||||
let target_state = if meta.state == EnvState::Running {
|
||||
EnvState::Built
|
||||
} else {
|
||||
meta.state
|
||||
};
|
||||
|
||||
let normalized = self.load_manifest(&meta.manifest_hash)?;
|
||||
let store_str = self.store_root_str.clone();
|
||||
let backend = select_backend(&normalized.runtime_backend, &store_str)?;
|
||||
|
||||
let running_file = self.layout.env_path(env_id).join(".running");
|
||||
let has_marker = running_file.exists();
|
||||
let allow_marker_pid_fallback =
|
||||
normalized.runtime_backend == "namespace" || normalized.runtime_backend == "mock";
|
||||
|
||||
let marker_pid = if has_marker && allow_marker_pid_fallback {
|
||||
match std::fs::read_to_string(&running_file) {
|
||||
Ok(s) => {
|
||||
let pid_str = s.trim();
|
||||
let pid = pid_str.parse::<u32>().ok();
|
||||
if let Some(p) = pid {
|
||||
if Path::new(&format!("/proc/{p}")).exists() {
|
||||
Some(p)
|
||||
} else {
|
||||
let _ = std::fs::remove_file(&running_file);
|
||||
None
|
||||
}
|
||||
} else {
|
||||
if !pid_str.is_empty() {
|
||||
let _ = std::fs::remove_file(&running_file);
|
||||
}
|
||||
None
|
||||
}
|
||||
}
|
||||
Err(_) => None,
|
||||
}
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
let status = backend.status(env_id);
|
||||
let (runtime_running, runtime_pid) = match status {
|
||||
Ok(s) => (s.running, s.pid),
|
||||
Err(e) => {
|
||||
if meta.state == EnvState::Running || has_marker {
|
||||
return Err(e.into());
|
||||
}
|
||||
(false, None)
|
||||
}
|
||||
};
|
||||
|
||||
if meta.state != EnvState::Running && !runtime_running && marker_pid.is_none() {
|
||||
return Err(CoreError::Runtime(
|
||||
karapace_runtime::RuntimeError::NotRunning(format!(
|
||||
"{} (state: {})",
|
||||
|
|
@ -507,12 +558,9 @@ impl Engine {
|
|||
));
|
||||
}
|
||||
|
||||
let normalized = self.load_manifest(&meta.manifest_hash)?;
|
||||
let store_str = self.store_root_str.clone();
|
||||
let backend = select_backend(&normalized.runtime_backend, &store_str)?;
|
||||
let status = backend.status(env_id)?;
|
||||
let pid_to_kill = runtime_pid.or(marker_pid);
|
||||
|
||||
if let Some(pid) = status.pid {
|
||||
if let Some(pid) = pid_to_kill {
|
||||
let pid_i32 = i32::try_from(pid).map_err(|_| {
|
||||
CoreError::Runtime(karapace_runtime::RuntimeError::ExecFailed(format!(
|
||||
"invalid pid {pid}: exceeds i32 range"
|
||||
|
|
@ -556,7 +604,7 @@ impl Engine {
|
|||
let running_file = self.layout.env_path(env_id).join(".running");
|
||||
let _ = std::fs::remove_file(running_file);
|
||||
|
||||
self.meta_store.update_state(env_id, EnvState::Built)?;
|
||||
self.meta_store.update_state(env_id, target_state)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -1578,6 +1578,93 @@ fn stop_non_running_env_returns_error() {
|
|||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn stop_succeeds_when_metadata_stale_but_running_marker_exists() {
|
||||
let store = tempfile::tempdir().unwrap();
|
||||
let project = tempfile::tempdir().unwrap();
|
||||
let engine = Engine::new(store.path());
|
||||
|
||||
let manifest = write_manifest(project.path(), &mock_manifest(&["git"]));
|
||||
let r = engine.build(&manifest).unwrap();
|
||||
let env_id = r.identity.env_id.to_string();
|
||||
|
||||
// Spawn a real sleep process and wait on it later to avoid zombie
|
||||
let mut child = std::process::Command::new("sleep")
|
||||
.arg("60")
|
||||
.spawn()
|
||||
.expect("spawn sleep");
|
||||
let pid = child.id();
|
||||
let pid_i32 = i32::try_from(pid).expect("pid fits in i32");
|
||||
|
||||
// Leave metadata state as Built but write .running marker with real PID.
|
||||
let env_dir = store.path().join("env").join(&env_id);
|
||||
fs::create_dir_all(&env_dir).unwrap();
|
||||
fs::write(env_dir.join(".running"), pid.to_string()).unwrap();
|
||||
|
||||
let result = engine.stop(&env_id);
|
||||
|
||||
// Clean up the real process regardless of result
|
||||
unsafe {
|
||||
libc::kill(pid_i32, libc::SIGKILL);
|
||||
}
|
||||
let _ = child.wait();
|
||||
|
||||
assert!(
|
||||
result.is_ok(),
|
||||
"stop must succeed when .running exists even if metadata is stale: {:?}",
|
||||
result.err()
|
||||
);
|
||||
|
||||
// Verify state remains Built
|
||||
let meta = engine.inspect(&env_id).unwrap();
|
||||
assert_eq!(meta.state, EnvState::Built);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn stop_preserves_frozen_state() {
|
||||
let store = tempfile::tempdir().unwrap();
|
||||
let project = tempfile::tempdir().unwrap();
|
||||
let engine = Engine::new(store.path());
|
||||
|
||||
let manifest = write_manifest(project.path(), &mock_manifest(&["git"]));
|
||||
let r = engine.build(&manifest).unwrap();
|
||||
let env_id = r.identity.env_id.to_string();
|
||||
|
||||
// Spawn a real sleep process and wait on it later to avoid zombie
|
||||
let mut child = std::process::Command::new("sleep")
|
||||
.arg("60")
|
||||
.spawn()
|
||||
.expect("spawn sleep");
|
||||
let pid = child.id();
|
||||
let pid_i32 = i32::try_from(pid).expect("pid fits in i32");
|
||||
|
||||
// Set metadata to Frozen and write .running marker with real PID.
|
||||
let layout = StoreLayout::new(store.path());
|
||||
let meta_store = karapace_store::MetadataStore::new(layout.clone());
|
||||
meta_store.update_state(&env_id, EnvState::Frozen).unwrap();
|
||||
|
||||
let env_dir = store.path().join("env").join(&env_id);
|
||||
fs::create_dir_all(&env_dir).unwrap();
|
||||
fs::write(env_dir.join(".running"), pid.to_string()).unwrap();
|
||||
|
||||
let result = engine.stop(&env_id);
|
||||
|
||||
// Clean up the real process regardless of result
|
||||
unsafe {
|
||||
libc::kill(pid_i32, libc::SIGKILL);
|
||||
}
|
||||
let _ = child.wait();
|
||||
|
||||
assert!(
|
||||
result.is_ok(),
|
||||
"stop must succeed even if env is frozen but still running: {:?}",
|
||||
result.err()
|
||||
);
|
||||
|
||||
let meta = engine.inspect(&env_id).unwrap();
|
||||
assert_eq!(meta.state, EnvState::Frozen);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn stale_running_marker_cleaned_on_engine_new() {
|
||||
let store = tempfile::tempdir().unwrap();
|
||||
|
|
|
|||
Loading…
Reference in a new issue