fix: devil's advocate R3 — reap PID-suffixed leftovers from dead processes

R3 final pass surfaced one issue, addressed:

R2#2 introduced PID-suffixed <target>.{pid}.tmp / .{pid}.partN to
prevent concurrent agnes pull invocations from yanking each other's
in-progress writes. The pre-clean inside _download_chunked /
_download_single_stream only deletes leftovers from the CURRENT
process's PID — files from a SIGKILL'd or crashed prior pull (any
other PID) are never touched and accumulate on disk forever.

Add _reap_dead_pid_leftovers(target_path) called at the start of both
download paths. Globs <target>.*.tmp / <target>.*.partN, extracts the
embedded PID, calls os.kill(pid, 0) to test liveness (POSIX standard
no-op probe), and unlinks files whose process no longer exists.
Permission-denied = process is alive but owned by another user → keep
the file (conservative). Windows users get the conservative 'keep'
default.

Two new tests pin the behavior — live-PID file preserved, dead-PID
.tmp + .partN reaped, bare-name (legacy) untouched, garbage filenames
skipped without raise.
This commit is contained in:
ZdenekSrotyr 2026-05-06 14:04:47 +02:00
parent aee585fac6
commit 77d88014df
2 changed files with 113 additions and 0 deletions

View file

@ -1,7 +1,9 @@
"""HTTP client wrapper for CLI — handles auth, retries, streaming.""" """HTTP client wrapper for CLI — handles auth, retries, streaming."""
import atexit import atexit
import glob
import os import os
import re
import threading import threading
import time import time
import traceback import traceback
@ -14,6 +16,60 @@ import httpx
from cli.config import _config_dir, get_server_url, get_token from cli.config import _config_dir, get_server_url, get_token
# PID-suffixed tmp / part files — see `_download_chunked` and
# `_download_single_stream`. We extract the embedded PID and reap any
# leftover whose process is no longer alive on every pull. Without this,
# every SIGKILL'd pull leaks files indefinitely (devil's-advocate R3
# finding #1).
_PID_SUFFIX_RE = re.compile(r"\.(\d+)\.(?:tmp|part\d+)$")
def _is_pid_alive(pid: int) -> bool:
"""Return True if a process with the given PID exists. POSIX-only;
Windows users get the conservative `True` (file kept) which means
no reaping but also no false-deletion of a live sibling."""
if pid <= 0:
return False
try:
# Signal 0 = no-op kill; raises ProcessLookupError when PID is
# gone, PermissionError when the PID exists but isn't ours
# (still alive, just owned by someone else — keep the file).
os.kill(pid, 0)
return True
except ProcessLookupError:
return False
except PermissionError:
return True
except Exception:
# Anything else (e.g. AttributeError on Windows where os.kill
# exists but signal 0 isn't supported the same way): be
# conservative and don't reap.
return True
def _reap_dead_pid_leftovers(target_path: str) -> None:
"""Remove `<target>.{pid}.tmp` and `<target>.{pid}.partN` files
whose embedded PID is no longer alive. Called at the start of every
download to keep the parquet directory tidy across SIGKILL'd or
crashed prior runs. Never raises leaked file is preferable to
failing the new pull on a permission error."""
candidates = glob.glob(f"{target_path}.*.tmp") + glob.glob(f"{target_path}.*.part*")
for path in candidates:
m = _PID_SUFFIX_RE.search(path)
if not m:
continue
try:
pid = int(m.group(1))
except ValueError:
continue
if _is_pid_alive(pid):
continue
try:
os.unlink(path)
except OSError:
pass
# Retry policy for transient failures during stream downloads. Scoped to # Retry policy for transient failures during stream downloads. Scoped to
# network issues and 5xx — 4xx (auth, 404, 400) is NOT retried. Tunable via # network issues and 5xx — 4xx (auth, 404, 400) is NOT retried. Tunable via
# env for tests; defaults sit in the "one flaky network blip" window. # env for tests; defaults sit in the "one flaky network blip" window.
@ -425,6 +481,10 @@ def _download_chunked(
caller's `<target>.tmp` and renamed atomically. caller's `<target>.tmp` and renamed atomically.
""" """
target = Path(target_path) target = Path(target_path)
# Reap leftovers from previously SIGKILL'd / crashed pulls before we
# start writing — without this, PID-suffixed files from dead PIDs
# accumulate forever on disk (devil's-advocate R3 finding #1).
_reap_dead_pid_leftovers(target_path)
# Per-process tmp + part suffixes (devil's-advocate R2 finding #2): # Per-process tmp + part suffixes (devil's-advocate R2 finding #2):
# if two `agnes pull` invocations target the same parquet # if two `agnes pull` invocations target the same parquet
# concurrently (e.g. SessionStart hook + manual run, or two # concurrently (e.g. SessionStart hook + manual run, or two
@ -520,6 +580,9 @@ def _download_single_stream(
) -> int: ) -> int:
"""Original single-stream path with retry. Used when chunking is """Original single-stream path with retry. Used when chunking is
disabled (small file, no range support, or fallback after 200-on-Range).""" disabled (small file, no range support, or fallback after 200-on-Range)."""
# Same dead-PID reap as `_download_chunked` so leftovers from
# crashed prior pulls don't accumulate indefinitely.
_reap_dead_pid_leftovers(target_path)
# Per-process tmp suffix — same rationale as `_download_chunked` # Per-process tmp suffix — same rationale as `_download_chunked`
# (devil's-advocate R2 finding #2): concurrent `agnes pull` # (devil's-advocate R2 finding #2): concurrent `agnes pull`
# invocations against the same target dir must not yank each # invocations against the same target dir must not yank each

View file

@ -347,3 +347,53 @@ def test_progress_callback_aggregates_across_chunks(tmp_path, monkeypatch):
stream_download("/api/data/x/download", str(target), stream_download("/api/data/x/download", str(target),
progress_callback=lambda n: advances.append(n)) progress_callback=lambda n: advances.append(n))
assert sum(advances) == len(body) assert sum(advances) == len(body)
def test_dead_pid_leftovers_are_reaped(tmp_path, monkeypatch):
"""Devil's-advocate R3 finding #1: PID-suffixed `<target>.{pid}.tmp`
and `.partN` files from a SIGKILL'd previous pull must be reaped on
the next pull, otherwise they accumulate on disk indefinitely.
PID 1 (init) is always alive, so a file with pid=1 must NOT be
reaped. PID 99999999 (~10) is essentially guaranteed not-alive on
any modern Linux/macOS used as the dead-PID marker.
"""
target = tmp_path / "out.bin"
# Live-PID leftover (pid=1 = init, always alive). Must NOT be reaped.
live_path = tmp_path / "out.bin.1.tmp"
live_path.write_bytes(b"live process leftover")
# Dead-PID leftovers — both .tmp and .part0 forms.
dead_tmp = tmp_path / "out.bin.99999999.tmp"
dead_tmp.write_bytes(b"dead process leftover tmp")
dead_part = tmp_path / "out.bin.99999999.part0"
dead_part.write_bytes(b"dead process leftover part")
# Bare-name leftover (no PID suffix) — pre-existing pattern, NOT
# touched by the new reaper. Reaper only matches `.{digits}.tmp`
# / `.{digits}.partN` exactly.
bare_tmp = tmp_path / "out.bin.tmp"
bare_tmp.write_bytes(b"bare leftover")
from cli.client import _reap_dead_pid_leftovers
_reap_dead_pid_leftovers(str(target))
assert live_path.exists(), "live-PID leftover must be preserved"
assert not dead_tmp.exists(), "dead-PID .tmp must be reaped"
assert not dead_part.exists(), "dead-PID .partN must be reaped"
assert bare_tmp.exists(), "bare-name leftover is out of scope for the reaper"
def test_reap_handles_garbage_in_filename(tmp_path):
"""Files in the parquet dir whose names happen to glob-match but
don't conform to the PID-suffix shape must be skipped without
raising."""
target = tmp_path / "out.bin"
weird = tmp_path / "out.bin.garbage.tmp"
weird.write_bytes(b"x")
from cli.client import _reap_dead_pid_leftovers
# Must not raise even though the filename has no integer PID.
_reap_dead_pid_leftovers(str(target))
assert weird.exists(), "non-PID-shaped file must not be reaped"