From 77d88014df9a9926e08b0fd17b0c866170f689c1 Mon Sep 17 00:00:00 2001 From: ZdenekSrotyr Date: Wed, 6 May 2026 14:04:47 +0200 Subject: [PATCH] =?UTF-8?q?fix:=20devil's=20advocate=20R3=20=E2=80=94=20re?= =?UTF-8?q?ap=20PID-suffixed=20leftovers=20from=20dead=20processes?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit R3 final pass surfaced one issue, addressed: R2#2 introduced PID-suffixed .{pid}.tmp / .{pid}.partN to prevent concurrent agnes pull invocations from yanking each other's in-progress writes. The pre-clean inside _download_chunked / _download_single_stream only deletes leftovers from the CURRENT process's PID — files from a SIGKILL'd or crashed prior pull (any other PID) are never touched and accumulate on disk forever. Add _reap_dead_pid_leftovers(target_path) called at the start of both download paths. Globs .*.tmp / .*.partN, extracts the embedded PID, calls os.kill(pid, 0) to test liveness (POSIX standard no-op probe), and unlinks files whose process no longer exists. Permission-denied = process is alive but owned by another user → keep the file (conservative). Windows users get the conservative 'keep' default. Two new tests pin the behavior — live-PID file preserved, dead-PID .tmp + .partN reaped, bare-name (legacy) untouched, garbage filenames skipped without raise. --- cli/client.py | 63 ++++++++++++++++++++++++++++++++++++++ tests/test_pull_chunked.py | 50 ++++++++++++++++++++++++++++++ 2 files changed, 113 insertions(+) diff --git a/cli/client.py b/cli/client.py index 257efa7..069ce3d 100644 --- a/cli/client.py +++ b/cli/client.py @@ -1,7 +1,9 @@ """HTTP client wrapper for CLI — handles auth, retries, streaming.""" import atexit +import glob import os +import re import threading import time import traceback @@ -14,6 +16,60 @@ import httpx from cli.config import _config_dir, get_server_url, get_token + +# PID-suffixed tmp / part files — see `_download_chunked` and +# `_download_single_stream`. We extract the embedded PID and reap any +# leftover whose process is no longer alive on every pull. Without this, +# every SIGKILL'd pull leaks files indefinitely (devil's-advocate R3 +# finding #1). +_PID_SUFFIX_RE = re.compile(r"\.(\d+)\.(?:tmp|part\d+)$") + + +def _is_pid_alive(pid: int) -> bool: + """Return True if a process with the given PID exists. POSIX-only; + Windows users get the conservative `True` (file kept) which means + no reaping but also no false-deletion of a live sibling.""" + if pid <= 0: + return False + try: + # Signal 0 = no-op kill; raises ProcessLookupError when PID is + # gone, PermissionError when the PID exists but isn't ours + # (still alive, just owned by someone else — keep the file). + os.kill(pid, 0) + return True + except ProcessLookupError: + return False + except PermissionError: + return True + except Exception: + # Anything else (e.g. AttributeError on Windows where os.kill + # exists but signal 0 isn't supported the same way): be + # conservative and don't reap. + return True + + +def _reap_dead_pid_leftovers(target_path: str) -> None: + """Remove `.{pid}.tmp` and `.{pid}.partN` files + whose embedded PID is no longer alive. Called at the start of every + download to keep the parquet directory tidy across SIGKILL'd or + crashed prior runs. Never raises — leaked file is preferable to + failing the new pull on a permission error.""" + candidates = glob.glob(f"{target_path}.*.tmp") + glob.glob(f"{target_path}.*.part*") + for path in candidates: + m = _PID_SUFFIX_RE.search(path) + if not m: + continue + try: + pid = int(m.group(1)) + except ValueError: + continue + if _is_pid_alive(pid): + continue + try: + os.unlink(path) + except OSError: + pass + # Retry policy for transient failures during stream downloads. Scoped to # network issues and 5xx — 4xx (auth, 404, 400) is NOT retried. Tunable via # env for tests; defaults sit in the "one flaky network blip" window. @@ -425,6 +481,10 @@ def _download_chunked( caller's `.tmp` and renamed atomically. """ target = Path(target_path) + # Reap leftovers from previously SIGKILL'd / crashed pulls before we + # start writing — without this, PID-suffixed files from dead PIDs + # accumulate forever on disk (devil's-advocate R3 finding #1). + _reap_dead_pid_leftovers(target_path) # Per-process tmp + part suffixes (devil's-advocate R2 finding #2): # if two `agnes pull` invocations target the same parquet # concurrently (e.g. SessionStart hook + manual run, or two @@ -520,6 +580,9 @@ def _download_single_stream( ) -> int: """Original single-stream path with retry. Used when chunking is disabled (small file, no range support, or fallback after 200-on-Range).""" + # Same dead-PID reap as `_download_chunked` so leftovers from + # crashed prior pulls don't accumulate indefinitely. + _reap_dead_pid_leftovers(target_path) # Per-process tmp suffix — same rationale as `_download_chunked` # (devil's-advocate R2 finding #2): concurrent `agnes pull` # invocations against the same target dir must not yank each diff --git a/tests/test_pull_chunked.py b/tests/test_pull_chunked.py index 3629688..87edd17 100644 --- a/tests/test_pull_chunked.py +++ b/tests/test_pull_chunked.py @@ -347,3 +347,53 @@ def test_progress_callback_aggregates_across_chunks(tmp_path, monkeypatch): stream_download("/api/data/x/download", str(target), progress_callback=lambda n: advances.append(n)) assert sum(advances) == len(body) + + +def test_dead_pid_leftovers_are_reaped(tmp_path, monkeypatch): + """Devil's-advocate R3 finding #1: PID-suffixed `.{pid}.tmp` + and `.partN` files from a SIGKILL'd previous pull must be reaped on + the next pull, otherwise they accumulate on disk indefinitely. + + PID 1 (init) is always alive, so a file with pid=1 must NOT be + reaped. PID 99999999 (~10⁸) is essentially guaranteed not-alive on + any modern Linux/macOS — used as the dead-PID marker. + """ + target = tmp_path / "out.bin" + + # Live-PID leftover (pid=1 = init, always alive). Must NOT be reaped. + live_path = tmp_path / "out.bin.1.tmp" + live_path.write_bytes(b"live process leftover") + + # Dead-PID leftovers — both .tmp and .part0 forms. + dead_tmp = tmp_path / "out.bin.99999999.tmp" + dead_tmp.write_bytes(b"dead process leftover tmp") + dead_part = tmp_path / "out.bin.99999999.part0" + dead_part.write_bytes(b"dead process leftover part") + + # Bare-name leftover (no PID suffix) — pre-existing pattern, NOT + # touched by the new reaper. Reaper only matches `.{digits}.tmp` + # / `.{digits}.partN` exactly. + bare_tmp = tmp_path / "out.bin.tmp" + bare_tmp.write_bytes(b"bare leftover") + + from cli.client import _reap_dead_pid_leftovers + _reap_dead_pid_leftovers(str(target)) + + assert live_path.exists(), "live-PID leftover must be preserved" + assert not dead_tmp.exists(), "dead-PID .tmp must be reaped" + assert not dead_part.exists(), "dead-PID .partN must be reaped" + assert bare_tmp.exists(), "bare-name leftover is out of scope for the reaper" + + +def test_reap_handles_garbage_in_filename(tmp_path): + """Files in the parquet dir whose names happen to glob-match but + don't conform to the PID-suffix shape must be skipped without + raising.""" + target = tmp_path / "out.bin" + weird = tmp_path / "out.bin.garbage.tmp" + weird.write_bytes(b"x") + + from cli.client import _reap_dead_pid_leftovers + # Must not raise even though the filename has no integer PID. + _reap_dead_pid_leftovers(str(target)) + assert weird.exists(), "non-PID-shaped file must not be reaped"