fix: devil's advocate R3 — reap PID-suffixed leftovers from dead processes
R3 final pass surfaced one issue, addressed:
R2#2 introduced PID-suffixed <target>.{pid}.tmp / .{pid}.partN to
prevent concurrent agnes pull invocations from yanking each other's
in-progress writes. The pre-clean inside _download_chunked /
_download_single_stream only deletes leftovers from the CURRENT
process's PID — files from a SIGKILL'd or crashed prior pull (any
other PID) are never touched and accumulate on disk forever.
Add _reap_dead_pid_leftovers(target_path) called at the start of both
download paths. Globs <target>.*.tmp / <target>.*.partN, extracts the
embedded PID, calls os.kill(pid, 0) to test liveness (POSIX standard
no-op probe), and unlinks files whose process no longer exists.
Permission-denied = process is alive but owned by another user → keep
the file (conservative). Windows users get the conservative 'keep'
default.
Two new tests pin the behavior — live-PID file preserved, dead-PID
.tmp + .partN reaped, bare-name (legacy) untouched, garbage filenames
skipped without raise.
This commit is contained in:
parent
aee585fac6
commit
77d88014df
2 changed files with 113 additions and 0 deletions
|
|
@ -1,7 +1,9 @@
|
|||
"""HTTP client wrapper for CLI — handles auth, retries, streaming."""
|
||||
|
||||
import atexit
|
||||
import glob
|
||||
import os
|
||||
import re
|
||||
import threading
|
||||
import time
|
||||
import traceback
|
||||
|
|
@ -14,6 +16,60 @@ import httpx
|
|||
|
||||
from cli.config import _config_dir, get_server_url, get_token
|
||||
|
||||
|
||||
# PID-suffixed tmp / part files — see `_download_chunked` and
|
||||
# `_download_single_stream`. We extract the embedded PID and reap any
|
||||
# leftover whose process is no longer alive on every pull. Without this,
|
||||
# every SIGKILL'd pull leaks files indefinitely (devil's-advocate R3
|
||||
# finding #1).
|
||||
_PID_SUFFIX_RE = re.compile(r"\.(\d+)\.(?:tmp|part\d+)$")
|
||||
|
||||
|
||||
def _is_pid_alive(pid: int) -> bool:
|
||||
"""Return True if a process with the given PID exists. POSIX-only;
|
||||
Windows users get the conservative `True` (file kept) which means
|
||||
no reaping but also no false-deletion of a live sibling."""
|
||||
if pid <= 0:
|
||||
return False
|
||||
try:
|
||||
# Signal 0 = no-op kill; raises ProcessLookupError when PID is
|
||||
# gone, PermissionError when the PID exists but isn't ours
|
||||
# (still alive, just owned by someone else — keep the file).
|
||||
os.kill(pid, 0)
|
||||
return True
|
||||
except ProcessLookupError:
|
||||
return False
|
||||
except PermissionError:
|
||||
return True
|
||||
except Exception:
|
||||
# Anything else (e.g. AttributeError on Windows where os.kill
|
||||
# exists but signal 0 isn't supported the same way): be
|
||||
# conservative and don't reap.
|
||||
return True
|
||||
|
||||
|
||||
def _reap_dead_pid_leftovers(target_path: str) -> None:
|
||||
"""Remove `<target>.{pid}.tmp` and `<target>.{pid}.partN` files
|
||||
whose embedded PID is no longer alive. Called at the start of every
|
||||
download to keep the parquet directory tidy across SIGKILL'd or
|
||||
crashed prior runs. Never raises — leaked file is preferable to
|
||||
failing the new pull on a permission error."""
|
||||
candidates = glob.glob(f"{target_path}.*.tmp") + glob.glob(f"{target_path}.*.part*")
|
||||
for path in candidates:
|
||||
m = _PID_SUFFIX_RE.search(path)
|
||||
if not m:
|
||||
continue
|
||||
try:
|
||||
pid = int(m.group(1))
|
||||
except ValueError:
|
||||
continue
|
||||
if _is_pid_alive(pid):
|
||||
continue
|
||||
try:
|
||||
os.unlink(path)
|
||||
except OSError:
|
||||
pass
|
||||
|
||||
# Retry policy for transient failures during stream downloads. Scoped to
|
||||
# network issues and 5xx — 4xx (auth, 404, 400) is NOT retried. Tunable via
|
||||
# env for tests; defaults sit in the "one flaky network blip" window.
|
||||
|
|
@ -425,6 +481,10 @@ def _download_chunked(
|
|||
caller's `<target>.tmp` and renamed atomically.
|
||||
"""
|
||||
target = Path(target_path)
|
||||
# Reap leftovers from previously SIGKILL'd / crashed pulls before we
|
||||
# start writing — without this, PID-suffixed files from dead PIDs
|
||||
# accumulate forever on disk (devil's-advocate R3 finding #1).
|
||||
_reap_dead_pid_leftovers(target_path)
|
||||
# Per-process tmp + part suffixes (devil's-advocate R2 finding #2):
|
||||
# if two `agnes pull` invocations target the same parquet
|
||||
# concurrently (e.g. SessionStart hook + manual run, or two
|
||||
|
|
@ -520,6 +580,9 @@ def _download_single_stream(
|
|||
) -> int:
|
||||
"""Original single-stream path with retry. Used when chunking is
|
||||
disabled (small file, no range support, or fallback after 200-on-Range)."""
|
||||
# Same dead-PID reap as `_download_chunked` so leftovers from
|
||||
# crashed prior pulls don't accumulate indefinitely.
|
||||
_reap_dead_pid_leftovers(target_path)
|
||||
# Per-process tmp suffix — same rationale as `_download_chunked`
|
||||
# (devil's-advocate R2 finding #2): concurrent `agnes pull`
|
||||
# invocations against the same target dir must not yank each
|
||||
|
|
|
|||
|
|
@ -347,3 +347,53 @@ def test_progress_callback_aggregates_across_chunks(tmp_path, monkeypatch):
|
|||
stream_download("/api/data/x/download", str(target),
|
||||
progress_callback=lambda n: advances.append(n))
|
||||
assert sum(advances) == len(body)
|
||||
|
||||
|
||||
def test_dead_pid_leftovers_are_reaped(tmp_path, monkeypatch):
|
||||
"""Devil's-advocate R3 finding #1: PID-suffixed `<target>.{pid}.tmp`
|
||||
and `.partN` files from a SIGKILL'd previous pull must be reaped on
|
||||
the next pull, otherwise they accumulate on disk indefinitely.
|
||||
|
||||
PID 1 (init) is always alive, so a file with pid=1 must NOT be
|
||||
reaped. PID 99999999 (~10⁸) is essentially guaranteed not-alive on
|
||||
any modern Linux/macOS — used as the dead-PID marker.
|
||||
"""
|
||||
target = tmp_path / "out.bin"
|
||||
|
||||
# Live-PID leftover (pid=1 = init, always alive). Must NOT be reaped.
|
||||
live_path = tmp_path / "out.bin.1.tmp"
|
||||
live_path.write_bytes(b"live process leftover")
|
||||
|
||||
# Dead-PID leftovers — both .tmp and .part0 forms.
|
||||
dead_tmp = tmp_path / "out.bin.99999999.tmp"
|
||||
dead_tmp.write_bytes(b"dead process leftover tmp")
|
||||
dead_part = tmp_path / "out.bin.99999999.part0"
|
||||
dead_part.write_bytes(b"dead process leftover part")
|
||||
|
||||
# Bare-name leftover (no PID suffix) — pre-existing pattern, NOT
|
||||
# touched by the new reaper. Reaper only matches `.{digits}.tmp`
|
||||
# / `.{digits}.partN` exactly.
|
||||
bare_tmp = tmp_path / "out.bin.tmp"
|
||||
bare_tmp.write_bytes(b"bare leftover")
|
||||
|
||||
from cli.client import _reap_dead_pid_leftovers
|
||||
_reap_dead_pid_leftovers(str(target))
|
||||
|
||||
assert live_path.exists(), "live-PID leftover must be preserved"
|
||||
assert not dead_tmp.exists(), "dead-PID .tmp must be reaped"
|
||||
assert not dead_part.exists(), "dead-PID .partN must be reaped"
|
||||
assert bare_tmp.exists(), "bare-name leftover is out of scope for the reaper"
|
||||
|
||||
|
||||
def test_reap_handles_garbage_in_filename(tmp_path):
|
||||
"""Files in the parquet dir whose names happen to glob-match but
|
||||
don't conform to the PID-suffix shape must be skipped without
|
||||
raising."""
|
||||
target = tmp_path / "out.bin"
|
||||
weird = tmp_path / "out.bin.garbage.tmp"
|
||||
weird.write_bytes(b"x")
|
||||
|
||||
from cli.client import _reap_dead_pid_leftovers
|
||||
# Must not raise even though the filename has no integer PID.
|
||||
_reap_dead_pid_leftovers(str(target))
|
||||
assert weird.exists(), "non-PID-shaped file must not be reaped"
|
||||
|
|
|
|||
Loading…
Reference in a new issue