fix: devil's advocate R3 — reap PID-suffixed leftovers from dead processes
R3 final pass surfaced one issue, addressed:
R2#2 introduced PID-suffixed <target>.{pid}.tmp / .{pid}.partN to
prevent concurrent agnes pull invocations from yanking each other's
in-progress writes. The pre-clean inside _download_chunked /
_download_single_stream only deletes leftovers from the CURRENT
process's PID — files from a SIGKILL'd or crashed prior pull (any
other PID) are never touched and accumulate on disk forever.
Add _reap_dead_pid_leftovers(target_path) called at the start of both
download paths. Globs <target>.*.tmp / <target>.*.partN, extracts the
embedded PID, calls os.kill(pid, 0) to test liveness (POSIX standard
no-op probe), and unlinks files whose process no longer exists.
Permission-denied = process is alive but owned by another user → keep
the file (conservative). Windows users get the conservative 'keep'
default.
Two new tests pin the behavior — live-PID file preserved, dead-PID
.tmp + .partN reaped, bare-name (legacy) untouched, garbage filenames
skipped without raise.
This commit is contained in:
parent
aee585fac6
commit
77d88014df
2 changed files with 113 additions and 0 deletions
|
|
@ -1,7 +1,9 @@
|
||||||
"""HTTP client wrapper for CLI — handles auth, retries, streaming."""
|
"""HTTP client wrapper for CLI — handles auth, retries, streaming."""
|
||||||
|
|
||||||
import atexit
|
import atexit
|
||||||
|
import glob
|
||||||
import os
|
import os
|
||||||
|
import re
|
||||||
import threading
|
import threading
|
||||||
import time
|
import time
|
||||||
import traceback
|
import traceback
|
||||||
|
|
@ -14,6 +16,60 @@ import httpx
|
||||||
|
|
||||||
from cli.config import _config_dir, get_server_url, get_token
|
from cli.config import _config_dir, get_server_url, get_token
|
||||||
|
|
||||||
|
|
||||||
|
# PID-suffixed tmp / part files — see `_download_chunked` and
|
||||||
|
# `_download_single_stream`. We extract the embedded PID and reap any
|
||||||
|
# leftover whose process is no longer alive on every pull. Without this,
|
||||||
|
# every SIGKILL'd pull leaks files indefinitely (devil's-advocate R3
|
||||||
|
# finding #1).
|
||||||
|
_PID_SUFFIX_RE = re.compile(r"\.(\d+)\.(?:tmp|part\d+)$")
|
||||||
|
|
||||||
|
|
||||||
|
def _is_pid_alive(pid: int) -> bool:
|
||||||
|
"""Return True if a process with the given PID exists. POSIX-only;
|
||||||
|
Windows users get the conservative `True` (file kept) which means
|
||||||
|
no reaping but also no false-deletion of a live sibling."""
|
||||||
|
if pid <= 0:
|
||||||
|
return False
|
||||||
|
try:
|
||||||
|
# Signal 0 = no-op kill; raises ProcessLookupError when PID is
|
||||||
|
# gone, PermissionError when the PID exists but isn't ours
|
||||||
|
# (still alive, just owned by someone else — keep the file).
|
||||||
|
os.kill(pid, 0)
|
||||||
|
return True
|
||||||
|
except ProcessLookupError:
|
||||||
|
return False
|
||||||
|
except PermissionError:
|
||||||
|
return True
|
||||||
|
except Exception:
|
||||||
|
# Anything else (e.g. AttributeError on Windows where os.kill
|
||||||
|
# exists but signal 0 isn't supported the same way): be
|
||||||
|
# conservative and don't reap.
|
||||||
|
return True
|
||||||
|
|
||||||
|
|
||||||
|
def _reap_dead_pid_leftovers(target_path: str) -> None:
|
||||||
|
"""Remove `<target>.{pid}.tmp` and `<target>.{pid}.partN` files
|
||||||
|
whose embedded PID is no longer alive. Called at the start of every
|
||||||
|
download to keep the parquet directory tidy across SIGKILL'd or
|
||||||
|
crashed prior runs. Never raises — leaked file is preferable to
|
||||||
|
failing the new pull on a permission error."""
|
||||||
|
candidates = glob.glob(f"{target_path}.*.tmp") + glob.glob(f"{target_path}.*.part*")
|
||||||
|
for path in candidates:
|
||||||
|
m = _PID_SUFFIX_RE.search(path)
|
||||||
|
if not m:
|
||||||
|
continue
|
||||||
|
try:
|
||||||
|
pid = int(m.group(1))
|
||||||
|
except ValueError:
|
||||||
|
continue
|
||||||
|
if _is_pid_alive(pid):
|
||||||
|
continue
|
||||||
|
try:
|
||||||
|
os.unlink(path)
|
||||||
|
except OSError:
|
||||||
|
pass
|
||||||
|
|
||||||
# Retry policy for transient failures during stream downloads. Scoped to
|
# Retry policy for transient failures during stream downloads. Scoped to
|
||||||
# network issues and 5xx — 4xx (auth, 404, 400) is NOT retried. Tunable via
|
# network issues and 5xx — 4xx (auth, 404, 400) is NOT retried. Tunable via
|
||||||
# env for tests; defaults sit in the "one flaky network blip" window.
|
# env for tests; defaults sit in the "one flaky network blip" window.
|
||||||
|
|
@ -425,6 +481,10 @@ def _download_chunked(
|
||||||
caller's `<target>.tmp` and renamed atomically.
|
caller's `<target>.tmp` and renamed atomically.
|
||||||
"""
|
"""
|
||||||
target = Path(target_path)
|
target = Path(target_path)
|
||||||
|
# Reap leftovers from previously SIGKILL'd / crashed pulls before we
|
||||||
|
# start writing — without this, PID-suffixed files from dead PIDs
|
||||||
|
# accumulate forever on disk (devil's-advocate R3 finding #1).
|
||||||
|
_reap_dead_pid_leftovers(target_path)
|
||||||
# Per-process tmp + part suffixes (devil's-advocate R2 finding #2):
|
# Per-process tmp + part suffixes (devil's-advocate R2 finding #2):
|
||||||
# if two `agnes pull` invocations target the same parquet
|
# if two `agnes pull` invocations target the same parquet
|
||||||
# concurrently (e.g. SessionStart hook + manual run, or two
|
# concurrently (e.g. SessionStart hook + manual run, or two
|
||||||
|
|
@ -520,6 +580,9 @@ def _download_single_stream(
|
||||||
) -> int:
|
) -> int:
|
||||||
"""Original single-stream path with retry. Used when chunking is
|
"""Original single-stream path with retry. Used when chunking is
|
||||||
disabled (small file, no range support, or fallback after 200-on-Range)."""
|
disabled (small file, no range support, or fallback after 200-on-Range)."""
|
||||||
|
# Same dead-PID reap as `_download_chunked` so leftovers from
|
||||||
|
# crashed prior pulls don't accumulate indefinitely.
|
||||||
|
_reap_dead_pid_leftovers(target_path)
|
||||||
# Per-process tmp suffix — same rationale as `_download_chunked`
|
# Per-process tmp suffix — same rationale as `_download_chunked`
|
||||||
# (devil's-advocate R2 finding #2): concurrent `agnes pull`
|
# (devil's-advocate R2 finding #2): concurrent `agnes pull`
|
||||||
# invocations against the same target dir must not yank each
|
# invocations against the same target dir must not yank each
|
||||||
|
|
|
||||||
|
|
@ -347,3 +347,53 @@ def test_progress_callback_aggregates_across_chunks(tmp_path, monkeypatch):
|
||||||
stream_download("/api/data/x/download", str(target),
|
stream_download("/api/data/x/download", str(target),
|
||||||
progress_callback=lambda n: advances.append(n))
|
progress_callback=lambda n: advances.append(n))
|
||||||
assert sum(advances) == len(body)
|
assert sum(advances) == len(body)
|
||||||
|
|
||||||
|
|
||||||
|
def test_dead_pid_leftovers_are_reaped(tmp_path, monkeypatch):
|
||||||
|
"""Devil's-advocate R3 finding #1: PID-suffixed `<target>.{pid}.tmp`
|
||||||
|
and `.partN` files from a SIGKILL'd previous pull must be reaped on
|
||||||
|
the next pull, otherwise they accumulate on disk indefinitely.
|
||||||
|
|
||||||
|
PID 1 (init) is always alive, so a file with pid=1 must NOT be
|
||||||
|
reaped. PID 99999999 (~10⁸) is essentially guaranteed not-alive on
|
||||||
|
any modern Linux/macOS — used as the dead-PID marker.
|
||||||
|
"""
|
||||||
|
target = tmp_path / "out.bin"
|
||||||
|
|
||||||
|
# Live-PID leftover (pid=1 = init, always alive). Must NOT be reaped.
|
||||||
|
live_path = tmp_path / "out.bin.1.tmp"
|
||||||
|
live_path.write_bytes(b"live process leftover")
|
||||||
|
|
||||||
|
# Dead-PID leftovers — both .tmp and .part0 forms.
|
||||||
|
dead_tmp = tmp_path / "out.bin.99999999.tmp"
|
||||||
|
dead_tmp.write_bytes(b"dead process leftover tmp")
|
||||||
|
dead_part = tmp_path / "out.bin.99999999.part0"
|
||||||
|
dead_part.write_bytes(b"dead process leftover part")
|
||||||
|
|
||||||
|
# Bare-name leftover (no PID suffix) — pre-existing pattern, NOT
|
||||||
|
# touched by the new reaper. Reaper only matches `.{digits}.tmp`
|
||||||
|
# / `.{digits}.partN` exactly.
|
||||||
|
bare_tmp = tmp_path / "out.bin.tmp"
|
||||||
|
bare_tmp.write_bytes(b"bare leftover")
|
||||||
|
|
||||||
|
from cli.client import _reap_dead_pid_leftovers
|
||||||
|
_reap_dead_pid_leftovers(str(target))
|
||||||
|
|
||||||
|
assert live_path.exists(), "live-PID leftover must be preserved"
|
||||||
|
assert not dead_tmp.exists(), "dead-PID .tmp must be reaped"
|
||||||
|
assert not dead_part.exists(), "dead-PID .partN must be reaped"
|
||||||
|
assert bare_tmp.exists(), "bare-name leftover is out of scope for the reaper"
|
||||||
|
|
||||||
|
|
||||||
|
def test_reap_handles_garbage_in_filename(tmp_path):
|
||||||
|
"""Files in the parquet dir whose names happen to glob-match but
|
||||||
|
don't conform to the PID-suffix shape must be skipped without
|
||||||
|
raising."""
|
||||||
|
target = tmp_path / "out.bin"
|
||||||
|
weird = tmp_path / "out.bin.garbage.tmp"
|
||||||
|
weird.write_bytes(b"x")
|
||||||
|
|
||||||
|
from cli.client import _reap_dead_pid_leftovers
|
||||||
|
# Must not raise even though the filename has no integer PID.
|
||||||
|
_reap_dead_pid_leftovers(str(target))
|
||||||
|
assert weird.exists(), "non-PID-shaped file must not be reaped"
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue