R1 adversarial review surfaced 5 issues, all addressed: #1 chunked download silently disabled in non-Caddy deployments (HEAD on GET-only FastAPI route returns 405). _probe_range_support now falls back to GET with Range: bytes=0-0 when HEAD fails — works against both Caddy file_server (HEAD-friendly) and dev FastAPI direct (GET-only). #2 parse-error fallback heuristic too broad — matched on Unrecognized name / Function not found / No matching signature / Invalid cast, which BQ surfaces for ordinary user-column typos. That triggered slow ATTACH-catalog retry on every typo (2× latency tax). Narrowed to just 'Syntax error' / 'syntax error' which are the genuine DuckDB-vs-BQ dialect mismatch markers. #3 apply_bq_session_settings was only run on fresh-built pool entries, not on reuse. An operator's /admin/server-config change to bq_query _timeout_ms wouldn't propagate to long-lived pooled sessions until restart. Fixed: re-apply on every pool acquire (idempotent + fail-soft). #4 content-length sanity bound — a misconfigured proxy returning a wildly inflated Content-Length would cause overlapping chunked Range requests against the actual file → corrupt assembled output (caught by manifest hash check, but only after wasted bandwidth). Cap at 100 GiB; above that, drop to single-stream. #5 rewriter assumed every BQ row resolves under the single bq.projects.data project. Bucket containing '.' suggests a project- qualified bucket (multi-project deployment); rewriter would silently target the wrong project. Conservative skip with regression test.
640 lines
26 KiB
Python
640 lines
26 KiB
Python
"""HTTP client wrapper for CLI — handles auth, retries, streaming."""
|
||
|
||
import atexit
|
||
import os
|
||
import threading
|
||
import time
|
||
import traceback
|
||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||
from datetime import datetime, timezone
|
||
from pathlib import Path
|
||
from typing import Optional
|
||
|
||
import httpx
|
||
|
||
from cli.config import _config_dir, get_server_url, get_token
|
||
|
||
# Retry policy for transient failures during stream downloads. Scoped to
|
||
# network issues and 5xx — 4xx (auth, 404, 400) is NOT retried. Tunable via
|
||
# env for tests; defaults sit in the "one flaky network blip" window.
|
||
_RETRY_ATTEMPTS = int(os.environ.get("AGNES_STREAM_RETRIES", "3"))
|
||
_RETRY_BACKOFFS_S = (0.3, 1.0, 3.0) # seconds before attempt 2, 3, 4
|
||
|
||
# Long-running query timeout. /api/query forwards to BigQuery for remote
|
||
# tables, where SELECTs routinely run for minutes. The default 30s HTTP
|
||
# timeout dies long before BQ finishes. Operators tune via AGNES_QUERY_TIMEOUT.
|
||
QUERY_TIMEOUT_S = float(os.environ.get("AGNES_QUERY_TIMEOUT", "300"))
|
||
|
||
# Range-chunked parallel download — see `stream_download` docstring. Defaults
|
||
# tuned for the corp-VPN per-flow rate-limiting case (single-stream throttled
|
||
# but N parallel range requests scale linearly). Disabled implicitly for
|
||
# files below the threshold or when the server doesn't advertise byte-range
|
||
# support. Operators can hard-disable by setting parallelism to 1.
|
||
_CHUNK_PARALLELISM = max(1, min(16, int(
|
||
os.environ.get("AGNES_PULL_CHUNK_PARALLELISM", "4"),
|
||
)))
|
||
_CHUNK_THRESHOLD_BYTES = int(
|
||
os.environ.get("AGNES_PULL_CHUNK_THRESHOLD_BYTES", str(50 * 1024 * 1024)),
|
||
)
|
||
|
||
|
||
# ── Transport-error translation ─────────────────────────────────────────
|
||
# Pavel's Issue #185 Phase 3B caught the failure mode: when httpx raises
|
||
# `ReadTimeout` / `ConnectError` / `RemoteProtocolError` and the CLI
|
||
# command doesn't catch it, Typer dumps a five-frame Python traceback to
|
||
# the analyst's terminal. That looks like a CLI bug to a non-Python user
|
||
# and obscures the actionable signal ("server slow, try snapshot create").
|
||
# Translate transport exceptions to `AgnesTransportError` with a typed
|
||
# user-facing message, log the full traceback to `~/.config/agnes/last-
|
||
# error.log` for debug, and let the top-level CLI handler render the
|
||
# clean message + exit non-zero.
|
||
|
||
_LOG_FILE = _config_dir() / "last-error.log"
|
||
|
||
|
||
class AgnesTransportError(Exception):
|
||
"""Network / transport failure with a user-actionable message.
|
||
|
||
Raised by the api_* / stream_download helpers when httpx surfaces a
|
||
connection / timeout / protocol error. The CLI's top-level Typer
|
||
handler catches this, prints `.user_message` (NOT the traceback),
|
||
and exits non-zero. Full traceback goes to ``~/.config/agnes/last-
|
||
error.log`` so an operator can recover it for support.
|
||
"""
|
||
|
||
def __init__(self, user_message: str, *, hint: str = "", logfile_path: Path | None = None):
|
||
super().__init__(user_message)
|
||
self.user_message = user_message
|
||
self.hint = hint
|
||
self.logfile_path = logfile_path
|
||
|
||
|
||
def _log_traceback(exc: BaseException, *, context: str) -> Path:
|
||
"""Append a timestamped traceback to ``~/.config/agnes/last-error.log``
|
||
and return the path. Best-effort — never raises (a logging failure
|
||
must not mask the original error)."""
|
||
try:
|
||
with open(_LOG_FILE, "a", encoding="utf-8") as f:
|
||
ts = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
|
||
f.write(f"\n=== {ts} {context} ===\n")
|
||
traceback.print_exception(type(exc), exc, exc.__traceback__, file=f)
|
||
except Exception:
|
||
pass
|
||
return _LOG_FILE
|
||
|
||
|
||
def _translate_transport_error(
|
||
exc: Exception, *, context: str, timeout_s: float | None = None,
|
||
) -> AgnesTransportError:
|
||
"""Map httpx transport exceptions to user-facing CLI messages. The
|
||
mapping is intentionally pragmatic — analysts care about "what do I
|
||
do next", not the gRPC / TCP detail.
|
||
|
||
`timeout_s`, when supplied, is the actual httpx timeout used by the
|
||
failing call so the ReadTimeout message reports the real wait window
|
||
(a `agnes catalog` GET dies at 30s, not 300s — Devin Review on PR
|
||
#188 caught the original signature hardcoding `QUERY_TIMEOUT_S`,
|
||
which only matches `agnes query --remote`)."""
|
||
log = _log_traceback(exc, context=context)
|
||
if isinstance(exc, httpx.ReadTimeout):
|
||
wait_s = timeout_s if timeout_s is not None else QUERY_TIMEOUT_S
|
||
# The "long-running BQ" advisory only makes sense when the call
|
||
# actually hit the query path (timeout ≥ ~60s). For short calls
|
||
# (the 30s default on `agnes catalog` etc.) it's just confusing.
|
||
if wait_s >= 60:
|
||
hint = (
|
||
"If this is `agnes query --remote` against a heavy BQ view, "
|
||
"the underlying BQ job took longer than the wait window. Try:\n"
|
||
" • narrow the WHERE (especially the partition column from `agnes catalog --json`)\n"
|
||
" • `agnes snapshot create <table> ... --estimate` to materialize once + query locally\n"
|
||
" • set AGNES_QUERY_TIMEOUT=600 for a longer client-side wait\n"
|
||
f"Full traceback: {log}"
|
||
)
|
||
else:
|
||
hint = (
|
||
"Server is slow or unreachable. Check `agnes status`; "
|
||
"re-run if transient.\n"
|
||
f"Full traceback: {log}"
|
||
)
|
||
return AgnesTransportError(
|
||
f"Server didn't respond within the read timeout ({wait_s:.0f}s) "
|
||
f"for {context}.",
|
||
hint=hint,
|
||
logfile_path=log,
|
||
)
|
||
if isinstance(exc, httpx.ConnectError):
|
||
return AgnesTransportError(
|
||
f"Can't reach the agnes server for {context}.",
|
||
hint=(
|
||
"Check the server URL with `agnes status`, network reachability "
|
||
"(VPN / DNS / firewall), and the TLS-trust setup if this is a "
|
||
f"corporate-CA deployment.\nFull traceback: {log}"
|
||
),
|
||
logfile_path=log,
|
||
)
|
||
if isinstance(exc, (httpx.RemoteProtocolError, httpx.ReadError, httpx.WriteError)):
|
||
return AgnesTransportError(
|
||
f"Connection broke mid-flight on {context}.",
|
||
hint=(
|
||
"Usually a transient network blip. Re-run the command. If it "
|
||
f"keeps happening, check `agnes status`.\nFull traceback: {log}"
|
||
),
|
||
logfile_path=log,
|
||
)
|
||
if isinstance(exc, httpx.TimeoutException):
|
||
return AgnesTransportError(
|
||
f"Network timeout on {context}.",
|
||
hint=f"Re-run; if persistent, check the server.\nFull traceback: {log}",
|
||
logfile_path=log,
|
||
)
|
||
# Anything else: re-wrap with a generic message so the CLI doesn't
|
||
# dump the traceback. We'd prefer a typed translation; if you hit
|
||
# this branch, add a clause above.
|
||
return AgnesTransportError(
|
||
f"Unexpected error on {context}: {type(exc).__name__}.",
|
||
hint=f"Full traceback: {log}",
|
||
logfile_path=log,
|
||
)
|
||
|
||
|
||
def get_client(timeout: float = 30.0) -> httpx.Client:
|
||
"""Get an authenticated httpx client.
|
||
|
||
This factory creates a fresh client per call — used by the small
|
||
`api_*` helpers (one request, then close). The big-stream path
|
||
(`stream_download`) routes through `_get_shared_client()` to amortize
|
||
TLS handshakes and HTTP/2 multiplexing across N parquet downloads.
|
||
"""
|
||
token = get_token()
|
||
headers = {}
|
||
if token:
|
||
headers["Authorization"] = f"Bearer {token}"
|
||
return httpx.Client(
|
||
base_url=get_server_url(),
|
||
headers=headers,
|
||
timeout=timeout,
|
||
)
|
||
|
||
|
||
# ── Shared persistent client ────────────────────────────────────────────
|
||
# `agnes pull` issues N stream_download calls — one per parquet — plus
|
||
# (with chunked downloads) M Range requests per file. Without pooling,
|
||
# each call performs a fresh TLS handshake; with HTTP/2 enabled, all
|
||
# those requests multiplex over a single TCP connection. The shared
|
||
# client is created lazily on first stream-download request, kept alive
|
||
# for the duration of the process, and closed at exit.
|
||
#
|
||
# HTTP/2 requires the optional `h2` package. If it's unavailable (slim
|
||
# install), we fall back to HTTP/1.1 — pooling alone still saves the
|
||
# handshake cost — and never raise. The CLI must not crash on `agnes
|
||
# pull` because of an h2 import error.
|
||
|
||
_SHARED_CLIENT: Optional[httpx.Client] = None
|
||
_SHARED_CLIENT_LOCK = threading.Lock()
|
||
|
||
|
||
def _get_shared_client() -> httpx.Client:
|
||
"""Lazily create + return a process-wide httpx.Client.
|
||
|
||
Pool defaults: keep up to 32 keepalive connections (covers the
|
||
chunk-parallelism cap of 16 × 2 simultaneous files comfortably) and
|
||
cap the total at 64 so a runaway loop can't open thousands of
|
||
sockets. HTTP/2 is opt-in via httpx's `http2=True` and gracefully
|
||
degrades when the `h2` extra is missing.
|
||
"""
|
||
global _SHARED_CLIENT
|
||
with _SHARED_CLIENT_LOCK:
|
||
if _SHARED_CLIENT is not None:
|
||
return _SHARED_CLIENT
|
||
token = get_token()
|
||
headers = {}
|
||
if token:
|
||
headers["Authorization"] = f"Bearer {token}"
|
||
limits = httpx.Limits(
|
||
max_keepalive_connections=32,
|
||
max_connections=64,
|
||
)
|
||
try:
|
||
client = httpx.Client(
|
||
base_url=get_server_url(),
|
||
headers=headers,
|
||
timeout=300.0,
|
||
http2=True,
|
||
limits=limits,
|
||
)
|
||
except (ImportError, RuntimeError):
|
||
# `h2` not installed → httpx raises; fall back to HTTP/1.1.
|
||
# Pooling alone still amortizes the TLS handshake.
|
||
client = httpx.Client(
|
||
base_url=get_server_url(),
|
||
headers=headers,
|
||
timeout=300.0,
|
||
limits=limits,
|
||
)
|
||
_SHARED_CLIENT = client
|
||
return client
|
||
|
||
|
||
def _close_shared_client() -> None:
|
||
"""Close the shared client and clear the slot. Safe to call twice."""
|
||
global _SHARED_CLIENT
|
||
with _SHARED_CLIENT_LOCK:
|
||
if _SHARED_CLIENT is not None:
|
||
try:
|
||
_SHARED_CLIENT.close()
|
||
except Exception:
|
||
pass
|
||
_SHARED_CLIENT = None
|
||
|
||
|
||
atexit.register(_close_shared_client)
|
||
|
||
|
||
def api_get(path: str, *, timeout: float = 30.0, **kwargs) -> httpx.Response:
|
||
try:
|
||
with get_client(timeout=timeout) as client:
|
||
return client.get(path, **kwargs)
|
||
except httpx.HTTPError as exc:
|
||
raise _translate_transport_error(exc, context=f"GET {path}", timeout_s=timeout) from exc
|
||
|
||
|
||
def api_post(path: str, *, timeout: float = 30.0, **kwargs) -> httpx.Response:
|
||
try:
|
||
with get_client(timeout=timeout) as client:
|
||
return client.post(path, **kwargs)
|
||
except httpx.HTTPError as exc:
|
||
raise _translate_transport_error(exc, context=f"POST {path}", timeout_s=timeout) from exc
|
||
|
||
|
||
def api_delete(path: str, *, timeout: float = 30.0, **kwargs) -> httpx.Response:
|
||
try:
|
||
with get_client(timeout=timeout) as client:
|
||
return client.delete(path, **kwargs)
|
||
except httpx.HTTPError as exc:
|
||
raise _translate_transport_error(exc, context=f"DELETE {path}", timeout_s=timeout) from exc
|
||
|
||
|
||
def api_patch(path: str, *, timeout: float = 30.0, **kwargs) -> httpx.Response:
|
||
try:
|
||
with get_client(timeout=timeout) as client:
|
||
return client.patch(path, **kwargs)
|
||
except httpx.HTTPError as exc:
|
||
raise _translate_transport_error(exc, context=f"PATCH {path}", timeout_s=timeout) from exc
|
||
|
||
|
||
def _is_transient(exc: Exception) -> bool:
|
||
"""Worth retrying? Network blip or 5xx — yes. Auth / 4xx — no."""
|
||
if isinstance(exc, (httpx.ConnectError, httpx.ReadError, httpx.WriteError,
|
||
httpx.RemoteProtocolError, httpx.TimeoutException)):
|
||
return True
|
||
if isinstance(exc, httpx.HTTPStatusError):
|
||
return 500 <= exc.response.status_code < 600
|
||
return False
|
||
|
||
|
||
def _read_chunk_threshold_bytes() -> int:
|
||
"""Re-read threshold each call so tests / operators can flip it via
|
||
env var without restarting the process."""
|
||
try:
|
||
return int(os.environ.get(
|
||
"AGNES_PULL_CHUNK_THRESHOLD_BYTES", str(_CHUNK_THRESHOLD_BYTES),
|
||
))
|
||
except ValueError:
|
||
return _CHUNK_THRESHOLD_BYTES
|
||
|
||
|
||
def _read_chunk_parallelism() -> int:
|
||
"""Re-read parallelism each call (same rationale as threshold). Floor 1,
|
||
ceiling 16."""
|
||
try:
|
||
n = int(os.environ.get(
|
||
"AGNES_PULL_CHUNK_PARALLELISM", str(_CHUNK_PARALLELISM),
|
||
))
|
||
except ValueError:
|
||
n = _CHUNK_PARALLELISM
|
||
return max(1, min(16, n))
|
||
|
||
|
||
def _probe_range_support(client: httpx.Client, path: str) -> tuple[int, bool]:
|
||
"""Send HEAD; return (content-length, accepts-byte-ranges).
|
||
|
||
`(0, False)` means "we couldn't tell — fall back to single-stream".
|
||
Never raises; transport errors during the probe are treated as
|
||
"no chunking, try the GET instead and let it surface the failure
|
||
in the normal retry loop".
|
||
|
||
Probe order: HEAD first (cheap, idempotent), then GET-with-tiny-range
|
||
fallback. The HEAD path covers Caddy's `file_server` (which advertises
|
||
HEAD) and Caddy's `reverse_proxy` (which forwards HEAD upstream). The
|
||
GET-fallback covers the dev `docker compose up` deployment where
|
||
requests go straight to FastAPI's GET-only `/api/data/{tid}/download`
|
||
route — FastAPI returns **405 Method Not Allowed** to a HEAD on a
|
||
GET-only route, which without this fallback would silently disable
|
||
chunked download for every dev / non-TLS install. The GET-with-Range
|
||
probe asks for 1 byte so the server response is bounded; we discard
|
||
the body and read only the headers + status code.
|
||
"""
|
||
try:
|
||
resp = client.head(path)
|
||
status = getattr(resp, "status_code", 200)
|
||
if status < 400:
|
||
size = int(resp.headers.get("content-length", "0") or 0)
|
||
accepts = (resp.headers.get("accept-ranges", "").lower() == "bytes")
|
||
if size > 0:
|
||
return (size, accepts)
|
||
# HEAD failed (405 from GET-only route is the common case in
|
||
# non-Caddy deployments) or returned 0-length — fall through to
|
||
# the tiny-Range GET probe.
|
||
except Exception:
|
||
pass
|
||
try:
|
||
with client.stream("GET", path, headers={"Range": "bytes=0-0"}) as resp:
|
||
status = getattr(resp, "status_code", 0)
|
||
if status not in (200, 206):
|
||
return (0, False)
|
||
# Drain the 1-byte body so the connection is reusable.
|
||
for _ in resp.iter_bytes():
|
||
pass
|
||
# Content-Range on a 206 response carries the total: `bytes 0-0/12345`.
|
||
# On a 200 response the server didn't honor Range — content-length is the total.
|
||
if status == 206:
|
||
cr = resp.headers.get("content-range", "")
|
||
if "/" in cr:
|
||
try:
|
||
total = int(cr.rsplit("/", 1)[1])
|
||
return (total, True)
|
||
except ValueError:
|
||
return (0, False)
|
||
return (0, False)
|
||
# status == 200 → server ignored Range; we can read content-length but
|
||
# accept-ranges is False (or missing) so the caller will not chunk.
|
||
size = int(resp.headers.get("content-length", "0") or 0)
|
||
accepts = (resp.headers.get("accept-ranges", "").lower() == "bytes")
|
||
return (size, accepts)
|
||
except Exception:
|
||
return (0, False)
|
||
|
||
|
||
class _RangeNotHonored(Exception):
|
||
"""Internal sentinel — server returned 200 instead of 206 to a Range
|
||
request. Caller catches and falls back to the single-stream path."""
|
||
|
||
|
||
def _download_chunk(
|
||
client: httpx.Client,
|
||
path: str,
|
||
start: int,
|
||
end: int,
|
||
part_path: Path,
|
||
progress_callback,
|
||
) -> None:
|
||
"""Stream `bytes=start-end` to `part_path`. Caller deals with retry +
|
||
cleanup. Raises on any failure (HTTPStatusError on non-206 response,
|
||
httpx.* on transport blip, `_RangeNotHonored` if server returned 200
|
||
instead of 206 — chunked path can't trust that result)."""
|
||
headers = {"Range": f"bytes={start}-{end}"}
|
||
with client.stream("GET", path, headers=headers) as response:
|
||
# Server didn't honor the Range — RFC says it MAY return 200 with
|
||
# the full body. We can't safely splice that into one part of N,
|
||
# so we abort the whole chunked path and let the caller fall back.
|
||
if response.status_code == 200:
|
||
raise _RangeNotHonored()
|
||
response.raise_for_status()
|
||
with open(part_path, "wb") as f:
|
||
for piece in response.iter_bytes(chunk_size=65536):
|
||
f.write(piece)
|
||
if progress_callback and piece:
|
||
progress_callback(len(piece))
|
||
|
||
|
||
def _download_chunked(
|
||
client: httpx.Client,
|
||
path: str,
|
||
target_path: str,
|
||
total_size: int,
|
||
parallelism: int,
|
||
progress_callback,
|
||
) -> int:
|
||
"""Range-based parallel download. Returns total bytes written.
|
||
|
||
Raises `_RangeNotHonored` on the first 200-instead-of-206 response so
|
||
the caller can fall back. All other exceptions propagate.
|
||
|
||
Cleanup discipline: every part file we create gets removed before
|
||
return (success or failure). The destination is written via the
|
||
caller's `<target>.tmp` and renamed atomically.
|
||
"""
|
||
target = Path(target_path)
|
||
tmp_path = Path(f"{target_path}.tmp")
|
||
parallelism = max(1, parallelism)
|
||
# Build chunks — last chunk takes the remainder.
|
||
chunk_size = total_size // parallelism
|
||
if chunk_size <= 0:
|
||
chunk_size = total_size # tiny file, single chunk
|
||
parallelism = 1
|
||
ranges = []
|
||
for i in range(parallelism):
|
||
start = i * chunk_size
|
||
end = (start + chunk_size - 1) if i < parallelism - 1 else (total_size - 1)
|
||
ranges.append((i, start, end))
|
||
|
||
part_paths = [Path(f"{target_path}.part{i}") for i, _, _ in ranges]
|
||
# Pre-clean any leftovers from a prior run.
|
||
for p in part_paths:
|
||
p.unlink(missing_ok=True)
|
||
|
||
def _attempt_chunk(i: int, start: int, end: int) -> None:
|
||
last_exc: Optional[Exception] = None
|
||
for attempt in range(_RETRY_ATTEMPTS + 1):
|
||
try:
|
||
_download_chunk(
|
||
client, path, start, end, part_paths[i],
|
||
progress_callback,
|
||
)
|
||
return
|
||
except _RangeNotHonored:
|
||
# Don't retry — server policy, not a transport blip.
|
||
raise
|
||
except Exception as exc:
|
||
last_exc = exc
|
||
if attempt == _RETRY_ATTEMPTS or not _is_transient(exc):
|
||
break
|
||
time.sleep(_RETRY_BACKOFFS_S[
|
||
min(attempt, len(_RETRY_BACKOFFS_S) - 1)
|
||
])
|
||
assert last_exc is not None
|
||
raise last_exc
|
||
|
||
try:
|
||
if parallelism == 1:
|
||
_attempt_chunk(*ranges[0])
|
||
else:
|
||
# Use a thread pool so each chunk gets its own concurrent
|
||
# request slot on the (HTTP/2-multiplexed when available)
|
||
# shared client. httpx.Client is thread-safe for stream().
|
||
with ThreadPoolExecutor(max_workers=parallelism) as ex:
|
||
futs = [ex.submit(_attempt_chunk, *r) for r in ranges]
|
||
for fut in as_completed(futs):
|
||
fut.result() # propagate first error
|
||
|
||
# Concatenate parts → tmp_path → atomic rename.
|
||
tmp_path.unlink(missing_ok=True)
|
||
total_written = 0
|
||
with open(tmp_path, "wb") as out:
|
||
for p in part_paths:
|
||
with open(p, "rb") as inp:
|
||
while True:
|
||
block = inp.read(65536)
|
||
if not block:
|
||
break
|
||
out.write(block)
|
||
total_written += len(block)
|
||
os.replace(tmp_path, target)
|
||
return total_written
|
||
finally:
|
||
# Always clean up part files + any stray tmp.
|
||
for p in part_paths:
|
||
p.unlink(missing_ok=True)
|
||
if tmp_path.exists():
|
||
try:
|
||
tmp_path.unlink()
|
||
except OSError:
|
||
pass
|
||
|
||
|
||
def _download_single_stream(
|
||
client: httpx.Client,
|
||
path: str,
|
||
target_path: str,
|
||
progress_callback,
|
||
) -> int:
|
||
"""Original single-stream path with retry. Used when chunking is
|
||
disabled (small file, no range support, or fallback after 200-on-Range)."""
|
||
tmp_path = Path(f"{target_path}.tmp")
|
||
last_exc: Optional[Exception] = None
|
||
for attempt in range(_RETRY_ATTEMPTS + 1):
|
||
try:
|
||
tmp_path.unlink(missing_ok=True)
|
||
with client.stream("GET", path) as response:
|
||
response.raise_for_status()
|
||
total = 0
|
||
with open(tmp_path, "wb") as f:
|
||
for chunk in response.iter_bytes(chunk_size=65536):
|
||
f.write(chunk)
|
||
total += len(chunk)
|
||
if progress_callback:
|
||
progress_callback(len(chunk))
|
||
os.replace(tmp_path, target_path)
|
||
return total
|
||
except Exception as exc:
|
||
last_exc = exc
|
||
if attempt == _RETRY_ATTEMPTS or not _is_transient(exc):
|
||
break
|
||
time.sleep(_RETRY_BACKOFFS_S[min(attempt, len(_RETRY_BACKOFFS_S) - 1)])
|
||
tmp_path.unlink(missing_ok=True)
|
||
assert last_exc is not None
|
||
raise last_exc
|
||
|
||
|
||
def stream_download(path: str, target_path: str, progress_callback=None) -> int:
|
||
"""Stream a file to `target_path` atomically and with retries.
|
||
|
||
Two paths:
|
||
1. **Chunked parallel** — when the server advertises `accept-ranges:
|
||
bytes` and `content-length` exceeds `AGNES_PULL_CHUNK_THRESHOLD_BYTES`
|
||
(default 50 MB), split into N range requests
|
||
(`AGNES_PULL_CHUNK_PARALLELISM`, default 4, capped 1..16) and
|
||
download in parallel. Concatenate the part files into `<target>.tmp`,
|
||
then `os.replace`. Falls back to single-stream if the server
|
||
responds 200 instead of 206 to a Range probe.
|
||
2. **Single-stream** — for small files, no range support, or fallback
|
||
from the chunked path. Same atomic-rename + retry semantics as
|
||
before.
|
||
|
||
Durability properties (unchanged):
|
||
- Writes to `<target>.tmp`, then `os.replace` on success. The real
|
||
target file never exists in a half-written state.
|
||
- Retries up to `_RETRY_ATTEMPTS` on transient errors (network blip,
|
||
5xx); 4xx (auth/404) is raised immediately.
|
||
- No hash check here — that's the caller's job (manifest hash).
|
||
|
||
Threading: the chunked path uses a ThreadPoolExecutor sized to the
|
||
parallelism. httpx.Client.stream() is safe to call concurrently from
|
||
multiple threads on a single client (the connection pool serializes
|
||
the underlying socket access; HTTP/2 multiplexes streams when the
|
||
`h2` extra is installed).
|
||
"""
|
||
# Use the shared persistent client when available — one TLS
|
||
# handshake amortized across N stream_download calls within the same
|
||
# process, and HTTP/2 stream multiplexing across the chunk Range
|
||
# requests within a single download. Falls back to a fresh per-call
|
||
# client if shared-client construction fails for any reason.
|
||
try:
|
||
client = _get_shared_client()
|
||
return _stream_download_via(client, path, target_path, progress_callback)
|
||
except Exception:
|
||
with get_client(timeout=300.0) as client:
|
||
return _stream_download_via(client, path, target_path, progress_callback)
|
||
|
||
|
||
def _stream_download_via(
|
||
client: httpx.Client,
|
||
path: str,
|
||
target_path: str,
|
||
progress_callback,
|
||
) -> int:
|
||
"""The shared body of `stream_download` parameterized on the client.
|
||
Split out so tests can inject a fake client."""
|
||
threshold = _read_chunk_threshold_bytes()
|
||
parallelism = _read_chunk_parallelism()
|
||
|
||
total_size = 0
|
||
accepts_ranges = False
|
||
if parallelism > 1:
|
||
total_size, accepts_ranges = _probe_range_support(client, path)
|
||
|
||
# Sanity bound on the advertised total size (devil's-advocate R1
|
||
# finding #4): a misconfigured proxy or buggy server returning a
|
||
# wildly inflated `Content-Length` would make us split into huge
|
||
# `Range: bytes=N-M` requests; the server then clamps each to actual
|
||
# bytes available, and we end up with overlapping bytes from the
|
||
# start of the file in every part → corrupt assembled output (caught
|
||
# later by manifest hash check, but only after wasted bandwidth).
|
||
# 100 GiB is the operational ceiling for any single materialized
|
||
# parquet on a typical Agnes deployment; values above suggest a
|
||
# server / proxy bug rather than a legitimate huge file. Drop to
|
||
# single-stream (which can't be confused by overlapping chunks).
|
||
SANE_MAX_TOTAL = 100 * 1024**3 # 100 GiB
|
||
if total_size > SANE_MAX_TOTAL:
|
||
total_size = 0
|
||
accepts_ranges = False
|
||
|
||
use_chunked = (
|
||
parallelism > 1
|
||
and accepts_ranges
|
||
and total_size > threshold
|
||
)
|
||
|
||
try:
|
||
if use_chunked:
|
||
try:
|
||
return _download_chunked(
|
||
client, path, target_path, total_size, parallelism,
|
||
progress_callback,
|
||
)
|
||
except _RangeNotHonored:
|
||
# Server lied / proxy stripped the Range — fall through.
|
||
pass
|
||
return _download_single_stream(
|
||
client, path, target_path, progress_callback,
|
||
)
|
||
except httpx.HTTPStatusError:
|
||
# 4xx / 5xx response from the server — re-raise verbatim so the
|
||
# caller's status-code handling + the rich server error body
|
||
# reach the analyst (Devin Review on PR #188).
|
||
raise
|
||
except httpx.HTTPError as exc:
|
||
raise _translate_transport_error(
|
||
exc, context=f"GET {path} (stream → {target_path})",
|
||
timeout_s=300.0,
|
||
) from exc
|