From f33475cec3e7339a799811c79babae443f5bf19c Mon Sep 17 00:00:00 2001 From: ZdenekSrotyr Date: Tue, 5 May 2026 18:57:04 +0200 Subject: [PATCH] =?UTF-8?q?release:=200.36.0=20=E2=80=94=20perf=20+=20anal?= =?UTF-8?q?yst-clarity=20bundle?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Renames the [Unreleased] section to [0.36.0] in CHANGELOG, adds the top-level summary, drops a fresh empty [Unreleased] above, and bumps pyproject from 0.35.1. Also fixes the third Devin Review finding on this PR: the CLI ReadTimeout message hardcoded QUERY_TIMEOUT_S (300s) so a 30s-default call (agnes catalog, agnes auth, …) reported a wait window that didn't match reality. _translate_transport_error now takes the actual httpx timeout from the calling helper; the BQ-job advisory only appears for calls where the timeout was set ≥ 60s. --- CHANGELOG.md | 5 +++++ cli/client.py | 45 +++++++++++++++++++++++++++++++++------------ pyproject.toml | 2 +- 3 files changed, 39 insertions(+), 13 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 026573c..766a997 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,10 @@ CalVer image tags (`stable-YYYY.MM.N`, `dev-YYYY.MM.N`) are produced for every C ## [Unreleased] +## [0.36.0] — 2026-05-05 + +Combined performance + analyst-clarity bundle. Folds three previously-staged work streams into one PR (#188): the long-running `agnes query --remote` timeout (#181), the Caddy parquet-download bypass (#182), and Pavel's #185 Phase 1 trace findings (silent 44-min first-init, opaque CLI tracebacks, no analyst-Claude size signal). Also performs the Tier 1 event-loop unblocking — the five hottest BQ-touching endpoints were `async def` over synchronous DuckDB / BQ-extension calls, so a single heavy `agnes query --remote` froze every other request for the duration of the BQ wait. 
The image-side fixes ship in this release; for existing VMs, the new auto-upgrade.sh self-fetches the matching Caddyfile + compose overlays from `main` on its next 5-minute tick, so deployment requires no operator action beyond letting the cron run. + ### Added - **`agnes init` / `agnes pull --skip-materialize`** — opts the first sync out of materialized-mode tables (server-side scheduled-query parquets, often multi-GB). Pavel's #185 Phase 1: a single 6.3 GB `order_economics` parquet kept first init silent for 44 minutes. Materialized rows stay discoverable via `agnes catalog`; rerun without the flag once the analyst actually needs them locally. - **`agnes pull` progress bar** — Rich-driven aggregate transfer display rendered to stderr when not `--quiet` and not `--json`. Per-file label + bytes / total / rate / ETA, aggregated across the parallel `ThreadPoolExecutor` workers introduced earlier in this PR. Replaces the prior 0-stdout silence on first init. @@ -32,6 +36,7 @@ CalVer image tags (`stable-YYYY.MM.N`, `dev-YYYY.MM.N`) are produced for every C ### Fixed +- **CLI ReadTimeout message reports the actual httpx timeout** (was hardcoded to `QUERY_TIMEOUT_S` = 300s). On a 30s-default call (`agnes catalog`, `agnes auth`, …) the analyst saw "didn't respond within the read timeout (300s)" while the call had actually given up after 30s — confusing and unactionable. The translator now takes the real timeout from the calling helper and renders it; the long-running-BQ advisory only appears for calls where the timeout was set ≥ 60s. Devin Review on PR #188. - Keboola sync now falls back to the legacy Storage-API client when the DuckDB Keboola extension's per-table scan fails, not just when the initial `ATTACH` fails. Two changes: - `kbcstorage>=0.9.0` is promoted from optional to core dependency. 
The legacy fallback path in `connectors/keboola/extractor.py:_extract_via_legacy` has been there since the extension landed, but until now the bare `from kbcstorage.client import Client` would crash any default install with `ModuleNotFoundError`. - `connectors/keboola/extractor.py:run` now wraps `_extract_via_extension` in a per-table try/except — on any per-table scan failure it retries via the legacy client. Previously, when `ATTACH` succeeded but the table-level `COPY (SELECT * FROM kbc.""."")` failed, the table was just marked failed with no retry. diff --git a/cli/client.py b/cli/client.py index 1c665b9..544026c 100644 --- a/cli/client.py +++ b/cli/client.py @@ -68,23 +68,43 @@ def _log_traceback(exc: BaseException, *, context: str) -> Path: return _LOG_FILE -def _translate_transport_error(exc: Exception, *, context: str) -> AgnesTransportError: +def _translate_transport_error( + exc: Exception, *, context: str, timeout_s: float | None = None, +) -> AgnesTransportError: """Map httpx transport exceptions to user-facing CLI messages. The mapping is intentionally pragmatic — analysts care about "what do I - do next", not the gRPC / TCP detail.""" + do next", not the gRPC / TCP detail. + + `timeout_s`, when supplied, is the actual httpx timeout used by the + failing call so the ReadTimeout message reports the real wait window + (an `agnes catalog` GET dies at 30s, not 300s — Devin Review on PR + #188 caught the original signature hardcoding `QUERY_TIMEOUT_S`, + which only matches `agnes query --remote`).""" log = _log_traceback(exc, context=context) if isinstance(exc, httpx.ReadTimeout): - return AgnesTransportError( - f"Server didn't respond within the read timeout ({QUERY_TIMEOUT_S:.0f}s) " - f"for {context}.", - hint=( + wait_s = timeout_s if timeout_s is not None else QUERY_TIMEOUT_S + # The "long-running BQ" advisory only makes sense when the call + # actually hit the query path (timeout ≥ ~60s). For short calls + # (the 30s default on `agnes catalog` etc.)
it's just confusing. + if wait_s >= 60: + hint = ( "If this is `agnes query --remote` against a heavy BQ view, " "the underlying BQ job took longer than the wait window. Try:\n" " • narrow the WHERE (especially the partition column from `agnes catalog --json`)\n" " • `agnes snapshot create
... --estimate` to materialize once + query locally\n" " • set AGNES_QUERY_TIMEOUT=600 for a longer client-side wait\n" f"Full traceback: {log}" - ), + ) + else: + hint = ( + "Server is slow or unreachable. Check `agnes status`; " + "re-run if transient.\n" + f"Full traceback: {log}" + ) + return AgnesTransportError( + f"Server didn't respond within the read timeout ({wait_s:.0f}s) " + f"for {context}.", + hint=hint, logfile_path=log, ) if isinstance(exc, httpx.ConnectError): @@ -140,7 +160,7 @@ def api_get(path: str, *, timeout: float = 30.0, **kwargs) -> httpx.Response: with get_client(timeout=timeout) as client: return client.get(path, **kwargs) except httpx.HTTPError as exc: - raise _translate_transport_error(exc, context=f"GET {path}") from exc + raise _translate_transport_error(exc, context=f"GET {path}", timeout_s=timeout) from exc def api_post(path: str, *, timeout: float = 30.0, **kwargs) -> httpx.Response: @@ -148,7 +168,7 @@ def api_post(path: str, *, timeout: float = 30.0, **kwargs) -> httpx.Response: with get_client(timeout=timeout) as client: return client.post(path, **kwargs) except httpx.HTTPError as exc: - raise _translate_transport_error(exc, context=f"POST {path}") from exc + raise _translate_transport_error(exc, context=f"POST {path}", timeout_s=timeout) from exc def api_delete(path: str, *, timeout: float = 30.0, **kwargs) -> httpx.Response: @@ -156,7 +176,7 @@ def api_delete(path: str, *, timeout: float = 30.0, **kwargs) -> httpx.Response: with get_client(timeout=timeout) as client: return client.delete(path, **kwargs) except httpx.HTTPError as exc: - raise _translate_transport_error(exc, context=f"DELETE {path}") from exc + raise _translate_transport_error(exc, context=f"DELETE {path}", timeout_s=timeout) from exc def api_patch(path: str, *, timeout: float = 30.0, **kwargs) -> httpx.Response: @@ -164,7 +184,7 @@ def api_patch(path: str, *, timeout: float = 30.0, **kwargs) -> httpx.Response: with get_client(timeout=timeout) as client: return 
client.patch(path, **kwargs) except httpx.HTTPError as exc: - raise _translate_transport_error(exc, context=f"PATCH {path}") from exc + raise _translate_transport_error(exc, context=f"PATCH {path}", timeout_s=timeout) from exc def _is_transient(exc: Exception) -> bool: @@ -227,6 +247,7 @@ def stream_download(path: str, target_path: str, progress_callback=None) -> int: raise last_exc if isinstance(last_exc, httpx.HTTPError): raise _translate_transport_error( - last_exc, context=f"GET {path} (stream → {target_path})" + last_exc, context=f"GET {path} (stream → {target_path})", + timeout_s=300.0, ) from last_exc raise last_exc diff --git a/pyproject.toml b/pyproject.toml index 7c9ae61..8fddab2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "agnes-the-ai-analyst" -version = "0.35.1" +version = "0.36.0" description = "Agnes — AI Data Analyst platform for AI analytical systems" requires-python = ">=3.11,<3.14" license = "MIT"