From 0843c2bd1b8af1668cc28c1d774d742f8594db5d Mon Sep 17 00:00:00 2001 From: Vojtech Rysanek Date: Tue, 5 May 2026 16:40:54 +0400 Subject: [PATCH 1/2] fix(cli): bump --remote query timeout to 300s, add AGNES_QUERY_TIMEOUT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The httpx client behind 'agnes query --remote' used the default 30s timeout, killing every BigQuery SELECT that took longer than half a minute — i.e. most non-trivial remote queries. cli/client.py now exposes QUERY_TIMEOUT_S (default 300s, override via AGNES_QUERY_TIMEOUT) and propagates a kw-only 'timeout' through api_get/post/delete/patch. _query_remote passes QUERY_TIMEOUT_S so only the long-running /api/query path gets the bump; every other CLI call keeps the 30s default. Server-side has no read deadline on /api/query, so the client cap was the sole bottleneck. --- CHANGELOG.md | 4 ++++ cli/client.py | 21 +++++++++++++-------- cli/commands/query.py | 8 ++++++-- tests/test_cli_query.py | 17 +++++++++++++++++ 4 files changed, 40 insertions(+), 10 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 97fc44a..a7bec4d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,10 @@ CalVer image tags (`stable-YYYY.MM.N`, `dev-YYYY.MM.N`) are produced for every C ## [Unreleased] +### Fixed + +- `agnes query --remote` no longer dies after 30s on long-running BigQuery SELECTs. The CLI HTTP client now defaults to a 300s timeout for `/api/query` and exposes `AGNES_QUERY_TIMEOUT` (seconds, float) for operators who need to extend it further. Other CLI calls keep the 30s default. (`cli/client.py`, `cli/commands/query.py`) + ## [0.35.0] — 2026-05-05 Five-defect fix for the silently-broken session pipeline on default Compose deploys (#176). Sessions uploaded by `agnes push` landed on `/data/user_sessions//*.jsonl`, but on a stock `docker compose up` deploy nothing ever processed them — `/corporate-memory` stayed empty even when sessions and `CLAUDE.local.md` were uploaded. The root cause was a stack of compounding defects: LLM SDKs were dev-only deps so the scheduler container boot-looped on `ModuleNotFoundError`, the side-car services were profile-gated and ran as tight `restart: unless-stopped` boot loops anyway, the `verification_detector` had no scheduler entry at all, the first-time setup never seeded an `ai:` block, and the `/corporate-memory` page silently filtered out the pending review queue. This release wires the LLM pipeline into the existing scheduler-v2 model (one HTTP-driven cron tick per service) and adds a health-check that warns when uploaded jsonls aren't being processed. diff --git a/cli/client.py b/cli/client.py index 47559ec..1efdd7d 100644 --- a/cli/client.py +++ b/cli/client.py @@ -15,6 +15,11 @@ from cli.config import get_server_url, get_token _RETRY_ATTEMPTS = int(os.environ.get("AGNES_STREAM_RETRIES", "3")) _RETRY_BACKOFFS_S = (0.3, 1.0, 3.0) # seconds before attempt 2, 3, 4 +# Long-running query timeout. /api/query forwards to BigQuery for remote +# tables, where SELECTs routinely run for minutes. The default 30s HTTP +# timeout dies long before BQ finishes. Operators tune via AGNES_QUERY_TIMEOUT. +QUERY_TIMEOUT_S = float(os.environ.get("AGNES_QUERY_TIMEOUT", "300")) + def get_client(timeout: float = 30.0) -> httpx.Client: """Get an authenticated httpx client.""" @@ -29,23 +34,23 @@ def get_client(timeout: float = 30.0) -> httpx.Client: ) -def api_get(path: str, **kwargs) -> httpx.Response: - with get_client() as client: +def api_get(path: str, *, timeout: float = 30.0, **kwargs) -> httpx.Response: + with get_client(timeout=timeout) as client: return client.get(path, **kwargs) -def api_post(path: str, **kwargs) -> httpx.Response: - with get_client() as client: +def api_post(path: str, *, timeout: float = 30.0, **kwargs) -> httpx.Response: + with get_client(timeout=timeout) as client: return client.post(path, **kwargs) -def api_delete(path: str, **kwargs) -> httpx.Response: - with get_client() as client: +def api_delete(path: str, *, timeout: float = 30.0, **kwargs) -> httpx.Response: + with get_client(timeout=timeout) as client: return client.delete(path, **kwargs) -def api_patch(path: str, **kwargs) -> httpx.Response: - with get_client() as client: +def api_patch(path: str, *, timeout: float = 30.0, **kwargs) -> httpx.Response: + with get_client(timeout=timeout) as client: return client.patch(path, **kwargs) diff --git a/cli/commands/query.py b/cli/commands/query.py index 35af5cc..5524e56 100644 --- a/cli/commands/query.py +++ b/cli/commands/query.py @@ -85,10 +85,14 @@ def _query_local(sql: str, fmt: str, limit: int): def _query_remote(sql: str, fmt: str, limit: int): """Run query against server DuckDB via API.""" - from cli.client import api_post + from cli.client import QUERY_TIMEOUT_S, api_post from cli.error_render import render_error - resp = api_post("/api/query", json={"sql": sql, "limit": limit}) + resp = api_post( + "/api/query", + json={"sql": sql, "limit": limit}, + timeout=QUERY_TIMEOUT_S, + ) if resp.status_code != 200: # Parse JSON body if possible, fall back to text. The shared # renderer pretty-prints typed BQ errors (cross_project_forbidden, diff --git a/tests/test_cli_query.py b/tests/test_cli_query.py index 85026fc..2af1bd7 100644 --- a/tests/test_cli_query.py +++ b/tests/test_cli_query.py @@ -55,6 +55,23 @@ class TestRemoteQuery: assert result.exit_code == 0 assert "truncated" in result.output + def test_remote_query_uses_long_timeout(self): + """--remote passes the long-running QUERY_TIMEOUT_S to api_post. + + BigQuery SELECTs routinely take minutes; the default 30s httpx + timeout dies long before the query finishes. Regression guard for + the fix that introduced AGNES_QUERY_TIMEOUT (default 300s). + """ + from cli.client import QUERY_TIMEOUT_S + + payload = {"columns": [], "rows": [], "truncated": False} + mock_post = MagicMock(return_value=_resp(200, payload)) + with patch("cli.client.api_post", mock_post): + result = runner.invoke(app, ["query", "SELECT 1", "--remote"]) + assert result.exit_code == 0 + assert mock_post.call_args.kwargs["timeout"] == QUERY_TIMEOUT_S + assert QUERY_TIMEOUT_S >= 300.0 + class TestLocalQuery: def test_local_query_no_db(self, tmp_config): From a22095564092252ee765dea808de6de02afa022e Mon Sep 17 00:00:00 2001 From: ZdenekSrotyr Date: Tue, 5 May 2026 15:01:37 +0200 Subject: [PATCH 2/2] =?UTF-8?q?release:=200.35.1=20=E2=80=94=20CLI=20--rem?= =?UTF-8?q?ote=20query=20timeout=20fix?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch release bundling the only Unreleased change: bump httpx client timeout for agnes query --remote from 30s to 300s (configurable via AGNES_QUERY_TIMEOUT). Renames CHANGELOG [Unreleased] section to [0.35.1] and bumps pyproject version to match. --- CHANGELOG.md | 2 ++ pyproject.toml | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a7bec4d..2d3a8d8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,8 @@ CalVer image tags (`stable-YYYY.MM.N`, `dev-YYYY.MM.N`) are produced for every C ## [Unreleased] +## [0.35.1] — 2026-05-05 + ### Fixed - `agnes query --remote` no longer dies after 30s on long-running BigQuery SELECTs. The CLI HTTP client now defaults to a 300s timeout for `/api/query` and exposes `AGNES_QUERY_TIMEOUT` (seconds, float) for operators who need to extend it further. Other CLI calls keep the 30s default. (`cli/client.py`, `cli/commands/query.py`) diff --git a/pyproject.toml b/pyproject.toml index 8fd52a8..4359a74 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "agnes-the-ai-analyst" -version = "0.35.0" +version = "0.35.1" description = "Agnes — AI Data Analyst platform for AI analytical systems" requires-python = ">=3.11,<3.14" license = "MIT"