Merge pull request #184 from keboola/vr/remote-query-timeout

fix(cli): bump --remote query timeout to 300s, add AGNES_QUERY_TIMEOUT
This commit is contained in:
ZdenekSrotyr 2026-05-05 15:15:02 +02:00 committed by GitHub
commit 91f2605865
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 43 additions and 11 deletions

View file

@@ -10,6 +10,12 @@ CalVer image tags (`stable-YYYY.MM.N`, `dev-YYYY.MM.N`) are produced for every C
## [Unreleased] ## [Unreleased]
## [0.35.1] — 2026-05-05
### Fixed
- `agnes query --remote` no longer dies after 30s on long-running BigQuery SELECTs. The CLI HTTP client now defaults to a 300s timeout for `/api/query` and exposes `AGNES_QUERY_TIMEOUT` (seconds, float) for operators who need to extend it further. Other CLI calls keep the 30s default. (`cli/client.py`, `cli/commands/query.py`)
## [0.35.0] — 2026-05-05 ## [0.35.0] — 2026-05-05
Five-defect fix for the silently-broken session pipeline on default Compose deploys (#176). Sessions uploaded by `agnes push` landed on `/data/user_sessions/<user>/*.jsonl`, but on a stock `docker compose up` deploy nothing ever processed them — `/corporate-memory` stayed empty even when sessions and `CLAUDE.local.md` were uploaded. The root cause was a stack of compounding defects: LLM SDKs were dev-only deps so the scheduler container boot-looped on `ModuleNotFoundError`, the side-car services were profile-gated and ran as tight `restart: unless-stopped` boot loops anyway, the `verification_detector` had no scheduler entry at all, the first-time setup never seeded an `ai:` block, and the `/corporate-memory` page silently filtered out the pending review queue. This release wires the LLM pipeline into the existing scheduler-v2 model (one HTTP-driven cron tick per service) and adds a health-check that warns when uploaded jsonls aren't being processed. Five-defect fix for the silently-broken session pipeline on default Compose deploys (#176). Sessions uploaded by `agnes push` landed on `/data/user_sessions/<user>/*.jsonl`, but on a stock `docker compose up` deploy nothing ever processed them — `/corporate-memory` stayed empty even when sessions and `CLAUDE.local.md` were uploaded. The root cause was a stack of compounding defects: LLM SDKs were dev-only deps so the scheduler container boot-looped on `ModuleNotFoundError`, the side-car services were profile-gated and ran as tight `restart: unless-stopped` boot loops anyway, the `verification_detector` had no scheduler entry at all, the first-time setup never seeded an `ai:` block, and the `/corporate-memory` page silently filtered out the pending review queue. This release wires the LLM pipeline into the existing scheduler-v2 model (one HTTP-driven cron tick per service) and adds a health-check that warns when uploaded jsonls aren't being processed.

View file

@@ -15,6 +15,11 @@ from cli.config import get_server_url, get_token
_RETRY_ATTEMPTS = int(os.environ.get("AGNES_STREAM_RETRIES", "3")) _RETRY_ATTEMPTS = int(os.environ.get("AGNES_STREAM_RETRIES", "3"))
_RETRY_BACKOFFS_S = (0.3, 1.0, 3.0) # seconds before attempt 2, 3, 4 _RETRY_BACKOFFS_S = (0.3, 1.0, 3.0) # seconds before attempt 2, 3, 4
# Long-running query timeout. /api/query forwards to BigQuery for remote
# tables, where SELECTs routinely run for minutes. The default 30s HTTP
# timeout dies long before BQ finishes. Operators tune via AGNES_QUERY_TIMEOUT.
QUERY_TIMEOUT_S = float(os.environ.get("AGNES_QUERY_TIMEOUT", "300"))
def get_client(timeout: float = 30.0) -> httpx.Client: def get_client(timeout: float = 30.0) -> httpx.Client:
"""Get an authenticated httpx client.""" """Get an authenticated httpx client."""
@@ -29,23 +34,23 @@ def get_client(timeout: float = 30.0) -> httpx.Client:
) )
def api_get(path: str, **kwargs) -> httpx.Response: def api_get(path: str, *, timeout: float = 30.0, **kwargs) -> httpx.Response:
with get_client() as client: with get_client(timeout=timeout) as client:
return client.get(path, **kwargs) return client.get(path, **kwargs)
def api_post(path: str, **kwargs) -> httpx.Response: def api_post(path: str, *, timeout: float = 30.0, **kwargs) -> httpx.Response:
with get_client() as client: with get_client(timeout=timeout) as client:
return client.post(path, **kwargs) return client.post(path, **kwargs)
def api_delete(path: str, **kwargs) -> httpx.Response: def api_delete(path: str, *, timeout: float = 30.0, **kwargs) -> httpx.Response:
with get_client() as client: with get_client(timeout=timeout) as client:
return client.delete(path, **kwargs) return client.delete(path, **kwargs)
def api_patch(path: str, **kwargs) -> httpx.Response: def api_patch(path: str, *, timeout: float = 30.0, **kwargs) -> httpx.Response:
with get_client() as client: with get_client(timeout=timeout) as client:
return client.patch(path, **kwargs) return client.patch(path, **kwargs)

View file

@@ -85,10 +85,14 @@ def _query_local(sql: str, fmt: str, limit: int):
def _query_remote(sql: str, fmt: str, limit: int): def _query_remote(sql: str, fmt: str, limit: int):
"""Run query against server DuckDB via API.""" """Run query against server DuckDB via API."""
from cli.client import api_post from cli.client import QUERY_TIMEOUT_S, api_post
from cli.error_render import render_error from cli.error_render import render_error
resp = api_post("/api/query", json={"sql": sql, "limit": limit}) resp = api_post(
"/api/query",
json={"sql": sql, "limit": limit},
timeout=QUERY_TIMEOUT_S,
)
if resp.status_code != 200: if resp.status_code != 200:
# Parse JSON body if possible, fall back to text. The shared # Parse JSON body if possible, fall back to text. The shared
# renderer pretty-prints typed BQ errors (cross_project_forbidden, # renderer pretty-prints typed BQ errors (cross_project_forbidden,

View file

@@ -1,6 +1,6 @@
[project] [project]
name = "agnes-the-ai-analyst" name = "agnes-the-ai-analyst"
version = "0.35.0" version = "0.35.1"
description = "Agnes — AI Data Analyst platform for AI analytical systems" description = "Agnes — AI Data Analyst platform for AI analytical systems"
requires-python = ">=3.11,<3.14" requires-python = ">=3.11,<3.14"
license = "MIT" license = "MIT"

View file

@@ -55,6 +55,23 @@ class TestRemoteQuery:
assert result.exit_code == 0 assert result.exit_code == 0
assert "truncated" in result.output assert "truncated" in result.output
def test_remote_query_uses_long_timeout(self):
"""--remote passes the long-running QUERY_TIMEOUT_S to api_post.
BigQuery SELECTs routinely take minutes; the default 30s httpx
timeout dies long before the query finishes. Regression guard for
the fix that introduced AGNES_QUERY_TIMEOUT (default 300s).
"""
from cli.client import QUERY_TIMEOUT_S
payload = {"columns": [], "rows": [], "truncated": False}
mock_post = MagicMock(return_value=_resp(200, payload))
with patch("cli.client.api_post", mock_post):
result = runner.invoke(app, ["query", "SELECT 1", "--remote"])
assert result.exit_code == 0
assert mock_post.call_args.kwargs["timeout"] == QUERY_TIMEOUT_S
assert QUERY_TIMEOUT_S >= 300.0
class TestLocalQuery: class TestLocalQuery:
def test_local_query_no_db(self, tmp_config): def test_local_query_no_db(self, tmp_config):