From 4f042355020d32b03643b9219090a23550b3a3b3 Mon Sep 17 00:00:00 2001 From: ZdenekSrotyr Date: Tue, 5 May 2026 12:29:57 +0200 Subject: [PATCH] feat(bigquery): bq_query_timeout_ms knob; default 600s (was 90s) DuckDB BigQuery extension defaults `bq_query_timeout_ms` to 90 s, which is too tight for analyst-scale queries against view-backed BQ datasets. `agnes query --remote` HTTP 400'd with `Binder Error: Query execution exceeded the timeout. Job ID: ...` whenever the underlying BQ job ran longer than 90 s, even though the job itself was healthy. Add `data_source.bigquery.query_timeout_ms` (default 600 000 ms = 10 min, sentinel 0 falls through to the extension default). Applied via `SET bq_query_timeout_ms` after every `LOAD bigquery` on every BQ-touching DuckDB session: orchestrator's `_remote_attach` ATTACH path, BqAccess session factory, and the standalone extractor. Configurable via `/admin/server-config` UI. Fail-soft: extension versions that don't recognise the setting silently keep the default rather than poisoning the session. --- CHANGELOG.md | 3 + app/api/admin.py | 18 +++++ config/instance.yaml.example | 8 ++ connectors/bigquery/access.py | 37 ++++++++++ connectors/bigquery/extractor.py | 2 + src/orchestrator.py | 2 + tests/test_bq_query_timeout.py | 123 +++++++++++++++++++++++++++++++ 7 files changed, 193 insertions(+) create mode 100644 tests/test_bq_query_timeout.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 8af7204..7d7a5f3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,9 @@ CalVer image tags (`stable-YYYY.MM.N`, `dev-YYYY.MM.N`) are produced for every C ## [Unreleased] +### Added +- **`data_source.bigquery.query_timeout_ms` config knob** (default 600 000 ms = 10 min). The DuckDB BigQuery extension's built-in default of 90 s was too tight for analyst-scale queries against view-backed BQ datasets — `agnes query --remote` would HTTP 400 with `Binder Error: Query execution exceeded the timeout. Job ID: …` whenever the underlying BQ job took longer than 90 s, even though the BQ job itself was healthy. The new knob is applied via `SET bq_query_timeout_ms` after every `LOAD bigquery` on every BQ-touching DuckDB session — the orchestrator's `_remote_attach` ATTACH path (`src/orchestrator.py`), the analytics-DB read-only reattach path (`src/db.py:_reattach_remote_extensions` — the primary `agnes query --remote` request path), the `BqAccess` session factory (`connectors/bigquery/access.py`), and the standalone extractor (`connectors/bigquery/extractor.py`). Sentinel `0` (or non-numeric / unparseable values) leaves the extension default in place so operators on legacy extension versions that don't recognise the setting aren't broken. Configurable via `/admin/server-config` UI. Note: BigQuery's `jobs.query` RPC caps the wait at ~200 s per call regardless of this setting; the extension polls on top so the effective ceiling is the value here but each poll is ~200 s. DuckDB emits an informational warning when the value is set above the BQ RPC cap — operators can safely ignore it. + ### Fixed - Keboola sync now falls back to the legacy Storage-API client when the DuckDB Keboola extension's per-table scan fails, not just when the initial `ATTACH` fails. Two changes: diff --git a/app/api/admin.py b/app/api/admin.py index 9355c6e..fe43805 100644 --- a/app/api/admin.py +++ b/app/api/admin.py @@ -285,6 +285,24 @@ _KNOWN_FIELDS: dict[str, dict[str, dict]] = { "`agnes snapshot create` suggestion. 0 disables the gate. Default 5368709120 = 5 GiB." ), }, + "query_timeout_ms": { + "kind": "int", + "default": 600000, + "hint": ( + "DuckDB BigQuery extension query timeout (milliseconds). Applied " + "via `SET bq_query_timeout_ms` after every `LOAD bigquery` on " + "every BQ-touching DuckDB session (orchestrator remote-view " + "ATTACH, BqAccess factory, standalone extractor). Extension " + "default is 90 000 ms = 90 s, which is too tight for analyst " + "queries against view-backed datasets — bumped to 600 000 ms = " + "10 min by default. Set 0 to fall through to the extension " + "default. Note: the underlying BQ jobs.query RPC caps the wait " + "at ~200 s per call; the extension polls on top, so the " + "effective ceiling is this value but each poll round-trip is " + "~200 s. DuckDB itself emits a warning when this is set above " + "~200 s — that warning is informational, not an error." + ), + }, }, }, "keboola": { diff --git a/config/instance.yaml.example b/config/instance.yaml.example index 82b8367..c836144 100644 --- a/config/instance.yaml.example +++ b/config/instance.yaml.example @@ -127,6 +127,14 @@ data_source: # # Dry-run check before running; exceeding -> registration / sync # # rejected. Default 10 GiB (10737418240). Set 0 to disable. # # null falls through to default. Configurable via /admin/server-config UI. + # query_timeout_ms: 600000 + # # DuckDB BigQuery extension query timeout (milliseconds). + # # Applied via `SET bq_query_timeout_ms` after every LOAD bigquery + # # on every BQ-touching DuckDB session. Extension default is + # # 90 000 ms = 90 s, which is too tight for analyst queries against + # # view-backed datasets -- bumped to 600 000 ms = 10 min by default. + # # Set 0 to fall through to the extension default. Configurable via + # # /admin/server-config UI. # --- OpenMetadata catalog (optional) --- # Enriches table and column metadata from OpenMetadata REST API. diff --git a/connectors/bigquery/access.py b/connectors/bigquery/access.py index 193fbbc..e46a877 100644 --- a/connectors/bigquery/access.py +++ b/connectors/bigquery/access.py @@ -232,11 +232,48 @@ def _default_duckdb_session_factory(projects: BqProjects): f"failed to install/load BigQuery DuckDB extension: {e}", details={"original": str(e)}, ) + apply_bq_session_settings(conn) yield conn finally: conn.close() +def apply_bq_session_settings(conn) -> None: + """Apply per-session DuckDB BigQuery-extension settings from instance config. + + Currently sets ``bq_query_timeout_ms`` from + ``data_source.bigquery.query_timeout_ms``. The extension default is 90 s, + which is too tight for analyst-scale queries against view-backed BQ + datasets — bumping the default to 600 s here. Sentinel ``0`` (or a + non-numeric / unparseable value) leaves the extension default in place. + + Call AFTER ``LOAD bigquery`` on every DuckDB session that touches BQ: + BqAccess's session factory, the standalone extractor in + ``connectors/bigquery/extractor.py``, and the orchestrator's + ``_remote_attach`` path in ``src/orchestrator.py``. + """ + try: + from app.instance_config import get_value + except Exception: + return + raw = get_value( + "data_source", "bigquery", "query_timeout_ms", default=600_000, + ) + try: + ms = int(raw) if raw is not None else 0 + except (TypeError, ValueError): + return + if ms <= 0: + return + try: + conn.execute(f"SET bq_query_timeout_ms = {int(ms)}") + except Exception: + # Fail-soft: extension version may not support the setting, or the + # session may already have been frozen — leave the default rather + # than poisoning the whole session. + pass + + class BqAccess: """Single entry point for BigQuery access. Stateless after construction. diff --git a/connectors/bigquery/extractor.py b/connectors/bigquery/extractor.py index c140dd6..d90005d 100644 --- a/connectors/bigquery/extractor.py +++ b/connectors/bigquery/extractor.py @@ -359,6 +359,8 @@ def _init_extract_locked( conn.execute( f"CREATE SECRET bq_session (TYPE bigquery, ACCESS_TOKEN '{escaped_token}')" ) + from connectors.bigquery.access import apply_bq_session_settings + apply_bq_session_settings(conn) conn.execute( f"ATTACH 'project={project_id}' AS bq (TYPE bigquery, READ_ONLY)" ) diff --git a/src/orchestrator.py b/src/orchestrator.py index e003f41..914bcd6 100644 --- a/src/orchestrator.py +++ b/src/orchestrator.py @@ -502,6 +502,8 @@ class SyncOrchestrator: f"CREATE OR REPLACE SECRET {secret_name} " f"(TYPE bigquery, ACCESS_TOKEN '{escaped}')" ) + from connectors.bigquery.access import apply_bq_session_settings + apply_bq_session_settings(conn) conn.execute( f"ATTACH '{safe_url}' AS {alias} (TYPE {extension}, READ_ONLY)" ) diff --git a/tests/test_bq_query_timeout.py b/tests/test_bq_query_timeout.py new file mode 100644 index 0000000..fa64e08 --- /dev/null +++ b/tests/test_bq_query_timeout.py @@ -0,0 +1,123 @@ +"""Unit tests for apply_bq_session_settings. + +Covers the data_source.bigquery.query_timeout_ms knob added so that +agnes query --remote no longer trips the DuckDB BigQuery extension's +built-in 90 s wait timeout when the underlying BQ job takes longer. +""" + +from unittest.mock import patch + +from connectors.bigquery.access import apply_bq_session_settings + + +class _RecordingConn: + """Minimal DuckDB-conn stand-in that records execute() calls. + + apply_bq_session_settings only calls .execute(); we don't need a + real DuckDB to verify the SET command shape. + """ + + def __init__(self, raise_on=None): + self.calls: list[str] = [] + self.raise_on = raise_on + + def execute(self, sql: str): + self.calls.append(sql) + if self.raise_on and self.raise_on in sql: + raise RuntimeError(f"simulated failure on: {sql}") + + +def _patched_get_value(value): + """Helper: build a patch target that returns *value* for the + data_source.bigquery.query_timeout_ms key and propagates the + `default=` kwarg for any other lookup so we don't accidentally + break tests that read other keys via the same module.""" + def fake(*keys, default=None): + if keys == ("data_source", "bigquery", "query_timeout_ms"): + return value + return default + return patch("app.instance_config.get_value", side_effect=fake) + + +def test_default_when_config_missing(): + """When get_value returns the default (None passed through, default arg + used), apply_bq_session_settings should fall back to the bumped + 600 000 ms default and emit the SET.""" + conn = _RecordingConn() + # Simulate get_value returning the default we passed (600_000) by + # echoing the default kwarg. + def fake(*keys, default=None): + return default + with patch("app.instance_config.get_value", side_effect=fake): + apply_bq_session_settings(conn) + assert conn.calls == ["SET bq_query_timeout_ms = 600000"] + + +def test_explicit_value(): + conn = _RecordingConn() + with _patched_get_value(900_000): + apply_bq_session_settings(conn) + assert conn.calls == ["SET bq_query_timeout_ms = 900000"] + + +def test_zero_sentinel_leaves_extension_default(): + """0 means 'use the DuckDB BQ extension's built-in default' — no SET + must be emitted so a non-zero default doesn't override an operator's + explicit opt-out.""" + conn = _RecordingConn() + with _patched_get_value(0): + apply_bq_session_settings(conn) + assert conn.calls == [] + + +def test_negative_value_treated_as_zero(): + """Negative is nonsensical for a timeout; treat as 'extension default' + rather than emitting a negative SET that the extension might reject + or interpret unexpectedly.""" + conn = _RecordingConn() + with _patched_get_value(-1): + apply_bq_session_settings(conn) + assert conn.calls == [] + + +def test_non_numeric_silently_skipped(): + """A string-typed YAML value (e.g. operator typo) shouldn't crash + the BQ session — fall through to the extension default.""" + conn = _RecordingConn() + with _patched_get_value("notanumber"): + apply_bq_session_settings(conn) + assert conn.calls == [] + + +def test_string_numeric_is_coerced(): + """YAML loaders sometimes deliver int-like values as strings; accept + those rather than failing.""" + conn = _RecordingConn() + with _patched_get_value("750000"): + apply_bq_session_settings(conn) + assert conn.calls == ["SET bq_query_timeout_ms = 750000"] + + +def test_set_failure_does_not_propagate(): + """Older DuckDB BQ extension versions may not recognise the setting. + The function must fail-soft so a session that was otherwise healthy + keeps working — just with the extension's built-in default timeout.""" + conn = _RecordingConn(raise_on="SET bq_query_timeout_ms") + with _patched_get_value(600_000): + # Must not raise. + apply_bq_session_settings(conn) + # The SET was attempted (recorded before the exception). + assert conn.calls == ["SET bq_query_timeout_ms = 600000"] + + +def test_no_app_config_module_silently_skipped(): + """Unit-test contexts that don't bring up the app config layer must + still be able to construct BQ sessions for narrow tests; an + ImportError on app.instance_config means we can't read the knob, + so we leave the extension default in place.""" + conn = _RecordingConn() + with patch.dict( + "sys.modules", {"app.instance_config": None}, + ): + apply_bq_session_settings(conn) + assert conn.calls == []