From c32be3fe96cad3bfae2b53be1d501ef5fb187ec9 Mon Sep 17 00:00:00 2001 From: ZdenekSrotyr Date: Wed, 6 May 2026 17:52:18 +0200 Subject: [PATCH] fix(query): cap-guard fallback retries original SQL, fails fast (#201) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When BQ rejects the rewritten dry-run SQL with `bq_bad_request`, the cap-guard now retries with the user's ORIGINAL SQL instead of building a synthetic `SELECT * FROM ` per registered table. The synthetic path threw away user filters / projections / partition predicates and routinely ballooned the estimate to "full table size", falsely tripping `remote_scan_too_large` on legitimate narrow queries (typical issue #201 trace: rewriter corrupts a backtick path → BQ parse error → synthetic over-estimate → 400). Behaviour: - Rewritten SQL succeeds: same as before (issue #171 single-dry-run). - Rewritten SQL parse-errors, original SQL succeeds: use original estimate. Common case for users submitting BQ-native input. - Both fail with `bq_bad_request`: HTTP 400 `remote_estimate_failed` with a hint pointing at `agnes catalog` / BQ-native syntax. No silent over-estimate. - Non-parse BQ error (forbidden, upstream): still 502 as before. This is a behaviour change for clients matching error kinds — failure to estimate scan size now surfaces as `remote_estimate_failed` instead of being masked behind `remote_scan_too_large` from the synthetic path. Replaces the existing `test_guardrail_falls_back_to_per_table_estimate_on_bq_parse_error` (which pinned the old contract) with `test_fallback_tries_original_sql_first` and `test_fallback_fails_fast_on_pure_duckdb_syntax`. --- app/api/query.py | 124 +++++++++++++++++------------- tests/test_api_query_guardrail.py | 100 ++++++++++++++++++------ 2 files changed, 145 insertions(+), 79 deletions(-) diff --git a/app/api/query.py b/app/api/query.py index 5ab0d63..0da45d4 100644 --- a/app/api/query.py +++ b/app/api/query.py @@ -891,28 +891,32 @@ def _bq_quota_and_cap_guard( partitioned/clustered tables and rejecting narrow queries that BQ itself would dry-run as a few MB. - Fallback: if BQ rejects the rewritten SQL with a parse-level - ``client_error`` (e.g. DuckDB-only syntax like ``::INT`` casts that - don't translate to BQ), fall back to the pre-#171 per-table - SELECT * approach so the cap-guard still functions — over-estimate - is preferred over fail-open. Forbidden / upstream errors still - propagate as HTTP 502. + Issue #201 fix: when BQ rejects the rewritten SQL with a parse-level + ``bq_bad_request`` (e.g. DuckDB-only syntax like ``::INT`` casts, or + a rewriter bug that broke valid BQ-native input), retry with the + user's ORIGINAL SQL — BQ-native input dry-runs cleanly. If the + original ALSO fails, return a structured `remote_estimate_failed` + HTTP 400 instead of the pre-#201 synthetic ``SELECT *`` per-table + over-estimate. The synthetic fallback threw away user filters and + routinely ballooned to "full table size", blocking legitimate narrow + queries via `remote_scan_too_large`. Forbidden / upstream errors + still propagate as HTTP 502. Flow: 1. `check_daily_budget` — over-cap users get 429 BEFORE any BQ work. 2. `quota.acquire(user_id)` opened — concurrent-slot held throughout. 3. Single dry-run of rewritten user SQL → `total_bytes`. - On parse error, fall back to per-table SELECT * → sum. + On parse error, retry with the user's original SQL. + On second parse error, raise 400 `remote_estimate_failed`. 4. If total > cap → 400 `remote_scan_too_large`. 5. Yield. Caller runs `analytics.execute(...)` + `record_bytes(...)`. 6. On exit, slot released. Mutates `dry_run_set` in place: the third tuple element (bytes) is populated so the caller can sum and record bytes against the user's - quota post-flight. Single-dry-run path puts `total_bytes` on the first - entry and zero on the rest (BQ doesn't expose per-table bytes for a - composite query); the caller's `sum(b for _, _, b in dry_run_set)` - still equals `total_bytes`. + quota post-flight. Pin `total_bytes` on entry 0 and zero on the rest + — BQ doesn't expose per-table bytes for a composite query — so + `sum(b for _, _, b in dry_run_set)` still equals `total_bytes`. """ quota = _build_quota_tracker() try: @@ -953,61 +957,71 @@ def _bq_quota_and_cap_guard( sql, name_lookups, project, ) - # Try the single-dry-run path first (issue #171). Falls back - # to the per-table SELECT * approach only on BQ parse errors - # (kind="bq_bad_request" — DuckDB-only syntax that BQ can't - # translate). All other BQ errors propagate as 502 below. + # Try the single-dry-run path first (issue #171). On BQ parse + # errors (`bq_bad_request` — typically DuckDB-only syntax the + # rewriter couldn't translate, OR — pre-#201 fix — a + # rewriter-corrupted backtick path) retry the user's ORIGINAL + # SQL: when the user submitted BQ-native SQL, the rewriter is + # the only thing standing between them and a clean dry-run. + # If the original ALSO fails, this is true DuckDB-only syntax + # that BQ cannot estimate — fail fast with a structured + # `remote_estimate_failed` instead of the pre-#201 synthetic + # `SELECT *` over-estimate (which threw away user filters and + # often ballooned to "full table size", blocking legitimate + # narrow queries via `remote_scan_too_large`). + # + # All other BQ errors (forbidden, upstream) propagate as 502. total_bytes = 0 - used_fallback = False try: total_bytes = _bq_dry_run_bytes(bq, rewritten_sql) except BqAccessError as exc: - if exc.kind == "bq_bad_request": - logger.warning( - "BQ dry-run rejected the rewritten SQL " - "(kind=%s, message=%s). Falling back to per-table " - "SELECT * estimate; the cap check will over-estimate " - "scan bytes for this query. Consider rewriting to " - "BQ-native syntax for a tight pre-check.", - exc.kind, exc.message, - ) - used_fallback = True - else: + if exc.kind != "bq_bad_request": raise HTTPException(status_code=502, detail={ "kind": exc.kind, "message": exc.message, **(exc.details or {}), }) - - if used_fallback: - # Pre-#171 path: estimate per registered table from a - # synthetic SELECT *. Over-estimates partitioned scans but - # never under-estimates, so the cap still bounds risk. - for i, (bucket, source_table, _) in enumerate(dry_run_set): - fallback_sql = ( - f"SELECT * FROM `{project}.{bucket}.{source_table}`" - ) - try: - est = _bq_dry_run_bytes(bq, fallback_sql) - except BqAccessError as exc: + logger.warning( + "BQ dry-run rejected the rewritten SQL " + "(kind=%s, message=%s). Retrying with the user's " + "original SQL.", + exc.kind, exc.message, + ) + try: + total_bytes = _bq_dry_run_bytes(bq, sql) + except BqAccessError as exc2: + if exc2.kind != "bq_bad_request": raise HTTPException(status_code=502, detail={ - "kind": exc.kind, - "message": exc.message, - **(exc.details or {}), + "kind": exc2.kind, + "message": exc2.message, + **(exc2.details or {}), }) - dry_run_set[i] = (bucket, source_table, est) - total_bytes += est - else: - # Single-dry-run path. Distribute the total to dry_run_set - # so the caller's `record_bytes(sum(...))` stays correct. - # Per-table breakdown is unavailable from a composite - # dry-run; pin total to entry 0, zero the rest. - if dry_run_set: - b0, t0, _ = dry_run_set[0] - dry_run_set[0] = (b0, t0, total_bytes) - for i in range(1, len(dry_run_set)): - bi, ti, _ = dry_run_set[i] - dry_run_set[i] = (bi, ti, 0) + raise HTTPException(status_code=400, detail={ + "kind": "remote_estimate_failed", + "message": ( + "Could not estimate scan size for this query." + ), + "hint": ( + "Use a registered table name from `agnes " + "catalog`, or write BQ-native SQL with full " + "backtick paths. Pure DuckDB-only syntax is " + "not supported for --remote queries." + ), + "underlying": exc2.message, + }) + + # Distribute the total to dry_run_set so the caller's + # `record_bytes(sum(...))` stays correct. Per-table breakdown + # is unavailable from a composite dry-run; pin total to entry + # 0, zero the rest. (Same accounting symmetry whether the + # bytes came from the rewritten SQL or the original-SQL + # retry.) + if dry_run_set: + b0, t0, _ = dry_run_set[0] + dry_run_set[0] = (b0, t0, total_bytes) + for i in range(1, len(dry_run_set)): + bi, ti, _ = dry_run_set[i] + dry_run_set[i] = (bi, ti, 0) if cap_bytes > 0 and total_bytes > cap_bytes: tables = [f"{b}.{t}" for b, t, _ in dry_run_set] diff --git a/tests/test_api_query_guardrail.py b/tests/test_api_query_guardrail.py index afd75c9..05f8143 100644 --- a/tests/test_api_query_guardrail.py +++ b/tests/test_api_query_guardrail.py @@ -242,15 +242,17 @@ def test_guardrail_invokes_dry_run_exactly_once_per_request( assert "`test-data-prj.marketing.traffic`" in state["last_sql"] -def test_guardrail_falls_back_to_per_table_estimate_on_bq_parse_error( +def test_fallback_tries_original_sql_first( seeded_app, mock_dry_run, monkeypatch, ): - """When BQ rejects the rewritten SQL with ``bq_bad_request`` (DuckDB-only - syntax that doesn't translate — e.g. ``::INT`` casts, ``STRPOS``, …), - the cap-guard falls back to the pre-#171 per-table SELECT * approach - so a non-portable query still gets a (loose) cap estimate instead of - fail-opening. - """ + """Issue #201 — when the rewriter produces SQL that BQ rejects with + `bq_bad_request` but the user's ORIGINAL SQL dry-runs cleanly, the + cap-guard uses the original SQL's byte estimate. No more synthetic + `SELECT *` over-estimate. + + Bare-name reference populates `dry_run_set` so the cap-guard + actually fires. Mock returns parse-error on the first call + (rewritten SQL) and small bytes on the second (original).""" from connectors.bigquery.access import BqAccessError _register_bq_remote_row("ue", "finance", "ue") @@ -259,10 +261,10 @@ def test_guardrail_falls_back_to_per_table_estimate_on_bq_parse_error( def fake_dry_run(_bq, sql): state["calls"].append(sql) - # First call (rewritten user SQL) → BQ parse error. + # First call (rewritten SQL) → BQ parse error. if len(state["calls"]) == 1: - raise BqAccessError("bq_bad_request", "Syntax error: unexpected '::'") - # Second call (fallback per-table SELECT *) → small bytes, pass cap. + raise BqAccessError("bq_bad_request", "Syntax error: simulated") + # Second call (the user's original SQL) → small, passes cap. return 4096 monkeypatch.setattr( @@ -271,28 +273,78 @@ def test_guardrail_falls_back_to_per_table_estimate_on_bq_parse_error( c = seeded_app["client"] token = seeded_app["admin_token"] - # SQL with DuckDB-only `::INT` cast that BQ would reject. + user_sql = "SELECT order_id FROM ue WHERE country = 'CZ'" + r = c.post("/api/query", json={"sql": user_sql}, headers=_auth(token)) + + # Two dry-runs: rewritten then original. No third synthetic-SELECT-* + # call. + assert len(state["calls"]) == 2, ( + f"expected rewritten + original-SQL retry, got " + f"{len(state['calls'])}: {state['calls']}" + ) + assert state["calls"][1] == user_sql, ( + f"second call must be the user's ORIGINAL SQL, got " + f"{state['calls'][1]!r}" + ) + # The response must NOT be remote_scan_too_large from a synthetic + # over-estimate — 4096 bytes is well under the 5 GiB cap. + if r.status_code == 400: + detail = r.json().get("detail", {}) + if isinstance(detail, dict): + assert detail.get("reason") != "remote_scan_too_large", detail + + +def test_fallback_fails_fast_on_pure_duckdb_syntax( + seeded_app, mock_dry_run, monkeypatch, +): + """When BOTH the rewritten and original SQL fail with `bq_bad_request` + (true DuckDB-only syntax like `::INT`), return HTTP 400 + `remote_estimate_failed` — never silently over-estimate via a + synthetic `SELECT *`.""" + from connectors.bigquery.access import BqAccessError + + _register_bq_remote_row("ue", "finance", "ue") + + state = {"calls": []} + + def always_parse_error(_bq, sql): + state["calls"].append(sql) + raise BqAccessError("bq_bad_request", "Syntax error: unexpected '::'") + + monkeypatch.setattr( + "app.api.query._bq_dry_run_bytes", always_parse_error, raising=False, + ) + + c = seeded_app["client"] + token = seeded_app["admin_token"] r = c.post( "/api/query", json={"sql": "SELECT order_id::INT FROM ue WHERE country = 'CZ'"}, headers=_auth(token), ) - # Two dry-runs (rewritten + fallback per-table) before the (failed) - # execute. Status will be a downstream error from analytics.execute() - # since `::INT` doesn't work in DuckDB either against a remote view — - # but the GUARDRAIL must have completed without 5xx-ing. + # Two dry-runs (rewritten + original retry). NO synthetic SELECT * fallback. assert len(state["calls"]) == 2, ( - f"expected 1 rewritten + 1 fallback dry-run, got {len(state['calls'])}: " - f"{state['calls']}" + f"expected 1 rewritten + 1 original-retry, got " + f"{len(state['calls'])}: {state['calls']}" ) - assert "::" in state["calls"][0], "first call should be the rewritten user SQL" - assert state["calls"][1].startswith("SELECT * FROM"), ( - "second call should be the per-table fallback" - ) - # Whatever HTTP status comes back must NOT be 502 from the guard's - # transport-error path — fallback must absorb the bq_bad_request. - assert r.status_code != 502, r.json() + # No call should be a synthetic ``SELECT * FROM `...```. The + # original-SQL retry contains the user's SELECT clause. + for c_sql in state["calls"]: + # If a call is just a synthetic ``SELECT * FROM `..
``` + # the user's `WHERE country = 'CZ'` would be missing. + if c_sql.startswith("SELECT * FROM `") and "WHERE" not in c_sql: + raise AssertionError( + f"synthetic SELECT * fallback was used: {c_sql!r}" + ) + + assert r.status_code == 400, r.json() + detail = r.json().get("detail", {}) + assert isinstance(detail, dict), detail + assert detail.get("kind") == "remote_estimate_failed", detail + assert "underlying" in detail, detail + assert "agnes catalog" in detail.get("hint", "").lower() or \ + "backtick" in detail.get("hint", "").lower(), detail def test_guardrail_propagates_502_on_non_parse_bq_errors(