fix(query): cap-guard fallback retries original SQL, fails fast (#201)

When BQ rejects the rewritten dry-run SQL with `bq_bad_request`, the cap-guard now retries with the user's ORIGINAL SQL instead of building a synthetic `SELECT * FROM <table>` per registered table. The synthetic path threw away user filters / projections / partition predicates and routinely ballooned the estimate to "full table size", falsely tripping `remote_scan_too_large` on legitimate narrow queries (typical issue #201 trace: rewriter corrupts a backtick path → BQ parse error → synthetic over-estimate → 400). Behaviour: - Rewritten SQL succeeds: same as before (issue #171 single-dry-run). - Rewritten SQL parse-errors, original SQL succeeds: use original estimate. Common case for users submitting BQ-native input. - Both fail with `bq_bad_request`: HTTP 400 `remote_estimate_failed` with a hint pointing at `agnes catalog` / BQ-native syntax. No silent over-estimate. - Non-parse BQ error (forbidden, upstream): still 502 as before. This is a behaviour change for clients matching error kinds — failure to estimate scan size now surfaces as `remote_estimate_failed` instead of being masked behind `remote_scan_too_large` from the synthetic path. Replaces the existing `test_guardrail_falls_back_to_per_table_estimate_on_bq_parse_error` (which pinned the old contract) with `test_fallback_tries_original_sql_first` and `test_fallback_fails_fast_on_pure_duckdb_syntax`.
2026-05-06 17:52:18 +02:00 · 2026-05-06 17:52:18 +02:00 · c32be3fe96
commit c32be3fe96
parent 720a2180c0
2 changed files with 145 additions and 79 deletions
--- a/app/api/query.py
+++ b/app/api/query.py
@ -891,28 +891,32 @@ def _bq_quota_and_cap_guard(
    partitioned/clustered tables and rejecting narrow queries that BQ
    itself would dry-run as a few MB.
-    Fallback: if BQ rejects the rewritten SQL with a parse-level
+    Issue #201 fix: when BQ rejects the rewritten SQL with a parse-level
-    ``client_error`` (e.g. DuckDB-only syntax like ``::INT`` casts that
+    ``bq_bad_request`` (e.g. DuckDB-only syntax like ``::INT`` casts, or
-    don't translate to BQ), fall back to the pre-#171 per-table
+    a rewriter bug that broke valid BQ-native input), retry with the
-    SELECT * approach so the cap-guard still functions — over-estimate
+    user's ORIGINAL SQL — BQ-native input dry-runs cleanly. If the
-    is preferred over fail-open. Forbidden / upstream errors still
+    original ALSO fails, return a structured `remote_estimate_failed`
-    propagate as HTTP 502.
+    HTTP 400 instead of the pre-#201 synthetic ``SELECT *`` per-table
    over-estimate. The synthetic fallback threw away user filters and
    routinely ballooned to "full table size", blocking legitimate narrow
    queries via `remote_scan_too_large`. Forbidden / upstream errors
    still propagate as HTTP 502.
    Flow:
    1. `check_daily_budget` — over-cap users get 429 BEFORE any BQ work.
    2. `quota.acquire(user_id)` opened — concurrent-slot held throughout.
    3. Single dry-run of rewritten user SQL → `total_bytes`.
-       On parse error, fall back to per-table SELECT * → sum.
+       On parse error, retry with the user's original SQL.
       On second parse error, raise 400 `remote_estimate_failed`.
    4. If total > cap → 400 `remote_scan_too_large`.
    5. Yield. Caller runs `analytics.execute(...)` + `record_bytes(...)`.
    6. On exit, slot released.
    Mutates `dry_run_set` in place: the third tuple element (bytes) is
    populated so the caller can sum and record bytes against the user's
-    quota post-flight. Single-dry-run path puts `total_bytes` on the first
+    quota post-flight. Pin `total_bytes` on entry 0 and zero on the rest
-    entry and zero on the rest (BQ doesn't expose per-table bytes for a
+    — BQ doesn't expose per-table bytes for a composite query — so
-    composite query); the caller's `sum(b for _, _, b in dry_run_set)`
+    `sum(b for _, _, b in dry_run_set)` still equals `total_bytes`.
    still equals `total_bytes`.
    """
    quota = _build_quota_tracker()
    try:
@ -953,61 +957,71 @@ def _bq_quota_and_cap_guard(
                sql, name_lookups, project,
            )
-            # Try the single-dry-run path first (issue #171). Falls back
+            # Try the single-dry-run path first (issue #171). On BQ parse
-            # to the per-table SELECT * approach only on BQ parse errors
+            # errors (`bq_bad_request` — typically DuckDB-only syntax the
-            # (kind="bq_bad_request" — DuckDB-only syntax that BQ can't
+            # rewriter couldn't translate, OR — pre-#201 fix — a
-            # translate). All other BQ errors propagate as 502 below.
+            # rewriter-corrupted backtick path) retry the user's ORIGINAL
            # SQL: when the user submitted BQ-native SQL, the rewriter is
            # the only thing standing between them and a clean dry-run.
            # If the original ALSO fails, this is true DuckDB-only syntax
            # that BQ cannot estimate — fail fast with a structured
            # `remote_estimate_failed` instead of the pre-#201 synthetic
            # `SELECT *` over-estimate (which threw away user filters and
            # often ballooned to "full table size", blocking legitimate
            # narrow queries via `remote_scan_too_large`).
            #
            # All other BQ errors (forbidden, upstream) propagate as 502.
            total_bytes = 0
            used_fallback = False
            try:
                total_bytes = _bq_dry_run_bytes(bq, rewritten_sql)
            except BqAccessError as exc:
-                if exc.kind == "bq_bad_request":
+                if exc.kind != "bq_bad_request":
                    logger.warning(
                        "BQ dry-run rejected the rewritten SQL "
                        "(kind=%s, message=%s). Falling back to per-table "
                        "SELECT * estimate; the cap check will over-estimate "
                        "scan bytes for this query. Consider rewriting to "
                        "BQ-native syntax for a tight pre-check.",
                        exc.kind, exc.message,
                    )
                    used_fallback = True
                else:
                    raise HTTPException(status_code=502, detail={
                        "kind": exc.kind,
                        "message": exc.message,
                        **(exc.details or {}),
                    })
-
+                logger.warning(
-            if used_fallback:
+                    "BQ dry-run rejected the rewritten SQL "
-                # Pre-#171 path: estimate per registered table from a
+                    "(kind=%s, message=%s). Retrying with the user's "
-                # synthetic SELECT *. Over-estimates partitioned scans but
+                    "original SQL.",
-                # never under-estimates, so the cap still bounds risk.
+                    exc.kind, exc.message,
-                for i, (bucket, source_table, _) in enumerate(dry_run_set):
+                )
-                    fallback_sql = (
+                try:
-                        f"SELECT * FROM `{project}.{bucket}.{source_table}`"
+                    total_bytes = _bq_dry_run_bytes(bq, sql)
-                    )
+                except BqAccessError as exc2:
-                    try:
+                    if exc2.kind != "bq_bad_request":
                        est = _bq_dry_run_bytes(bq, fallback_sql)
                    except BqAccessError as exc:
                        raise HTTPException(status_code=502, detail={
-                            "kind": exc.kind,
+                            "kind": exc2.kind,
-                            "message": exc.message,
+                            "message": exc2.message,
-                            **(exc.details or {}),
+                            **(exc2.details or {}),
                        })
-                    dry_run_set[i] = (bucket, source_table, est)
+                    raise HTTPException(status_code=400, detail={
-                    total_bytes += est
+                        "kind": "remote_estimate_failed",
-            else:
+                        "message": (
-                # Single-dry-run path. Distribute the total to dry_run_set
+                            "Could not estimate scan size for this query."
-                # so the caller's `record_bytes(sum(...))` stays correct.
+                        ),
-                # Per-table breakdown is unavailable from a composite
+                        "hint": (
-                # dry-run; pin total to entry 0, zero the rest.
+                            "Use a registered table name from `agnes "
-                if dry_run_set:
+                            "catalog`, or write BQ-native SQL with full "
-                    b0, t0, _ = dry_run_set[0]
+                            "backtick paths. Pure DuckDB-only syntax is "
-                    dry_run_set[0] = (b0, t0, total_bytes)
+                            "not supported for --remote queries."
-                    for i in range(1, len(dry_run_set)):
+                        ),
-                        bi, ti, _ = dry_run_set[i]
+                        "underlying": exc2.message,
-                        dry_run_set[i] = (bi, ti, 0)
+                    })
            # Distribute the total to dry_run_set so the caller's
            # `record_bytes(sum(...))` stays correct. Per-table breakdown
            # is unavailable from a composite dry-run; pin total to entry
            # 0, zero the rest. (Same accounting symmetry whether the
            # bytes came from the rewritten SQL or the original-SQL
            # retry.)
            if dry_run_set:
                b0, t0, _ = dry_run_set[0]
                dry_run_set[0] = (b0, t0, total_bytes)
                for i in range(1, len(dry_run_set)):
                    bi, ti, _ = dry_run_set[i]
                    dry_run_set[i] = (bi, ti, 0)
            if cap_bytes > 0 and total_bytes > cap_bytes:
                tables = [f"{b}.{t}" for b, t, _ in dry_run_set]
--- a/tests/test_api_query_guardrail.py
+++ b/tests/test_api_query_guardrail.py
@ -242,15 +242,17 @@ def test_guardrail_invokes_dry_run_exactly_once_per_request(
    assert "`test-data-prj.marketing.traffic`" in state["last_sql"]
-def test_guardrail_falls_back_to_per_table_estimate_on_bq_parse_error(
+def test_fallback_tries_original_sql_first(
    seeded_app, mock_dry_run, monkeypatch,
 ):
-    """When BQ rejects the rewritten SQL with ``bq_bad_request`` (DuckDB-only
+    """Issue #201 — when the rewriter produces SQL that BQ rejects with
-    syntax that doesn't translate — e.g. ``::INT`` casts, ``STRPOS``, …),
+    `bq_bad_request` but the user's ORIGINAL SQL dry-runs cleanly, the
-    the cap-guard falls back to the pre-#171 per-table SELECT * approach
+    cap-guard uses the original SQL's byte estimate. No more synthetic
-    so a non-portable query still gets a (loose) cap estimate instead of
+    `SELECT *` over-estimate.
-    fail-opening.
+
-    """
+    Bare-name reference populates `dry_run_set` so the cap-guard
    actually fires. Mock returns parse-error on the first call
    (rewritten SQL) and small bytes on the second (original)."""
    from connectors.bigquery.access import BqAccessError
    _register_bq_remote_row("ue", "finance", "ue")
@ -259,10 +261,10 @@ def test_guardrail_falls_back_to_per_table_estimate_on_bq_parse_error(
    def fake_dry_run(_bq, sql):
        state["calls"].append(sql)
-        # First call (rewritten user SQL) → BQ parse error.
+        # First call (rewritten SQL) → BQ parse error.
        if len(state["calls"]) == 1:
-            raise BqAccessError("bq_bad_request", "Syntax error: unexpected '::'")
+            raise BqAccessError("bq_bad_request", "Syntax error: simulated")
-        # Second call (fallback per-table SELECT *) → small bytes, pass cap.
+        # Second call (the user's original SQL) → small, passes cap.
        return 4096
    monkeypatch.setattr(
@ -271,28 +273,78 @@ def test_guardrail_falls_back_to_per_table_estimate_on_bq_parse_error(
    c = seeded_app["client"]
    token = seeded_app["admin_token"]
-    # SQL with DuckDB-only `::INT` cast that BQ would reject.
+    user_sql = "SELECT order_id FROM ue WHERE country = 'CZ'"
    r = c.post("/api/query", json={"sql": user_sql}, headers=_auth(token))
    # Two dry-runs: rewritten then original. No third synthetic-SELECT-*
    # call.
    assert len(state["calls"]) == 2, (
        f"expected rewritten + original-SQL retry, got "
        f"{len(state['calls'])}: {state['calls']}"
    )
    assert state["calls"][1] == user_sql, (
        f"second call must be the user's ORIGINAL SQL, got "
        f"{state['calls'][1]!r}"
    )
    # The response must NOT be remote_scan_too_large from a synthetic
    # over-estimate — 4096 bytes is well under the 5 GiB cap.
    if r.status_code == 400:
        detail = r.json().get("detail", {})
        if isinstance(detail, dict):
            assert detail.get("reason") != "remote_scan_too_large", detail
 def test_fallback_fails_fast_on_pure_duckdb_syntax(
    seeded_app, mock_dry_run, monkeypatch,
 ):
    """When BOTH the rewritten and original SQL fail with `bq_bad_request`
    (true DuckDB-only syntax like `::INT`), return HTTP 400
    `remote_estimate_failed` — never silently over-estimate via a
    synthetic `SELECT *`."""
    from connectors.bigquery.access import BqAccessError
    _register_bq_remote_row("ue", "finance", "ue")
    state = {"calls": []}
    def always_parse_error(_bq, sql):
        state["calls"].append(sql)
        raise BqAccessError("bq_bad_request", "Syntax error: unexpected '::'")
    monkeypatch.setattr(
        "app.api.query._bq_dry_run_bytes", always_parse_error, raising=False,
    )
    c = seeded_app["client"]
    token = seeded_app["admin_token"]
    r = c.post(
        "/api/query",
        json={"sql": "SELECT order_id::INT FROM ue WHERE country = 'CZ'"},
        headers=_auth(token),
    )
-    # Two dry-runs (rewritten + fallback per-table) before the (failed)
+    # Two dry-runs (rewritten + original retry). NO synthetic SELECT * fallback.
    # execute. Status will be a downstream error from analytics.execute()
    # since `::INT` doesn't work in DuckDB either against a remote view —
    # but the GUARDRAIL must have completed without 5xx-ing.
    assert len(state["calls"]) == 2, (
-        f"expected 1 rewritten + 1 fallback dry-run, got {len(state['calls'])}: "
+        f"expected 1 rewritten + 1 original-retry, got "
-        f"{state['calls']}"
+        f"{len(state['calls'])}: {state['calls']}"
    )
-    assert "::" in state["calls"][0], "first call should be the rewritten user SQL"
+    # No call should be a synthetic ``SELECT * FROM `<project>...```. The
-    assert state["calls"][1].startswith("SELECT * FROM"), (
+    # original-SQL retry contains the user's SELECT clause.
-        "second call should be the per-table fallback"
+    for c_sql in state["calls"]:
-    )
+        # If a call is just a synthetic ``SELECT * FROM `<project>.<bucket>.<table>```
-    # Whatever HTTP status comes back must NOT be 502 from the guard's
+        # the user's `WHERE country = 'CZ'` would be missing.
-    # transport-error path — fallback must absorb the bq_bad_request.
+        if c_sql.startswith("SELECT * FROM `") and "WHERE" not in c_sql:
-    assert r.status_code != 502, r.json()
+            raise AssertionError(
                f"synthetic SELECT * fallback was used: {c_sql!r}"
            )
    assert r.status_code == 400, r.json()
    detail = r.json().get("detail", {})
    assert isinstance(detail, dict), detail
    assert detail.get("kind") == "remote_estimate_failed", detail
    assert "underlying" in detail, detail
    assert "agnes catalog" in detail.get("hint", "").lower() or \
           "backtick" in detail.get("hint", "").lower(), detail
 def test_guardrail_propagates_502_on_non_parse_bq_errors(