From c32be3fe96cad3bfae2b53be1d501ef5fb187ec9 Mon Sep 17 00:00:00 2001
From: ZdenekSrotyr <zdenek.srotyr@keboola.com>
Date: Wed, 6 May 2026 17:52:18 +0200
Subject: [PATCH] fix(query): cap-guard fallback retries original SQL, fails
 fast (#201)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When BQ rejects the rewritten dry-run SQL with `bq_bad_request`, the
cap-guard now retries with the user's ORIGINAL SQL instead of building
a synthetic `SELECT * FROM <table>` per registered table. The
synthetic path threw away user filters / projections / partition
predicates and routinely ballooned the estimate to "full table size",
falsely tripping `remote_scan_too_large` on legitimate narrow queries
(typical issue #201 trace: rewriter corrupts a backtick path → BQ
parse error → synthetic over-estimate → 400).

Behaviour:

- Rewritten SQL succeeds: same as before (issue #171 single-dry-run).
- Rewritten SQL parse-errors, original SQL succeeds: use original
  estimate. Common case for users submitting BQ-native input.
- Both fail with `bq_bad_request`: HTTP 400 `remote_estimate_failed`
  with a hint pointing at `agnes catalog` / BQ-native syntax. No
  silent over-estimate.
- Non-parse BQ error (forbidden, upstream): still 502 as before.

This is a behaviour change for clients matching error kinds — failure
to estimate scan size now surfaces as `remote_estimate_failed`
instead of being masked behind `remote_scan_too_large` from the
synthetic path.

Replaces the existing `test_guardrail_falls_back_to_per_table_estimate_on_bq_parse_error`
(which pinned the old contract) with `test_fallback_tries_original_sql_first`
and `test_fallback_fails_fast_on_pure_duckdb_syntax`.
---
 app/api/query.py                  | 124 +++++++++++++++++-------------
 tests/test_api_query_guardrail.py | 100 ++++++++++++++++++------
 2 files changed, 145 insertions(+), 79 deletions(-)
diff --git a/app/api/query.py b/app/api/query.py
index 5ab0d63..0da45d4 100644
--- a/app/api/query.py
+++ b/app/api/query.py
@@ -891,28 +891,32 @@ def _bq_quota_and_cap_guard(
     partitioned/clustered tables and rejecting narrow queries that BQ
     itself would dry-run as a few MB.
 
-    Fallback: if BQ rejects the rewritten SQL with a parse-level
-    ``client_error`` (e.g. DuckDB-only syntax like ``::INT`` casts that
-    don't translate to BQ), fall back to the pre-#171 per-table
-    SELECT * approach so the cap-guard still functions — over-estimate
-    is preferred over fail-open. Forbidden / upstream errors still
-    propagate as HTTP 502.
+    Issue #201 fix: when BQ rejects the rewritten SQL with a parse-level
+    ``bq_bad_request`` (e.g. DuckDB-only syntax like ``::INT`` casts, or
+    a rewriter bug that broke valid BQ-native input), retry with the
+    user's ORIGINAL SQL — BQ-native input dry-runs cleanly. If the
+    original ALSO fails, return a structured `remote_estimate_failed`
+    HTTP 400 instead of the pre-#201 synthetic ``SELECT *`` per-table
+    over-estimate. The synthetic fallback threw away user filters and
+    routinely ballooned to "full table size", blocking legitimate narrow
+    queries via `remote_scan_too_large`. Forbidden / upstream errors
+    still propagate as HTTP 502.
 
     Flow:
     1. `check_daily_budget` — over-cap users get 429 BEFORE any BQ work.
     2. `quota.acquire(user_id)` opened — concurrent-slot held throughout.
     3. Single dry-run of rewritten user SQL → `total_bytes`.
-       On parse error, fall back to per-table SELECT * → sum.
+       On parse error, retry with the user's original SQL.
+       On second parse error, raise 400 `remote_estimate_failed`.
     4. If total > cap → 400 `remote_scan_too_large`.
     5. Yield. Caller runs `analytics.execute(...)` + `record_bytes(...)`.
     6. On exit, slot released.
 
     Mutates `dry_run_set` in place: the third tuple element (bytes) is
     populated so the caller can sum and record bytes against the user's
-    quota post-flight. Single-dry-run path puts `total_bytes` on the first
-    entry and zero on the rest (BQ doesn't expose per-table bytes for a
-    composite query); the caller's `sum(b for _, _, b in dry_run_set)`
-    still equals `total_bytes`.
+    quota post-flight. Pin `total_bytes` on entry 0 and zero on the rest
+    — BQ doesn't expose per-table bytes for a composite query — so
+    `sum(b for _, _, b in dry_run_set)` still equals `total_bytes`.
     """
     quota = _build_quota_tracker()
     try:
@@ -953,61 +957,71 @@ def _bq_quota_and_cap_guard(
                 sql, name_lookups, project,
             )
 
-            # Try the single-dry-run path first (issue #171). Falls back
-            # to the per-table SELECT * approach only on BQ parse errors
-            # (kind="bq_bad_request" — DuckDB-only syntax that BQ can't
-            # translate). All other BQ errors propagate as 502 below.
+            # Try the single-dry-run path first (issue #171). On BQ parse
+            # errors (`bq_bad_request` — typically DuckDB-only syntax the
+            # rewriter couldn't translate, OR — pre-#201 fix — a
+            # rewriter-corrupted backtick path) retry the user's ORIGINAL
+            # SQL: when the user submitted BQ-native SQL, the rewriter is
+            # the only thing standing between them and a clean dry-run.
+            # If the original ALSO fails, this is true DuckDB-only syntax
+            # that BQ cannot estimate — fail fast with a structured
+            # `remote_estimate_failed` instead of the pre-#201 synthetic
+            # `SELECT *` over-estimate (which threw away user filters and
+            # often ballooned to "full table size", blocking legitimate
+            # narrow queries via `remote_scan_too_large`).
+            #
+            # All other BQ errors (forbidden, upstream) propagate as 502.
             total_bytes = 0
-            used_fallback = False
             try:
                 total_bytes = _bq_dry_run_bytes(bq, rewritten_sql)
             except BqAccessError as exc:
-                if exc.kind == "bq_bad_request":
-                    logger.warning(
-                        "BQ dry-run rejected the rewritten SQL "
-                        "(kind=%s, message=%s). Falling back to per-table "
-                        "SELECT * estimate; the cap check will over-estimate "
-                        "scan bytes for this query. Consider rewriting to "
-                        "BQ-native syntax for a tight pre-check.",
-                        exc.kind, exc.message,
-                    )
-                    used_fallback = True
-                else:
+                if exc.kind != "bq_bad_request":
                     raise HTTPException(status_code=502, detail={
                         "kind": exc.kind,
                         "message": exc.message,
                         **(exc.details or {}),
                     })
-
-            if used_fallback:
-                # Pre-#171 path: estimate per registered table from a
-                # synthetic SELECT *. Over-estimates partitioned scans but
-                # never under-estimates, so the cap still bounds risk.
-                for i, (bucket, source_table, _) in enumerate(dry_run_set):
-                    fallback_sql = (
-                        f"SELECT * FROM `{project}.{bucket}.{source_table}`"
-                    )
-                    try:
-                        est = _bq_dry_run_bytes(bq, fallback_sql)
-                    except BqAccessError as exc:
+                logger.warning(
+                    "BQ dry-run rejected the rewritten SQL "
+                    "(kind=%s, message=%s). Retrying with the user's "
+                    "original SQL.",
+                    exc.kind, exc.message,
+                )
+                try:
+                    total_bytes = _bq_dry_run_bytes(bq, sql)
+                except BqAccessError as exc2:
+                    if exc2.kind != "bq_bad_request":
                         raise HTTPException(status_code=502, detail={
-                            "kind": exc.kind,
-                            "message": exc.message,
-                            **(exc.details or {}),
+                            "kind": exc2.kind,
+                            "message": exc2.message,
+                            **(exc2.details or {}),
                         })
-                    dry_run_set[i] = (bucket, source_table, est)
-                    total_bytes += est
-            else:
-                # Single-dry-run path. Distribute the total to dry_run_set
-                # so the caller's `record_bytes(sum(...))` stays correct.
-                # Per-table breakdown is unavailable from a composite
-                # dry-run; pin total to entry 0, zero the rest.
-                if dry_run_set:
-                    b0, t0, _ = dry_run_set[0]
-                    dry_run_set[0] = (b0, t0, total_bytes)
-                    for i in range(1, len(dry_run_set)):
-                        bi, ti, _ = dry_run_set[i]
-                        dry_run_set[i] = (bi, ti, 0)
+                    raise HTTPException(status_code=400, detail={
+                        "kind": "remote_estimate_failed",
+                        "message": (
+                            "Could not estimate scan size for this query."
+                        ),
+                        "hint": (
+                            "Use a registered table name from `agnes "
+                            "catalog`, or write BQ-native SQL with full "
+                            "backtick paths. Pure DuckDB-only syntax is "
+                            "not supported for --remote queries."
+                        ),
+                        "underlying": exc2.message,
+                    })
+
+            # Distribute the total to dry_run_set so the caller's
+            # `record_bytes(sum(...))` stays correct. Per-table breakdown
+            # is unavailable from a composite dry-run; pin total to entry
+            # 0, zero the rest. (Same accounting symmetry whether the
+            # bytes came from the rewritten SQL or the original-SQL
+            # retry.)
+            if dry_run_set:
+                b0, t0, _ = dry_run_set[0]
+                dry_run_set[0] = (b0, t0, total_bytes)
+                for i in range(1, len(dry_run_set)):
+                    bi, ti, _ = dry_run_set[i]
+                    dry_run_set[i] = (bi, ti, 0)
 
             if cap_bytes > 0 and total_bytes > cap_bytes:
                 tables = [f"{b}.{t}" for b, t, _ in dry_run_set]
diff --git a/tests/test_api_query_guardrail.py b/tests/test_api_query_guardrail.py
index afd75c9..05f8143 100644
--- a/tests/test_api_query_guardrail.py
+++ b/tests/test_api_query_guardrail.py
@@ -242,15 +242,17 @@ def test_guardrail_invokes_dry_run_exactly_once_per_request(
     assert "`test-data-prj.marketing.traffic`" in state["last_sql"]
 
 
-def test_guardrail_falls_back_to_per_table_estimate_on_bq_parse_error(
+def test_fallback_tries_original_sql_first(
     seeded_app, mock_dry_run, monkeypatch,
 ):
-    """When BQ rejects the rewritten SQL with ``bq_bad_request`` (DuckDB-only
-    syntax that doesn't translate — e.g. ``::INT`` casts, ``STRPOS``, …),
-    the cap-guard falls back to the pre-#171 per-table SELECT * approach
-    so a non-portable query still gets a (loose) cap estimate instead of
-    fail-opening.
-    """
+    """Issue #201 — when the rewriter produces SQL that BQ rejects with
+    `bq_bad_request` but the user's ORIGINAL SQL dry-runs cleanly, the
+    cap-guard uses the original SQL's byte estimate. No more synthetic
+    `SELECT *` over-estimate.
+
+    Bare-name reference populates `dry_run_set` so the cap-guard
+    actually fires. Mock returns parse-error on the first call
+    (rewritten SQL) and small bytes on the second (original)."""
     from connectors.bigquery.access import BqAccessError
 
     _register_bq_remote_row("ue", "finance", "ue")
@@ -259,10 +261,10 @@ def test_guardrail_falls_back_to_per_table_estimate_on_bq_parse_error(
 
     def fake_dry_run(_bq, sql):
         state["calls"].append(sql)
-        # First call (rewritten user SQL) → BQ parse error.
+        # First call (rewritten SQL) → BQ parse error.
         if len(state["calls"]) == 1:
-            raise BqAccessError("bq_bad_request", "Syntax error: unexpected '::'")
-        # Second call (fallback per-table SELECT *) → small bytes, pass cap.
+            raise BqAccessError("bq_bad_request", "Syntax error: simulated")
+        # Second call (the user's original SQL) → small, passes cap.
         return 4096
 
     monkeypatch.setattr(
@@ -271,28 +273,78 @@ def test_guardrail_falls_back_to_per_table_estimate_on_bq_parse_error(
 
     c = seeded_app["client"]
     token = seeded_app["admin_token"]
-    # SQL with DuckDB-only `::INT` cast that BQ would reject.
+    user_sql = "SELECT order_id FROM ue WHERE country = 'CZ'"
+    r = c.post("/api/query", json={"sql": user_sql}, headers=_auth(token))
+
+    # Two dry-runs: rewritten then original. No third synthetic-SELECT-*
+    # call.
+    assert len(state["calls"]) == 2, (
+        f"expected rewritten + original-SQL retry, got "
+        f"{len(state['calls'])}: {state['calls']}"
+    )
+    assert state["calls"][1] == user_sql, (
+        f"second call must be the user's ORIGINAL SQL, got "
+        f"{state['calls'][1]!r}"
+    )
+    # The response must NOT be remote_scan_too_large from a synthetic
+    # over-estimate — 4096 bytes is well under the 5 GiB cap.
+    if r.status_code == 400:
+        detail = r.json().get("detail", {})
+        if isinstance(detail, dict):
+            assert detail.get("reason") != "remote_scan_too_large", detail
+
+
+def test_fallback_fails_fast_on_pure_duckdb_syntax(
+    seeded_app, mock_dry_run, monkeypatch,
+):
+    """When BOTH the rewritten and original SQL fail with `bq_bad_request`
+    (true DuckDB-only syntax like `::INT`), return HTTP 400
+    `remote_estimate_failed` — never silently over-estimate via a
+    synthetic `SELECT *`."""
+    from connectors.bigquery.access import BqAccessError
+
+    _register_bq_remote_row("ue", "finance", "ue")
+
+    state = {"calls": []}
+
+    def always_parse_error(_bq, sql):
+        state["calls"].append(sql)
+        raise BqAccessError("bq_bad_request", "Syntax error: unexpected '::'")
+
+    monkeypatch.setattr(
+        "app.api.query._bq_dry_run_bytes", always_parse_error, raising=False,
+    )
+
+    c = seeded_app["client"]
+    token = seeded_app["admin_token"]
     r = c.post(
         "/api/query",
         json={"sql": "SELECT order_id::INT FROM ue WHERE country = 'CZ'"},
         headers=_auth(token),
     )
 
-    # Two dry-runs (rewritten + fallback per-table) before the (failed)
-    # execute. Status will be a downstream error from analytics.execute()
-    # since `::INT` doesn't work in DuckDB either against a remote view —
-    # but the GUARDRAIL must have completed without 5xx-ing.
+    # Two dry-runs (rewritten + original retry). NO synthetic SELECT * fallback.
     assert len(state["calls"]) == 2, (
-        f"expected 1 rewritten + 1 fallback dry-run, got {len(state['calls'])}: "
-        f"{state['calls']}"
+        f"expected 1 rewritten + 1 original-retry, got "
+        f"{len(state['calls'])}: {state['calls']}"
     )
-    assert "::" in state["calls"][0], "first call should be the rewritten user SQL"
-    assert state["calls"][1].startswith("SELECT * FROM"), (
-        "second call should be the per-table fallback"
-    )
-    # Whatever HTTP status comes back must NOT be 502 from the guard's
-    # transport-error path — fallback must absorb the bq_bad_request.
-    assert r.status_code != 502, r.json()
+    # No call should be a synthetic ``SELECT * FROM `<project>...```. The
+    # original-SQL retry contains the user's SELECT clause.
+    for c_sql in state["calls"]:
+        # If a call is just a synthetic ``SELECT * FROM `<project>.<bucket>.<table>```
+        # the user's `WHERE country = 'CZ'` would be missing.
+        if c_sql.startswith("SELECT * FROM `") and "WHERE" not in c_sql:
+            raise AssertionError(
+                f"synthetic SELECT * fallback was used: {c_sql!r}"
+            )
+
+    assert r.status_code == 400, r.json()
+    detail = r.json().get("detail", {})
+    assert isinstance(detail, dict), detail
+    assert detail.get("kind") == "remote_estimate_failed", detail
+    assert "underlying" in detail, detail
+    assert "agnes catalog" in detail.get("hint", "").lower() or \
+           "backtick" in detail.get("hint", "").lower(), detail
 
 
 def test_guardrail_propagates_502_on_non_parse_bq_errors(