Merge pull request #208 from keboola/zs/issue-201-rewriter-backtick

fix(query): rewriter respects backtick paths; tighten cap-guard fallback (#201)
2026-05-06 23:09:43 +02:00 · 2026-05-06 23:09:43 +02:00 · e3494607bf
commit e3494607bf
parent 1b49de1568 bc55af6e88
5 changed files with 586 additions and 87 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -10,6 +10,40 @@ CalVer image tags (`stable-YYYY.MM.N`, `dev-YYYY.MM.N`) are produced for every C
 ## [Unreleased]
 ## [0.42.0] — 2026-05-06
 ### Fixed
 - `agnes query --remote`: full backtick BigQuery paths in user SQL are no
  longer corrupted by the registered-name rewriter. Previously a query
  like ``SELECT … FROM `<project>.<dataset>.<table>` WHERE …`` whose
  table name happened to be registered as a bare-name alias would have
  the alias re-substituted *inside* the backtick path, producing
  malformed SQL that BigQuery rejected with a parse error. The cap-guard
  then fell back to a filter-less `SELECT *` size estimate (often orders
  of magnitude larger than the real scan), blocking the query as
  `remote_scan_too_large`. Issue #201.
 ### Changed
 - `agnes query --remote`: cap-guard fallback no longer estimates from
  a synthetic `SELECT *` when the rewritten SQL fails dry-run. It first
  retries the user's original SQL (handles BQ-native input cleanly), and
  only when *that* also fails returns a structured `remote_estimate_failed`
  HTTP 400 with a hint instead of silently over-estimating.
 - **BREAKING (clients matching error kinds)**: failure to estimate
  remote-query scan size now returns `kind="remote_estimate_failed"`
  instead of being masked as `remote_scan_too_large` caused by
  over-estimation. Operators that grep for the old kind in dashboards
  should update.
 ### Security
 - `agnes query --remote`: full backtick BigQuery paths are now
  registry-gated identically to `bq."<dataset>"."<table>"` syntax.
  Previously, full backtick paths bypassed Agnes RBAC entirely — only
  the configured service account scope limited what users could query.
  New `bq_path_cross_project` (when the project ≠ configured data
  project) and `bq_path_not_registered` (when path is unknown) error
  kinds. Issue #201.
 ## [0.41.0] — 2026-05-06
 ### Fixed
--- a/app/api/query.py
+++ b/app/api/query.py
@ -90,6 +90,29 @@ BQ_PATH = re.compile(
 )
 # Issue #201 — full backtick BQ path `<project>.<dataset>.<table>` in user
 # SQL. Used by the registry-gating pass and (via `_mask_backticks`) to keep
 # bare-name regexes from firing inside backtick-quoted segments.
 _BACKTICK_SEGMENT = re.compile(r'`[^`]*`')
 _BACKTICK_FULL_PATH = re.compile(r'`([^.`]+)\.([^.`]+)\.([^.`]+)`')
 def _mask_backticks(sql: str) -> str:
    """Replace each `…`-quoted segment with spaces of equal length so
    word-boundary regexes find positions outside backticks but ignore
    everything inside. Preserves all character offsets so ``re.search``
    on the masked string returns matches at the same positions as on the
    original.
    Issue #201: `\\b` matches inside backtick segments because both `.`
    and `` ` `` are non-word characters. A registered bare-name like
    ``unit_economics`` would otherwise match inside a user-supplied full
    backtick path ``\\`<project>.<dataset>.unit_economics\\``` and get
    falsely rewritten — corrupting the user's intended SQL.
    """
    return _BACKTICK_SEGMENT.sub(lambda m: ' ' * len(m.group(0)), sql)
 def _default_remote_query_cap_bytes() -> int:
    """5 GiB default cap on /api/query BQ-touching scans. Configurable via
    `data_source.bigquery.bq_max_scan_bytes` in /admin/server-config —
@ -197,11 +220,18 @@ def execute_query(
                if r.get("name") and r.get("id") in allowed_ids
            }
-            # Check if query references any forbidden tables (word-boundary match)
+            # Check if query references any forbidden tables (word-boundary
            # match). Issue #201: mask backtick segments so `\b` doesn't
            # falsely fire inside a user-supplied full backtick path like
            # `<project>.<dataset>.<table>` whose final segment happens to
            # collide with a forbidden master view name. The full-path
            # registry-gate downstream is the proper authorization check
            # for those.
            sql_lower_masked = _mask_backticks(sql_lower)
            forbidden = all_views - allowed_view_names
            for table in forbidden:
                pattern = r'\b' + re.escape(table.lower()) + r'\b'
-                if re.search(pattern, sql_lower):
+                if re.search(pattern, sql_lower_masked):
                    raise HTTPException(status_code=403, detail=f"Access denied to table '{table}'")
        # ---- #160 BQ remote-row guardrail + RBAC patch -------------------
@ -467,6 +497,11 @@ def _bq_guardrail_inputs(
    name_lookups: list = []
    seen_paths: set = set()
    accessible_set = set(allowed) if allowed is not None else None
    # Issue #201: mask backtick segments so a registered bare name like
    # `unit_economics` doesn't false-positive on a user-supplied full
    # backtick path `<project>.<dataset>.unit_economics`. The full-path
    # pass below registry-gates those properly.
    sql_lower_masked = _mask_backticks(sql_lower)
    for r in repo.list_by_source("bigquery"):
        if (r.get("query_mode") or "") != "remote":
            continue
@ -481,7 +516,7 @@ def _bq_guardrail_inputs(
            # before we get here. Defensive skip.
            continue
        pattern = r'\b' + re.escape(str(name).lower()) + r'\b'
-        if re.search(pattern, sql_lower):
+        if re.search(pattern, sql_lower_masked):
            key = (bucket.lower(), source_table.lower())
            if key not in seen_paths:
                seen_paths.add(key)
@ -529,6 +564,66 @@ def _bq_guardrail_inputs(
                seen_paths.add(key)
                dry_run.append((bucket, source_table, 0))
    # 3. Full backtick path `<project>.<dataset>.<table>` pass (issue #201).
    # Pre-#201 these bypassed Agnes RBAC entirely — only the configured
    # service account scope limited which tables a user could reach. Gate
    # them identically to the `bq.<ds>.<tbl>` pass: must match the
    # configured data project, must point at a registered row, and the
    # caller must hold a grant on that row's id (admin bypasses the grant
    # check but still requires registration + project match).
    #
    # Lazy `get_bq_access()` import via the module-level alias so tests
    # can monkeypatch a fake. When BQ isn't configured (no data project),
    # fall through silently — full backtick paths can't possibly resolve
    # against this instance, so leave them to BQ to reject if a query
    # somehow makes it through.
    try:
        bq = get_bq_access()
        data_project = (bq.projects.data or "").strip()
    except Exception:
        data_project = ""
    if data_project:
        for m in _BACKTICK_FULL_PATH.finditer(sql):
            proj, ds, tbl = m.group(1), m.group(2), m.group(3)
            if proj.lower() != data_project.lower():
                return [], [], {
                    "reason": "bq_path_cross_project",
                    "path": f"`{proj}.{ds}.{tbl}`",
                    "expected_project": data_project,
                    "hint": (
                        "--remote queries can only reference tables in the "
                        "configured BigQuery data project. Register "
                        "cross-project tables via `agnes admin "
                        "register-table` if needed."
                    ),
                }
            row = repo.find_by_bq_path(ds, tbl)
            if row is None:
                return [], [], {
                    "reason": "bq_path_not_registered",
                    "path": f"`{proj}.{ds}.{tbl}`",
                    "hint": (
                        "Direct BigQuery paths must point to a registered "
                        "table. Register via `agnes admin register-table` "
                        "or use the registered name from `agnes catalog`."
                    ),
                }
            if not is_admin:
                if accessible_set is None or row["id"] not in accessible_set:
                    return [], [], {
                        "reason": "bq_path_access_denied",
                        "path": f"`{proj}.{ds}.{tbl}`",
                        "registered_as": row["name"],
                    }
            bucket = row["bucket"]
            source_table = row["source_table"]
            if bucket and source_table:
                key = (bucket.lower(), source_table.lower())
                if key not in seen_paths:
                    seen_paths.add(key)
                    dry_run.append((bucket, source_table, 0))
    return dry_run, name_lookups, None
@ -579,6 +674,15 @@ def _rewrite_bq_table_refs_to_native(
    # name up in a case-insensitive dict. Single-pass means freshly
    # inserted backticked text isn't re-scanned, fixing the
    # project-ID-contains-name corruption (Devin Review on query.py:464).
    #
    # Issue #201: split the SQL on `…` segments and rewrite ONLY in the
    # outside-backtick chunks. Without this, a user-supplied full backtick
    # path like ``\\`<project>.<dataset>.unit_economics\\``` whose final
    # segment matches a registered bare name would have the bare-name
    # regex fire INSIDE the backticks (since `\\b` treats both `.` and
    # `` ` `` as non-word boundaries), producing malformed nested
    # backticks. Splitting confines the rewrite to user identifier
    # positions where bare-name resolution is the intended behaviour.
    if name_lookups:
        # Map name (lower-cased) → backticked target. Names are
        # case-insensitive on the input side per the existing helper
@ -598,7 +702,15 @@ def _rewrite_bq_table_refs_to_native(
        def _name_repl(m: re.Match) -> str:
            return name_to_target[m.group(1).lower()]
-        out = re.sub(pattern, _name_repl, out, flags=re.IGNORECASE)
+        # `re.split` with a captured group returns: [outside, backtick,
        # outside, backtick, …]. Even indices are outside-backtick chunks
        # eligible for bare-name rewrite; odd indices are full backtick
        # segments preserved verbatim.
        parts = re.split(r'(`[^`]*`)', out)
        for i, part in enumerate(parts):
            if i % 2 == 0:
                parts[i] = re.sub(pattern, _name_repl, part, flags=re.IGNORECASE)
        out = "".join(parts)
    # Pass 2: bq."ds"."tbl" / bq.ds.tbl → `<project>.<ds>.<tbl>`.
    def _bq_path_repl(m: re.Match) -> str:
@ -675,8 +787,17 @@ def _rewrite_user_sql_for_bigquery_query(
        return user_sql, False
    # Find all referenced BQ remote-mode rows (bare-name + direct bq.path).
-    # Mirrors the non-RBAC parts of `_bq_guardrail_inputs`.
+    # Mirrors the non-RBAC parts of `_bq_guardrail_inputs`. Issue #201:
    # bare-name regex must run against a backtick-masked copy so a
    # registered name like ``orders`` doesn't false-positive when it
    # appears as the table segment of a user-supplied full backtick path
    # like ``\\`<project>.<dataset>.orders\\```. Without masking, the
    # cross-source check below would falsely conclude the SQL touches
    # both BQ-remote and local sources, dropping every backtick-path
    # query into the 50-100× slower ATTACH-catalog fallback. Devin
    # Review on PR #208.
    sql_lower = user_sql.lower()
    sql_lower_masked = _mask_backticks(sql_lower)
    name_lookups: list = []
    seen_paths: set = set()
@ -715,7 +836,7 @@ def _rewrite_user_sql_for_bigquery_query(
            # mix rewritten and non-rewritten BQ paths in one query.
            return user_sql, False
        pattern = r'\b' + re.escape(str(name).lower()) + r'\b'
-        if re.search(pattern, sql_lower):
+        if re.search(pattern, sql_lower_masked):
            key = (bucket.lower(), source_table.lower())
            if key not in seen_paths:
                seen_paths.add(key)
@ -752,7 +873,7 @@ def _rewrite_user_sql_for_bigquery_query(
            # Same name registered both BQ-remote and local? Pathological;
            # skip as a safety measure.
            return user_sql, False
-        if re.search(r'\b' + re.escape(name_lc) + r'\b', sql_lower):
+        if re.search(r'\b' + re.escape(name_lc) + r'\b', sql_lower_masked):
            logger.info(
                "rewrite_skip_cross_source: user SQL references both "
                "BQ-remote and local-mode tables; falling back to "
@ -839,28 +960,32 @@ def _bq_quota_and_cap_guard(
    partitioned/clustered tables and rejecting narrow queries that BQ
    itself would dry-run as a few MB.
-    Fallback: if BQ rejects the rewritten SQL with a parse-level
+    Issue #201 fix: when BQ rejects the rewritten SQL with a parse-level
-    ``client_error`` (e.g. DuckDB-only syntax like ``::INT`` casts that
+    ``bq_bad_request`` (e.g. DuckDB-only syntax like ``::INT`` casts, or
-    don't translate to BQ), fall back to the pre-#171 per-table
+    a rewriter bug that broke valid BQ-native input), retry with the
-    SELECT * approach so the cap-guard still functions — over-estimate
+    user's ORIGINAL SQL — BQ-native input dry-runs cleanly. If the
-    is preferred over fail-open. Forbidden / upstream errors still
+    original ALSO fails, return a structured `remote_estimate_failed`
-    propagate as HTTP 502.
+    HTTP 400 instead of the pre-#201 synthetic ``SELECT *`` per-table
    over-estimate. The synthetic fallback threw away user filters and
    routinely ballooned to "full table size", blocking legitimate narrow
    queries via `remote_scan_too_large`. Forbidden / upstream errors
    still propagate as HTTP 502.
    Flow:
    1. `check_daily_budget` — over-cap users get 429 BEFORE any BQ work.
    2. `quota.acquire(user_id)` opened — concurrent-slot held throughout.
    3. Single dry-run of rewritten user SQL → `total_bytes`.
-       On parse error, fall back to per-table SELECT * → sum.
+       On parse error, retry with the user's original SQL.
       On second parse error, raise 400 `remote_estimate_failed`.
    4. If total > cap → 400 `remote_scan_too_large`.
    5. Yield. Caller runs `analytics.execute(...)` + `record_bytes(...)`.
    6. On exit, slot released.
    Mutates `dry_run_set` in place: the third tuple element (bytes) is
    populated so the caller can sum and record bytes against the user's
-    quota post-flight. Single-dry-run path puts `total_bytes` on the first
+    quota post-flight. Pin `total_bytes` on entry 0 and zero on the rest
-    entry and zero on the rest (BQ doesn't expose per-table bytes for a
+    — BQ doesn't expose per-table bytes for a composite query — so
-    composite query); the caller's `sum(b for _, _, b in dry_run_set)`
+    `sum(b for _, _, b in dry_run_set)` still equals `total_bytes`.
    still equals `total_bytes`.
    """
    quota = _build_quota_tracker()
    try:
@ -901,55 +1026,65 @@ def _bq_quota_and_cap_guard(
                sql, name_lookups, project,
            )
-            # Try the single-dry-run path first (issue #171). Falls back
+            # Try the single-dry-run path first (issue #171). On BQ parse
-            # to the per-table SELECT * approach only on BQ parse errors
+            # errors (`bq_bad_request` — typically DuckDB-only syntax the
-            # (kind="bq_bad_request" — DuckDB-only syntax that BQ can't
+            # rewriter couldn't translate, OR — pre-#201 fix — a
-            # translate). All other BQ errors propagate as 502 below.
+            # rewriter-corrupted backtick path) retry the user's ORIGINAL
            # SQL: when the user submitted BQ-native SQL, the rewriter is
            # the only thing standing between them and a clean dry-run.
            # If the original ALSO fails, this is true DuckDB-only syntax
            # that BQ cannot estimate — fail fast with a structured
            # `remote_estimate_failed` instead of the pre-#201 synthetic
            # `SELECT *` over-estimate (which threw away user filters and
            # often ballooned to "full table size", blocking legitimate
            # narrow queries via `remote_scan_too_large`).
            #
            # All other BQ errors (forbidden, upstream) propagate as 502.
            total_bytes = 0
            used_fallback = False
            try:
                total_bytes = _bq_dry_run_bytes(bq, rewritten_sql)
            except BqAccessError as exc:
-                if exc.kind == "bq_bad_request":
+                if exc.kind != "bq_bad_request":
                    raise HTTPException(status_code=502, detail={
                        "kind": exc.kind,
                        "message": exc.message,
                        **(exc.details or {}),
                    })
                logger.warning(
                    "BQ dry-run rejected the rewritten SQL "
-                        "(kind=%s, message=%s). Falling back to per-table "
+                    "(kind=%s, message=%s). Retrying with the user's "
-                        "SELECT * estimate; the cap check will over-estimate "
+                    "original SQL.",
                        "scan bytes for this query. Consider rewriting to "
                        "BQ-native syntax for a tight pre-check.",
                    exc.kind, exc.message,
                )
-                    used_fallback = True
+                try:
-                else:
+                    total_bytes = _bq_dry_run_bytes(bq, sql)
                except BqAccessError as exc2:
                    if exc2.kind != "bq_bad_request":
                        raise HTTPException(status_code=502, detail={
-                        "kind": exc.kind,
+                            "kind": exc2.kind,
-                        "message": exc.message,
+                            "message": exc2.message,
-                        **(exc.details or {}),
+                            **(exc2.details or {}),
                        })
                    raise HTTPException(status_code=400, detail={
                        "kind": "remote_estimate_failed",
                        "message": (
                            "Could not estimate scan size for this query."
                        ),
                        "hint": (
                            "Use a registered table name from `agnes "
                            "catalog`, or write BQ-native SQL with full "
                            "backtick paths. Pure DuckDB-only syntax is "
                            "not supported for --remote queries."
                        ),
                        "underlying": exc2.message,
                    })
-            if used_fallback:
+            # Distribute the total to dry_run_set so the caller's
-                # Pre-#171 path: estimate per registered table from a
+            # `record_bytes(sum(...))` stays correct. Per-table breakdown
-                # synthetic SELECT *. Over-estimates partitioned scans but
+            # is unavailable from a composite dry-run; pin total to entry
-                # never under-estimates, so the cap still bounds risk.
+            # 0, zero the rest. (Same accounting symmetry whether the
-                for i, (bucket, source_table, _) in enumerate(dry_run_set):
+            # bytes came from the rewritten SQL or the original-SQL
-                    fallback_sql = (
+            # retry.)
                        f"SELECT * FROM `{project}.{bucket}.{source_table}`"
                    )
                    try:
                        est = _bq_dry_run_bytes(bq, fallback_sql)
                    except BqAccessError as exc:
                        raise HTTPException(status_code=502, detail={
                            "kind": exc.kind,
                            "message": exc.message,
                            **(exc.details or {}),
                        })
                    dry_run_set[i] = (bucket, source_table, est)
                    total_bytes += est
            else:
                # Single-dry-run path. Distribute the total to dry_run_set
                # so the caller's `record_bytes(sum(...))` stays correct.
                # Per-table breakdown is unavailable from a composite
                # dry-run; pin total to entry 0, zero the rest.
            if dry_run_set:
                b0, t0, _ = dry_run_set[0]
                dry_run_set[0] = (b0, t0, total_bytes)
--- a/pyproject.toml
+++ b/pyproject.toml
@ -1,6 +1,6 @@
 [project]
 name = "agnes-the-ai-analyst"
-version = "0.41.0"
+version = "0.42.0"
 description = "Agnes — AI Data Analyst platform for AI analytical systems"
 requires-python = ">=3.11,<3.14"
 license = "MIT"
--- a/tests/test_api_query_guardrail.py
+++ b/tests/test_api_query_guardrail.py
@ -242,15 +242,17 @@ def test_guardrail_invokes_dry_run_exactly_once_per_request(
    assert "`test-data-prj.marketing.traffic`" in state["last_sql"]
-def test_guardrail_falls_back_to_per_table_estimate_on_bq_parse_error(
+def test_fallback_tries_original_sql_first(
    seeded_app, mock_dry_run, monkeypatch,
 ):
-    """When BQ rejects the rewritten SQL with ``bq_bad_request`` (DuckDB-only
+    """Issue #201 — when the rewriter produces SQL that BQ rejects with
-    syntax that doesn't translate — e.g. ``::INT`` casts, ``STRPOS``, …),
+    `bq_bad_request` but the user's ORIGINAL SQL dry-runs cleanly, the
-    the cap-guard falls back to the pre-#171 per-table SELECT * approach
+    cap-guard uses the original SQL's byte estimate. No more synthetic
-    so a non-portable query still gets a (loose) cap estimate instead of
+    `SELECT *` over-estimate.
-    fail-opening.
+
-    """
+    Bare-name reference populates `dry_run_set` so the cap-guard
    actually fires. Mock returns parse-error on the first call
    (rewritten SQL) and small bytes on the second (original)."""
    from connectors.bigquery.access import BqAccessError
    _register_bq_remote_row("ue", "finance", "ue")
@ -259,10 +261,10 @@ def test_guardrail_falls_back_to_per_table_estimate_on_bq_parse_error(
    def fake_dry_run(_bq, sql):
        state["calls"].append(sql)
-        # First call (rewritten user SQL) → BQ parse error.
+        # First call (rewritten SQL) → BQ parse error.
        if len(state["calls"]) == 1:
-            raise BqAccessError("bq_bad_request", "Syntax error: unexpected '::'")
+            raise BqAccessError("bq_bad_request", "Syntax error: simulated")
-        # Second call (fallback per-table SELECT *) → small bytes, pass cap.
+        # Second call (the user's original SQL) → small, passes cap.
        return 4096
    monkeypatch.setattr(
@ -271,28 +273,78 @@ def test_guardrail_falls_back_to_per_table_estimate_on_bq_parse_error(
    c = seeded_app["client"]
    token = seeded_app["admin_token"]
-    # SQL with DuckDB-only `::INT` cast that BQ would reject.
+    user_sql = "SELECT order_id FROM ue WHERE country = 'CZ'"
    r = c.post("/api/query", json={"sql": user_sql}, headers=_auth(token))
    # Two dry-runs: rewritten then original. No third synthetic-SELECT-*
    # call.
    assert len(state["calls"]) == 2, (
        f"expected rewritten + original-SQL retry, got "
        f"{len(state['calls'])}: {state['calls']}"
    )
    assert state["calls"][1] == user_sql, (
        f"second call must be the user's ORIGINAL SQL, got "
        f"{state['calls'][1]!r}"
    )
    # The response must NOT be remote_scan_too_large from a synthetic
    # over-estimate — 4096 bytes is well under the 5 GiB cap.
    if r.status_code == 400:
        detail = r.json().get("detail", {})
        if isinstance(detail, dict):
            assert detail.get("reason") != "remote_scan_too_large", detail
 def test_fallback_fails_fast_on_pure_duckdb_syntax(
    seeded_app, mock_dry_run, monkeypatch,
 ):
    """When BOTH the rewritten and original SQL fail with `bq_bad_request`
    (true DuckDB-only syntax like `::INT`), return HTTP 400
    `remote_estimate_failed` — never silently over-estimate via a
    synthetic `SELECT *`."""
    from connectors.bigquery.access import BqAccessError
    _register_bq_remote_row("ue", "finance", "ue")
    state = {"calls": []}
    def always_parse_error(_bq, sql):
        state["calls"].append(sql)
        raise BqAccessError("bq_bad_request", "Syntax error: unexpected '::'")
    monkeypatch.setattr(
        "app.api.query._bq_dry_run_bytes", always_parse_error, raising=False,
    )
    c = seeded_app["client"]
    token = seeded_app["admin_token"]
    r = c.post(
        "/api/query",
        json={"sql": "SELECT order_id::INT FROM ue WHERE country = 'CZ'"},
        headers=_auth(token),
    )
-    # Two dry-runs (rewritten + fallback per-table) before the (failed)
+    # Two dry-runs (rewritten + original retry). NO synthetic SELECT * fallback.
    # execute. Status will be a downstream error from analytics.execute()
    # since `::INT` doesn't work in DuckDB either against a remote view —
    # but the GUARDRAIL must have completed without 5xx-ing.
    assert len(state["calls"]) == 2, (
-        f"expected 1 rewritten + 1 fallback dry-run, got {len(state['calls'])}: "
+        f"expected 1 rewritten + 1 original-retry, got "
-        f"{state['calls']}"
+        f"{len(state['calls'])}: {state['calls']}"
    )
-    assert "::" in state["calls"][0], "first call should be the rewritten user SQL"
+    # No call should be a synthetic ``SELECT * FROM `<project>...```. The
-    assert state["calls"][1].startswith("SELECT * FROM"), (
+    # original-SQL retry contains the user's SELECT clause.
-        "second call should be the per-table fallback"
+    for c_sql in state["calls"]:
        # If a call is just a synthetic ``SELECT * FROM `<project>.<bucket>.<table>```
        # the user's `WHERE country = 'CZ'` would be missing.
        if c_sql.startswith("SELECT * FROM `") and "WHERE" not in c_sql:
            raise AssertionError(
                f"synthetic SELECT * fallback was used: {c_sql!r}"
            )
-    # Whatever HTTP status comes back must NOT be 502 from the guard's
+
-    # transport-error path — fallback must absorb the bq_bad_request.
+    assert r.status_code == 400, r.json()
-    assert r.status_code != 502, r.json()
+    detail = r.json().get("detail", {})
    assert isinstance(detail, dict), detail
    assert detail.get("kind") == "remote_estimate_failed", detail
    assert "underlying" in detail, detail
    assert "agnes catalog" in detail.get("hint", "").lower() or \
           "backtick" in detail.get("hint", "").lower(), detail
 def test_guardrail_propagates_502_on_non_parse_bq_errors(
@ -432,3 +484,243 @@ def test_rewrite_helper_is_case_insensitive_on_bare_names():
    )
    assert "`p.fin.ue` WHERE `p.fin.ue`.id" in rewritten or \
           rewritten.lower().count("`p.fin.ue`") == 2
 # ---------------------------------------------------------------------------
 # Issue #201: rewriter must NOT touch text inside `…` backtick segments.
 # A user-supplied full BQ-native path `<project>.<dataset>.<table>` whose
 # table segment matches a registered bare name was being re-substituted
 # inside the backticks, producing malformed nested-backtick SQL that BQ
 # rejected with a parse error.
 # ---------------------------------------------------------------------------
 def test_rewrite_skips_inside_backtick_path():
    """Full backtick BQ path is preserved byte-for-byte even when its
    final segment matches a registered bare-name alias."""
    from app.api.query import _rewrite_user_sql_for_bq_dry_run
    sql = (
        "SELECT * FROM `my-prj.finance.unit_economics` "
        "WHERE country = 'CZ'"
    )
    rewritten = _rewrite_user_sql_for_bq_dry_run(
        sql=sql,
        name_lookups=[("unit_economics", "finance", "unit_economics")],
        project="my-prj",
    )
    # No corruption — input is already BQ-native, rewriter is a no-op here.
    assert rewritten == sql, (
        f"backtick path was rewritten:\n  in : {sql!r}\n  out: {rewritten!r}"
    )
    # Sanity: the malformed nested form must NOT appear.
    assert "`my-prj.finance.`my-prj" not in rewritten
 def test_rewrite_skips_inside_backtick_with_outside_bare_name():
    """Mixed SQL: a bare name outside backticks is rewritten as before,
    but an identically-named segment inside a backtick path is left
    alone."""
    from app.api.query import _rewrite_user_sql_for_bq_dry_run
    sql = (
        "SELECT a.id, b.col FROM ue a "
        "JOIN `my-prj.finance.ue` b ON a.id = b.id"
    )
    rewritten = _rewrite_user_sql_for_bq_dry_run(
        sql=sql,
        name_lookups=[("ue", "fin_alias", "ue_alias")],
        project="my-prj",
    )
    # Outside-backtick `ue` rewrites to the registered alias path.
    assert "`my-prj.fin_alias.ue_alias`" in rewritten
    # The user-supplied backtick path is preserved verbatim.
    assert "`my-prj.finance.ue`" in rewritten
    # The malformed nested form must NOT appear.
    assert "`my-prj.finance.`my-prj.fin_alias.ue_alias`" not in rewritten
 def test_guardrail_skips_bare_name_match_inside_backticks(
    seeded_app, mock_dry_run, monkeypatch,
 ):
    """The `name_lookups` collection populated by `_bq_guardrail_inputs`
    must not include a registered name when the only place that name
    appears in the SQL is inside a `…` backtick segment.
    Captures the rewritten SQL the guardrail forwards to the dry-run and
    asserts the bare-name was NOT substituted inside the user's backtick
    path.
    """
    _register_bq_remote_row("unit_economics", "finance", "unit_economics")
    captured = {"sql": None}
    def capturing_fake(_bq, sql):
        captured["sql"] = sql
        return 1024
    monkeypatch.setattr(
        "app.api.query._bq_dry_run_bytes", capturing_fake, raising=False,
    )
    c = seeded_app["client"]
    token = seeded_app["admin_token"]
    user_sql = (
        "SELECT * FROM `test-data-prj.finance.unit_economics` "
        "WHERE country = 'CZ'"
    )
    c.post("/api/query", json={"sql": user_sql}, headers=_auth(token))
    sent = captured["sql"]
    if sent is None:
        # Guardrail decided no BQ tables were referenced — that's also
        # an acceptable "no false-positive" outcome (Layer 3 will cover
        # the explicit registry check for full backtick paths). We just
        # need to ensure the bare-name regex didn't fire.
        return
    # The user's exact backtick path must survive verbatim — no nested
    # backticks introduced by a stray bare-name rewrite.
    assert "`test-data-prj.finance.unit_economics`" in sent, (
        f"backtick path corrupted by guardrail:\n  out: {sent!r}"
    )
    assert "`test-data-prj.finance.`test-data-prj" not in sent, (
        f"nested-backtick corruption signature present: {sent!r}"
    )
 # ---------------------------------------------------------------------------
 # Issue #201 Layer 3: full backtick BigQuery paths are registry-gated.
 # Pre-fix these bypassed Agnes RBAC entirely — only the configured service
 # account scope limited which tables a user could reach. Post-fix, they're
 # treated identically to `bq."<dataset>"."<table>"` syntax.
 # ---------------------------------------------------------------------------
 def test_full_backtick_path_unregistered_denied(seeded_app, mock_dry_run):
    """Full backtick path to an unregistered `<dataset>.<table>` (project
    matches the configured data project) → HTTP 403 with
    `bq_path_not_registered`."""
    c = seeded_app["client"]
    token = seeded_app["admin_token"]
    r = c.post(
        "/api/query",
        json={
            "sql": (
                "SELECT * FROM `test-data-prj.secret_ds.secret_tbl` "
                "WHERE country = 'CZ'"
            ),
        },
        headers=_auth(token),
    )
    assert r.status_code == 403, r.json()
    detail = r.json().get("detail", {})
    assert isinstance(detail, dict), detail
    assert detail.get("reason") == "bq_path_not_registered", detail
    assert "secret_ds" in detail.get("path", ""), detail
    assert "secret_tbl" in detail.get("path", ""), detail
 def test_full_backtick_path_cross_project_denied(seeded_app, mock_dry_run):
    """Full backtick path with project ≠ configured data project → HTTP
    403 with `bq_path_cross_project`. Even if the path happens to point
    at a registered (bucket, source_table), the project mismatch is the
    primary boundary."""
    _register_bq_remote_row("ue", "finance", "ue")
    c = seeded_app["client"]
    token = seeded_app["admin_token"]
    r = c.post(
        "/api/query",
        json={
            "sql": "SELECT * FROM `other-project.finance.ue` WHERE id = 1",
        },
        headers=_auth(token),
    )
    assert r.status_code == 403, r.json()
    detail = r.json().get("detail", {})
    assert isinstance(detail, dict), detail
    assert detail.get("reason") == "bq_path_cross_project", detail
    assert detail.get("expected_project") == "test-data-prj", detail
    assert "other-project" in detail.get("path", ""), detail
 def test_full_backtick_path_registered_admin_passes(
    seeded_app, mock_dry_run, monkeypatch,
 ):
    """Admin caller + registered path + matching project → no RBAC
    rejection. The dry-run fires (we can capture the SQL the guardrail
    forwards) and no `bq_path_*` reason appears in any error response."""
    _register_bq_remote_row("ue", "finance", "ue")
    captured = {"sql": None}
    def capturing_fake(_bq, sql):
        captured["sql"] = sql
        return 1024  # tiny — pass cap
    monkeypatch.setattr(
        "app.api.query._bq_dry_run_bytes", capturing_fake, raising=False,
    )
    c = seeded_app["client"]
    token = seeded_app["admin_token"]
    r = c.post(
        "/api/query",
        json={
            "sql": "SELECT * FROM `test-data-prj.finance.ue` WHERE id = 1",
        },
        headers=_auth(token),
    )
    # If 403, must NOT be the issue-#201 bq_path_* reasons.
    if r.status_code == 403:
        detail = r.json().get("detail", {})
        if isinstance(detail, dict):
            assert detail.get("reason") not in (
                "bq_path_not_registered",
                "bq_path_access_denied",
                "bq_path_cross_project",
            ), f"admin + registered path should pass RBAC: {detail}"
    # The dry-run was invoked, meaning Pass 3 added the path to dry_run_set
    # and the cap-guard fired. The user's WHERE clause must still be in
    # the dry-run SQL (validates Layer 1 — backtick-aware rewrite).
    assert captured["sql"] is not None, (
        "dry-run never fired — Pass 3 may not have registered the path"
    )
    assert "`test-data-prj.finance.ue`" in captured["sql"], captured["sql"]
    assert "WHERE id = 1" in captured["sql"], captured["sql"]
 def test_full_backtick_path_inside_string_literal_not_gated(
    seeded_app, mock_dry_run,
 ):
    """Defensive case: a backtick path appearing inside a SQL string
    literal (rare but possible) should not trigger Pass 3. Practically
    this is unreachable because backticks aren't typically valid inside
    BQ string literals — but the regex doesn't know that. We document
    that the gate applies to ALL backtick triples to be safe; users who
    really need a literal can use single-quoted strings without
    backticks."""
    # No registration; the test confirms an unregistered path inside
    # what looks like a string is still gated. This is the conservative
    # boundary — false-positive on string literal beats false-negative
    # on a real RBAC bypass.
    c = seeded_app["client"]
    token = seeded_app["admin_token"]
    r = c.post(
        "/api/query",
        json={
            "sql": (
                "SELECT 'matches `test-data-prj.x.y`' AS lit"
            ),
        },
        headers=_auth(token),
    )
    # Either gated (403) or 200 if the analytics DB happens to evaluate
    # the literal — both are acceptable. The point is no silent RBAC
    # bypass: if the response is 200, no BQ table was reached.
    if r.status_code == 403:
        detail = r.json().get("detail", {})
        if isinstance(detail, dict):
            assert detail.get("reason") in (
                "bq_path_not_registered",
                "bq_path_cross_project",
            ), detail
--- a/tests/test_query_remote_rewrite.py
+++ b/tests/test_query_remote_rewrite.py
@ -232,6 +232,44 @@ def test_join_bq_to_local_skips_rewrite(seeded_registry, monkeypatch):
    assert rewritten == user_sql  # untouched
 def test_local_name_inside_backtick_path_does_not_trip_cross_source(
    seeded_registry, monkeypatch,
 ):
    """Devin Review on PR #208 (issue #201 follow-up): a registered
    LOCAL-mode table name appearing as a segment of a user-supplied full
    backtick BQ path must NOT trip the cross-source guard. Pre-fix the
    bare-name regex at the cross-source check ran against unmasked
    sql_lower, so ``\\`test-prj.dataset.orders\\``` would match registered
    local ``orders`` inside the backticks and force the wrapper to bail
    to the ATTACH-catalog slow path (50-100× slower). Post-fix the
    regex runs against the backtick-masked copy, the cross-source check
    correctly sees only BQ refs, and the wrap proceeds.
    """
    from app.api.query import _rewrite_user_sql_for_bigquery_query
    _register_bq_remote(seeded_registry, table_id="bq.fin.ue", name="ue",
                        bucket="fin", source_table="ue")
    _register_local(seeded_registry, table_id="kbc.in.orders", name="orders")
    _set_bq_project(monkeypatch, "test-prj")
    user_sql = (
        "SELECT u.id "
        "FROM ue u "
        "JOIN `test-prj.dataset.orders` o ON u.x = o.x "
        "WHERE o.value > 0"
    )
    rewritten, did_rewrite = _rewrite_user_sql_for_bigquery_query(
        user_sql, seeded_registry,
    )
    # Must wrap — both refs are BQ; the local `orders` registration is
    # irrelevant to a query that touches only BQ paths.
    assert did_rewrite is True
    assert "bigquery_query(" in rewritten
    # The user's backtick path is preserved verbatim inside the wrapped
    # inner SQL (Layer 1 split-on-backticks behaviour), so the original
    # `test-prj.dataset.orders` reference survives.
    assert "test-prj.dataset.orders" in rewritten
 def test_no_bq_tables_passes_through(seeded_registry, monkeypatch):
    """User SQL referencing only local-source tables → no rewrite,
    no log spam, original SQL returned."""