Merge pull request #208 from keboola/zs/issue-201-rewriter-backtick

fix(query): rewriter respects backtick paths; tighten cap-guard fallback (#201)
2026-05-06 23:09:43 +02:00 · 2026-05-06 23:09:43 +02:00 · e3494607bf
commit e3494607bf
parent 1b49de1568 bc55af6e88
5 changed files with 586 additions and 87 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -10,6 +10,40 @@ CalVer image tags (`stable-YYYY.MM.N`, `dev-YYYY.MM.N`) are produced for every C

 ## [Unreleased]

+## [0.42.0] — 2026-05-06
+
+### Fixed
+- `agnes query --remote`: full backtick BigQuery paths in user SQL are no
+  longer corrupted by the registered-name rewriter. Previously a query
+  like ``SELECT … FROM `<project>.<dataset>.<table>` WHERE …`` whose
+  table name happened to be registered as a bare-name alias would have
+  the alias re-substituted *inside* the backtick path, producing
+  malformed SQL that BigQuery rejected with a parse error. The cap-guard
+  then fell back to a filter-less `SELECT *` size estimate (often orders
+  of magnitude larger than the real scan), blocking the query as
+  `remote_scan_too_large`. Issue #201.
+
+### Changed
+- `agnes query --remote`: cap-guard fallback no longer estimates from
+  a synthetic `SELECT *` when the rewritten SQL fails dry-run. It first
+  retries the user's original SQL (handles BQ-native input cleanly), and
+  only when *that* also fails returns a structured `remote_estimate_failed`
+  HTTP 400 with a hint instead of silently over-estimating.
+- **BREAKING (clients matching error kinds)**: failure to estimate
+  remote-query scan size now returns `kind="remote_estimate_failed"`
+  instead of being masked as `remote_scan_too_large` caused by
+  over-estimation. Operators that grep for the old kind in dashboards
+  should update.
+
+### Security
+- `agnes query --remote`: full backtick BigQuery paths are now
+  registry-gated identically to `bq."<dataset>"."<table>"` syntax.
+  Previously, full backtick paths bypassed Agnes RBAC entirely — only
+  the configured service account scope limited what users could query.
+  New `bq_path_cross_project` (when the project ≠ configured data
+  project) and `bq_path_not_registered` (when path is unknown) error
+  kinds. Issue #201.
+
 ## [0.41.0] — 2026-05-06

 ### Fixed
--- a/app/api/query.py
+++ b/app/api/query.py
@ -90,6 +90,29 @@ BQ_PATH = re.compile(
 )


+# Issue #201 — full backtick BQ path `<project>.<dataset>.<table>` in user
+# SQL. Used by the registry-gating pass and (via `_mask_backticks`) to keep
+# bare-name regexes from firing inside backtick-quoted segments.
+_BACKTICK_SEGMENT = re.compile(r'`[^`]*`')
+_BACKTICK_FULL_PATH = re.compile(r'`([^.`]+)\.([^.`]+)\.([^.`]+)`')
+
+
+def _mask_backticks(sql: str) -> str:
+    """Replace each `…`-quoted segment with spaces of equal length so
+    word-boundary regexes find positions outside backticks but ignore
+    everything inside. Preserves all character offsets so ``re.search``
+    on the masked string returns matches at the same positions as on the
+    original.
+
+    Issue #201: `\\b` matches inside backtick segments because both `.`
+    and `` ` `` are non-word characters. A registered bare-name like
+    ``unit_economics`` would otherwise match inside a user-supplied full
+    backtick path ``\\`<project>.<dataset>.unit_economics\\``` and get
+    falsely rewritten — corrupting the user's intended SQL.
+    """
+    return _BACKTICK_SEGMENT.sub(lambda m: ' ' * len(m.group(0)), sql)
+
+
 def _default_remote_query_cap_bytes() -> int:
    """5 GiB default cap on /api/query BQ-touching scans. Configurable via
    `data_source.bigquery.bq_max_scan_bytes` in /admin/server-config —
@ -197,11 +220,18 @@ def execute_query(
                if r.get("name") and r.get("id") in allowed_ids
            }

-            # Check if query references any forbidden tables (word-boundary match)
+            # Check if query references any forbidden tables (word-boundary
+            # match). Issue #201: mask backtick segments so `\b` doesn't
+            # falsely fire inside a user-supplied full backtick path like
+            # `<project>.<dataset>.<table>` whose final segment happens to
+            # collide with a forbidden master view name. The full-path
+            # registry-gate downstream is the proper authorization check
+            # for those.
+            sql_lower_masked = _mask_backticks(sql_lower)
            forbidden = all_views - allowed_view_names
            for table in forbidden:
                pattern = r'\b' + re.escape(table.lower()) + r'\b'
-                if re.search(pattern, sql_lower):
+                if re.search(pattern, sql_lower_masked):
                    raise HTTPException(status_code=403, detail=f"Access denied to table '{table}'")

        # ---- #160 BQ remote-row guardrail + RBAC patch -------------------
@ -467,6 +497,11 @@ def _bq_guardrail_inputs(
    name_lookups: list = []
    seen_paths: set = set()
    accessible_set = set(allowed) if allowed is not None else None
+    # Issue #201: mask backtick segments so a registered bare name like
+    # `unit_economics` doesn't false-positive on a user-supplied full
+    # backtick path `<project>.<dataset>.unit_economics`. The full-path
+    # pass below registry-gates those properly.
+    sql_lower_masked = _mask_backticks(sql_lower)
    for r in repo.list_by_source("bigquery"):
        if (r.get("query_mode") or "") != "remote":
            continue
@ -481,7 +516,7 @@ def _bq_guardrail_inputs(
            # before we get here. Defensive skip.
            continue
        pattern = r'\b' + re.escape(str(name).lower()) + r'\b'
-        if re.search(pattern, sql_lower):
+        if re.search(pattern, sql_lower_masked):
            key = (bucket.lower(), source_table.lower())
            if key not in seen_paths:
                seen_paths.add(key)
@ -529,6 +564,66 @@ def _bq_guardrail_inputs(
                seen_paths.add(key)
                dry_run.append((bucket, source_table, 0))

+    # 3. Full backtick path `<project>.<dataset>.<table>` pass (issue #201).
+    # Pre-#201 these bypassed Agnes RBAC entirely — only the configured
+    # service account scope limited which tables a user could reach. Gate
+    # them identically to the `bq.<ds>.<tbl>` pass: must match the
+    # configured data project, must point at a registered row, and the
+    # caller must hold a grant on that row's id (admin bypasses the grant
+    # check but still requires registration + project match).
+    #
+    # Lazy `get_bq_access()` import via the module-level alias so tests
+    # can monkeypatch a fake. When BQ isn't configured (no data project),
+    # fall through silently — full backtick paths can't possibly resolve
+    # against this instance, so leave them to BQ to reject if a query
+    # somehow makes it through.
+    try:
+        bq = get_bq_access()
+        data_project = (bq.projects.data or "").strip()
+    except Exception:
+        data_project = ""
+
+    if data_project:
+        for m in _BACKTICK_FULL_PATH.finditer(sql):
+            proj, ds, tbl = m.group(1), m.group(2), m.group(3)
+            if proj.lower() != data_project.lower():
+                return [], [], {
+                    "reason": "bq_path_cross_project",
+                    "path": f"`{proj}.{ds}.{tbl}`",
+                    "expected_project": data_project,
+                    "hint": (
+                        "--remote queries can only reference tables in the "
+                        "configured BigQuery data project. Register "
+                        "cross-project tables via `agnes admin "
+                        "register-table` if needed."
+                    ),
+                }
+            row = repo.find_by_bq_path(ds, tbl)
+            if row is None:
+                return [], [], {
+                    "reason": "bq_path_not_registered",
+                    "path": f"`{proj}.{ds}.{tbl}`",
+                    "hint": (
+                        "Direct BigQuery paths must point to a registered "
+                        "table. Register via `agnes admin register-table` "
+                        "or use the registered name from `agnes catalog`."
+                    ),
+                }
+            if not is_admin:
+                if accessible_set is None or row["id"] not in accessible_set:
+                    return [], [], {
+                        "reason": "bq_path_access_denied",
+                        "path": f"`{proj}.{ds}.{tbl}`",
+                        "registered_as": row["name"],
+                    }
+            bucket = row["bucket"]
+            source_table = row["source_table"]
+            if bucket and source_table:
+                key = (bucket.lower(), source_table.lower())
+                if key not in seen_paths:
+                    seen_paths.add(key)
+                    dry_run.append((bucket, source_table, 0))
+
    return dry_run, name_lookups, None


@ -579,6 +674,15 @@ def _rewrite_bq_table_refs_to_native(
    # name up in a case-insensitive dict. Single-pass means freshly
    # inserted backticked text isn't re-scanned, fixing the
    # project-ID-contains-name corruption (Devin Review on query.py:464).
+    #
+    # Issue #201: split the SQL on `…` segments and rewrite ONLY in the
+    # outside-backtick chunks. Without this, a user-supplied full backtick
+    # path like ``\\`<project>.<dataset>.unit_economics\\``` whose final
+    # segment matches a registered bare name would have the bare-name
+    # regex fire INSIDE the backticks (since `\\b` treats both `.` and
+    # `` ` `` as non-word boundaries), producing malformed nested
+    # backticks. Splitting confines the rewrite to user identifier
+    # positions where bare-name resolution is the intended behaviour.
    if name_lookups:
        # Map name (lower-cased) → backticked target. Names are
        # case-insensitive on the input side per the existing helper
@ -598,7 +702,15 @@ def _rewrite_bq_table_refs_to_native(
        def _name_repl(m: re.Match) -> str:
            return name_to_target[m.group(1).lower()]

-        out = re.sub(pattern, _name_repl, out, flags=re.IGNORECASE)
+        # `re.split` with a captured group returns: [outside, backtick,
+        # outside, backtick, …]. Even indices are outside-backtick chunks
+        # eligible for bare-name rewrite; odd indices are full backtick
+        # segments preserved verbatim.
+        parts = re.split(r'(`[^`]*`)', out)
+        for i, part in enumerate(parts):
+            if i % 2 == 0:
+                parts[i] = re.sub(pattern, _name_repl, part, flags=re.IGNORECASE)
+        out = "".join(parts)

    # Pass 2: bq."ds"."tbl" / bq.ds.tbl → `<project>.<ds>.<tbl>`.
    def _bq_path_repl(m: re.Match) -> str:
@ -675,8 +787,17 @@ def _rewrite_user_sql_for_bigquery_query(
        return user_sql, False

    # Find all referenced BQ remote-mode rows (bare-name + direct bq.path).
-    # Mirrors the non-RBAC parts of `_bq_guardrail_inputs`.
+    # Mirrors the non-RBAC parts of `_bq_guardrail_inputs`. Issue #201:
+    # bare-name regex must run against a backtick-masked copy so a
+    # registered name like ``orders`` doesn't false-positive when it
+    # appears as the table segment of a user-supplied full backtick path
+    # like ``\\`<project>.<dataset>.orders\\```. Without masking, the
+    # cross-source check below would falsely conclude the SQL touches
+    # both BQ-remote and local sources, dropping every backtick-path
+    # query into the 50-100× slower ATTACH-catalog fallback. Devin
+    # Review on PR #208.
    sql_lower = user_sql.lower()
+    sql_lower_masked = _mask_backticks(sql_lower)
    name_lookups: list = []
    seen_paths: set = set()

@ -715,7 +836,7 @@ def _rewrite_user_sql_for_bigquery_query(
            # mix rewritten and non-rewritten BQ paths in one query.
            return user_sql, False
        pattern = r'\b' + re.escape(str(name).lower()) + r'\b'
-        if re.search(pattern, sql_lower):
+        if re.search(pattern, sql_lower_masked):
            key = (bucket.lower(), source_table.lower())
            if key not in seen_paths:
                seen_paths.add(key)
@ -752,7 +873,7 @@ def _rewrite_user_sql_for_bigquery_query(
            # Same name registered both BQ-remote and local? Pathological;
            # skip as a safety measure.
            return user_sql, False
-        if re.search(r'\b' + re.escape(name_lc) + r'\b', sql_lower):
+        if re.search(r'\b' + re.escape(name_lc) + r'\b', sql_lower_masked):
            logger.info(
                "rewrite_skip_cross_source: user SQL references both "
                "BQ-remote and local-mode tables; falling back to "
@ -839,28 +960,32 @@ def _bq_quota_and_cap_guard(
    partitioned/clustered tables and rejecting narrow queries that BQ
    itself would dry-run as a few MB.

-    Fallback: if BQ rejects the rewritten SQL with a parse-level
-    ``client_error`` (e.g. DuckDB-only syntax like ``::INT`` casts that
-    don't translate to BQ), fall back to the pre-#171 per-table
-    SELECT * approach so the cap-guard still functions — over-estimate
-    is preferred over fail-open. Forbidden / upstream errors still
-    propagate as HTTP 502.
+    Issue #201 fix: when BQ rejects the rewritten SQL with a parse-level
+    ``bq_bad_request`` (e.g. DuckDB-only syntax like ``::INT`` casts, or
+    a rewriter bug that broke valid BQ-native input), retry with the
+    user's ORIGINAL SQL — BQ-native input dry-runs cleanly. If the
+    original ALSO fails, return a structured `remote_estimate_failed`
+    HTTP 400 instead of the pre-#201 synthetic ``SELECT *`` per-table
+    over-estimate. The synthetic fallback threw away user filters and
+    routinely ballooned to "full table size", blocking legitimate narrow
+    queries via `remote_scan_too_large`. Forbidden / upstream errors
+    still propagate as HTTP 502.

    Flow:
    1. `check_daily_budget` — over-cap users get 429 BEFORE any BQ work.
    2. `quota.acquire(user_id)` opened — concurrent-slot held throughout.
    3. Single dry-run of rewritten user SQL → `total_bytes`.
-       On parse error, fall back to per-table SELECT * → sum.
+       On parse error, retry with the user's original SQL.
+       On second parse error, raise 400 `remote_estimate_failed`.
    4. If total > cap → 400 `remote_scan_too_large`.
    5. Yield. Caller runs `analytics.execute(...)` + `record_bytes(...)`.
    6. On exit, slot released.

    Mutates `dry_run_set` in place: the third tuple element (bytes) is
    populated so the caller can sum and record bytes against the user's
-    quota post-flight. Single-dry-run path puts `total_bytes` on the first
-    entry and zero on the rest (BQ doesn't expose per-table bytes for a
-    composite query); the caller's `sum(b for _, _, b in dry_run_set)`
-    still equals `total_bytes`.
+    quota post-flight. Pin `total_bytes` on entry 0 and zero on the rest
+    — BQ doesn't expose per-table bytes for a composite query — so
+    `sum(b for _, _, b in dry_run_set)` still equals `total_bytes`.
    """
    quota = _build_quota_tracker()
    try:
@ -901,61 +1026,71 @@ def _bq_quota_and_cap_guard(
                sql, name_lookups, project,
            )

-            # Try the single-dry-run path first (issue #171). Falls back
-            # to the per-table SELECT * approach only on BQ parse errors
-            # (kind="bq_bad_request" — DuckDB-only syntax that BQ can't
-            # translate). All other BQ errors propagate as 502 below.
+            # Try the single-dry-run path first (issue #171). On BQ parse
+            # errors (`bq_bad_request` — typically DuckDB-only syntax the
+            # rewriter couldn't translate, OR — pre-#201 fix — a
+            # rewriter-corrupted backtick path) retry the user's ORIGINAL
+            # SQL: when the user submitted BQ-native SQL, the rewriter is
+            # the only thing standing between them and a clean dry-run.
+            # If the original ALSO fails, this is true DuckDB-only syntax
+            # that BQ cannot estimate — fail fast with a structured
+            # `remote_estimate_failed` instead of the pre-#201 synthetic
+            # `SELECT *` over-estimate (which threw away user filters and
+            # often ballooned to "full table size", blocking legitimate
+            # narrow queries via `remote_scan_too_large`).
+            #
+            # All other BQ errors (forbidden, upstream) propagate as 502.
            total_bytes = 0
-            used_fallback = False
            try:
                total_bytes = _bq_dry_run_bytes(bq, rewritten_sql)
            except BqAccessError as exc:
-                if exc.kind == "bq_bad_request":
-                    logger.warning(
-                        "BQ dry-run rejected the rewritten SQL "
-                        "(kind=%s, message=%s). Falling back to per-table "
-                        "SELECT * estimate; the cap check will over-estimate "
-                        "scan bytes for this query. Consider rewriting to "
-                        "BQ-native syntax for a tight pre-check.",
-                        exc.kind, exc.message,
-                    )
-                    used_fallback = True
-                else:
+                if exc.kind != "bq_bad_request":
                    raise HTTPException(status_code=502, detail={
                        "kind": exc.kind,
                        "message": exc.message,
                        **(exc.details or {}),
                    })
-
-            if used_fallback:
-                # Pre-#171 path: estimate per registered table from a
-                # synthetic SELECT *. Over-estimates partitioned scans but
-                # never under-estimates, so the cap still bounds risk.
-                for i, (bucket, source_table, _) in enumerate(dry_run_set):
-                    fallback_sql = (
-                        f"SELECT * FROM `{project}.{bucket}.{source_table}`"
-                    )
-                    try:
-                        est = _bq_dry_run_bytes(bq, fallback_sql)
-                    except BqAccessError as exc:
+                logger.warning(
+                    "BQ dry-run rejected the rewritten SQL "
+                    "(kind=%s, message=%s). Retrying with the user's "
+                    "original SQL.",
+                    exc.kind, exc.message,
+                )
+                try:
+                    total_bytes = _bq_dry_run_bytes(bq, sql)
+                except BqAccessError as exc2:
+                    if exc2.kind != "bq_bad_request":
                        raise HTTPException(status_code=502, detail={
-                            "kind": exc.kind,
-                            "message": exc.message,
-                            **(exc.details or {}),
+                            "kind": exc2.kind,
+                            "message": exc2.message,
+                            **(exc2.details or {}),
                        })
-                    dry_run_set[i] = (bucket, source_table, est)
-                    total_bytes += est
-            else:
-                # Single-dry-run path. Distribute the total to dry_run_set
-                # so the caller's `record_bytes(sum(...))` stays correct.
-                # Per-table breakdown is unavailable from a composite
-                # dry-run; pin total to entry 0, zero the rest.
-                if dry_run_set:
-                    b0, t0, _ = dry_run_set[0]
-                    dry_run_set[0] = (b0, t0, total_bytes)
-                    for i in range(1, len(dry_run_set)):
-                        bi, ti, _ = dry_run_set[i]
-                        dry_run_set[i] = (bi, ti, 0)
+                    raise HTTPException(status_code=400, detail={
+                        "kind": "remote_estimate_failed",
+                        "message": (
+                            "Could not estimate scan size for this query."
+                        ),
+                        "hint": (
+                            "Use a registered table name from `agnes "
+                            "catalog`, or write BQ-native SQL with full "
+                            "backtick paths. Pure DuckDB-only syntax is "
+                            "not supported for --remote queries."
+                        ),
+                        "underlying": exc2.message,
+                    })
+
+            # Distribute the total to dry_run_set so the caller's
+            # `record_bytes(sum(...))` stays correct. Per-table breakdown
+            # is unavailable from a composite dry-run; pin total to entry
+            # 0, zero the rest. (Same accounting symmetry whether the
+            # bytes came from the rewritten SQL or the original-SQL
+            # retry.)
+            if dry_run_set:
+                b0, t0, _ = dry_run_set[0]
+                dry_run_set[0] = (b0, t0, total_bytes)
+                for i in range(1, len(dry_run_set)):
+                    bi, ti, _ = dry_run_set[i]
+                    dry_run_set[i] = (bi, ti, 0)

            if cap_bytes > 0 and total_bytes > cap_bytes:
                tables = [f"{b}.{t}" for b, t, _ in dry_run_set]
--- a/pyproject.toml
+++ b/pyproject.toml
@ -1,6 +1,6 @@
 [project]
 name = "agnes-the-ai-analyst"
-version = "0.41.0"
+version = "0.42.0"
 description = "Agnes — AI Data Analyst platform for AI analytical systems"
 requires-python = ">=3.11,<3.14"
 license = "MIT"
--- a/tests/test_api_query_guardrail.py
+++ b/tests/test_api_query_guardrail.py
@ -242,15 +242,17 @@ def test_guardrail_invokes_dry_run_exactly_once_per_request(
    assert "`test-data-prj.marketing.traffic`" in state["last_sql"]


-def test_guardrail_falls_back_to_per_table_estimate_on_bq_parse_error(
+def test_fallback_tries_original_sql_first(
    seeded_app, mock_dry_run, monkeypatch,
 ):
-    """When BQ rejects the rewritten SQL with ``bq_bad_request`` (DuckDB-only
-    syntax that doesn't translate — e.g. ``::INT`` casts, ``STRPOS``, …),
-    the cap-guard falls back to the pre-#171 per-table SELECT * approach
-    so a non-portable query still gets a (loose) cap estimate instead of
-    fail-opening.
-    """
+    """Issue #201 — when the rewriter produces SQL that BQ rejects with
+    `bq_bad_request` but the user's ORIGINAL SQL dry-runs cleanly, the
+    cap-guard uses the original SQL's byte estimate. No more synthetic
+    `SELECT *` over-estimate.
+
+    Bare-name reference populates `dry_run_set` so the cap-guard
+    actually fires. Mock returns parse-error on the first call
+    (rewritten SQL) and small bytes on the second (original)."""
    from connectors.bigquery.access import BqAccessError

    _register_bq_remote_row("ue", "finance", "ue")
@ -259,10 +261,10 @@ def test_guardrail_falls_back_to_per_table_estimate_on_bq_parse_error(

    def fake_dry_run(_bq, sql):
        state["calls"].append(sql)
-        # First call (rewritten user SQL) → BQ parse error.
+        # First call (rewritten SQL) → BQ parse error.
        if len(state["calls"]) == 1:
-            raise BqAccessError("bq_bad_request", "Syntax error: unexpected '::'")
-        # Second call (fallback per-table SELECT *) → small bytes, pass cap.
+            raise BqAccessError("bq_bad_request", "Syntax error: simulated")
+        # Second call (the user's original SQL) → small, passes cap.
        return 4096

    monkeypatch.setattr(
@ -271,28 +273,78 @@ def test_guardrail_falls_back_to_per_table_estimate_on_bq_parse_error(

    c = seeded_app["client"]
    token = seeded_app["admin_token"]
-    # SQL with DuckDB-only `::INT` cast that BQ would reject.
+    user_sql = "SELECT order_id FROM ue WHERE country = 'CZ'"
+    r = c.post("/api/query", json={"sql": user_sql}, headers=_auth(token))
+
+    # Two dry-runs: rewritten then original. No third synthetic-SELECT-*
+    # call.
+    assert len(state["calls"]) == 2, (
+        f"expected rewritten + original-SQL retry, got "
+        f"{len(state['calls'])}: {state['calls']}"
+    )
+    assert state["calls"][1] == user_sql, (
+        f"second call must be the user's ORIGINAL SQL, got "
+        f"{state['calls'][1]!r}"
+    )
+    # The response must NOT be remote_scan_too_large from a synthetic
+    # over-estimate — 4096 bytes is well under the 5 GiB cap.
+    if r.status_code == 400:
+        detail = r.json().get("detail", {})
+        if isinstance(detail, dict):
+            assert detail.get("reason") != "remote_scan_too_large", detail
+
+
+def test_fallback_fails_fast_on_pure_duckdb_syntax(
+    seeded_app, mock_dry_run, monkeypatch,
+):
+    """When BOTH the rewritten and original SQL fail with `bq_bad_request`
+    (true DuckDB-only syntax like `::INT`), return HTTP 400
+    `remote_estimate_failed` — never silently over-estimate via a
+    synthetic `SELECT *`."""
+    from connectors.bigquery.access import BqAccessError
+
+    _register_bq_remote_row("ue", "finance", "ue")
+
+    state = {"calls": []}
+
+    def always_parse_error(_bq, sql):
+        state["calls"].append(sql)
+        raise BqAccessError("bq_bad_request", "Syntax error: unexpected '::'")
+
+    monkeypatch.setattr(
+        "app.api.query._bq_dry_run_bytes", always_parse_error, raising=False,
+    )
+
+    c = seeded_app["client"]
+    token = seeded_app["admin_token"]
    r = c.post(
        "/api/query",
        json={"sql": "SELECT order_id::INT FROM ue WHERE country = 'CZ'"},
        headers=_auth(token),
    )

-    # Two dry-runs (rewritten + fallback per-table) before the (failed)
-    # execute. Status will be a downstream error from analytics.execute()
-    # since `::INT` doesn't work in DuckDB either against a remote view —
-    # but the GUARDRAIL must have completed without 5xx-ing.
+    # Two dry-runs (rewritten + original retry). NO synthetic SELECT * fallback.
    assert len(state["calls"]) == 2, (
-        f"expected 1 rewritten + 1 fallback dry-run, got {len(state['calls'])}: "
-        f"{state['calls']}"
+        f"expected 1 rewritten + 1 original-retry, got "
+        f"{len(state['calls'])}: {state['calls']}"
    )
-    assert "::" in state["calls"][0], "first call should be the rewritten user SQL"
-    assert state["calls"][1].startswith("SELECT * FROM"), (
-        "second call should be the per-table fallback"
-    )
-    # Whatever HTTP status comes back must NOT be 502 from the guard's
-    # transport-error path — fallback must absorb the bq_bad_request.
-    assert r.status_code != 502, r.json()
+    # No call should be a synthetic ``SELECT * FROM `<project>...```. The
+    # original-SQL retry contains the user's SELECT clause.
+    for c_sql in state["calls"]:
+        # If a call is just a synthetic ``SELECT * FROM `<project>.<bucket>.<table>```
+        # the user's `WHERE country = 'CZ'` would be missing.
+        if c_sql.startswith("SELECT * FROM `") and "WHERE" not in c_sql:
+            raise AssertionError(
+                f"synthetic SELECT * fallback was used: {c_sql!r}"
+            )
+
+    assert r.status_code == 400, r.json()
+    detail = r.json().get("detail", {})
+    assert isinstance(detail, dict), detail
+    assert detail.get("kind") == "remote_estimate_failed", detail
+    assert "underlying" in detail, detail
+    assert "agnes catalog" in detail.get("hint", "").lower() or \
+           "backtick" in detail.get("hint", "").lower(), detail


 def test_guardrail_propagates_502_on_non_parse_bq_errors(
@ -432,3 +484,243 @@ def test_rewrite_helper_is_case_insensitive_on_bare_names():
    )
    assert "`p.fin.ue` WHERE `p.fin.ue`.id" in rewritten or \
           rewritten.lower().count("`p.fin.ue`") == 2
+
+
+# ---------------------------------------------------------------------------
+# Issue #201: rewriter must NOT touch text inside `…` backtick segments.
+# A user-supplied full BQ-native path `<project>.<dataset>.<table>` whose
+# table segment matches a registered bare name was being re-substituted
+# inside the backticks, producing malformed nested-backtick SQL that BQ
+# rejected with a parse error.
+# ---------------------------------------------------------------------------
+
+
+def test_rewrite_skips_inside_backtick_path():
+    """Full backtick BQ path is preserved byte-for-byte even when its
+    final segment matches a registered bare-name alias."""
+    from app.api.query import _rewrite_user_sql_for_bq_dry_run
+
+    sql = (
+        "SELECT * FROM `my-prj.finance.unit_economics` "
+        "WHERE country = 'CZ'"
+    )
+    rewritten = _rewrite_user_sql_for_bq_dry_run(
+        sql=sql,
+        name_lookups=[("unit_economics", "finance", "unit_economics")],
+        project="my-prj",
+    )
+    # No corruption — input is already BQ-native, rewriter is a no-op here.
+    assert rewritten == sql, (
+        f"backtick path was rewritten:\n  in : {sql!r}\n  out: {rewritten!r}"
+    )
+    # Sanity: the malformed nested form must NOT appear.
+    assert "`my-prj.finance.`my-prj" not in rewritten
+
+
+def test_rewrite_skips_inside_backtick_with_outside_bare_name():
+    """Mixed SQL: a bare name outside backticks is rewritten as before,
+    but an identically-named segment inside a backtick path is left
+    alone."""
+    from app.api.query import _rewrite_user_sql_for_bq_dry_run
+
+    sql = (
+        "SELECT a.id, b.col FROM ue a "
+        "JOIN `my-prj.finance.ue` b ON a.id = b.id"
+    )
+    rewritten = _rewrite_user_sql_for_bq_dry_run(
+        sql=sql,
+        name_lookups=[("ue", "fin_alias", "ue_alias")],
+        project="my-prj",
+    )
+    # Outside-backtick `ue` rewrites to the registered alias path.
+    assert "`my-prj.fin_alias.ue_alias`" in rewritten
+    # The user-supplied backtick path is preserved verbatim.
+    assert "`my-prj.finance.ue`" in rewritten
+    # The malformed nested form must NOT appear.
+    assert "`my-prj.finance.`my-prj.fin_alias.ue_alias`" not in rewritten
+
+
+def test_guardrail_skips_bare_name_match_inside_backticks(
+    seeded_app, mock_dry_run, monkeypatch,
+):
+    """The `name_lookups` collection populated by `_bq_guardrail_inputs`
+    must not include a registered name when the only place that name
+    appears in the SQL is inside a `…` backtick segment.
+
+    Captures the rewritten SQL the guardrail forwards to the dry-run and
+    asserts the bare-name was NOT substituted inside the user's backtick
+    path.
+    """
+    _register_bq_remote_row("unit_economics", "finance", "unit_economics")
+
+    captured = {"sql": None}
+
+    def capturing_fake(_bq, sql):
+        captured["sql"] = sql
+        return 1024
+
+    monkeypatch.setattr(
+        "app.api.query._bq_dry_run_bytes", capturing_fake, raising=False,
+    )
+
+    c = seeded_app["client"]
+    token = seeded_app["admin_token"]
+    user_sql = (
+        "SELECT * FROM `test-data-prj.finance.unit_economics` "
+        "WHERE country = 'CZ'"
+    )
+    c.post("/api/query", json={"sql": user_sql}, headers=_auth(token))
+
+    sent = captured["sql"]
+    if sent is None:
+        # Guardrail decided no BQ tables were referenced — that's also
+        # an acceptable "no false-positive" outcome (Layer 3 will cover
+        # the explicit registry check for full backtick paths). We just
+        # need to ensure the bare-name regex didn't fire.
+        return
+    # The user's exact backtick path must survive verbatim — no nested
+    # backticks introduced by a stray bare-name rewrite.
+    assert "`test-data-prj.finance.unit_economics`" in sent, (
+        f"backtick path corrupted by guardrail:\n  out: {sent!r}"
+    )
+    assert "`test-data-prj.finance.`test-data-prj" not in sent, (
+        f"nested-backtick corruption signature present: {sent!r}"
+    )
+
+
+# ---------------------------------------------------------------------------
+# Issue #201 Layer 3: full backtick BigQuery paths are registry-gated.
+# Pre-fix these bypassed Agnes RBAC entirely — only the configured service
+# account scope limited which tables a user could reach. Post-fix, they're
+# treated identically to `bq."<dataset>"."<table>"` syntax.
+# ---------------------------------------------------------------------------
+
+
+def test_full_backtick_path_unregistered_denied(seeded_app, mock_dry_run):
+    """Full backtick path to an unregistered `<dataset>.<table>` (project
+    matches the configured data project) → HTTP 403 with
+    `bq_path_not_registered`."""
+    c = seeded_app["client"]
+    token = seeded_app["admin_token"]
+    r = c.post(
+        "/api/query",
+        json={
+            "sql": (
+                "SELECT * FROM `test-data-prj.secret_ds.secret_tbl` "
+                "WHERE country = 'CZ'"
+            ),
+        },
+        headers=_auth(token),
+    )
+    assert r.status_code == 403, r.json()
+    detail = r.json().get("detail", {})
+    assert isinstance(detail, dict), detail
+    assert detail.get("reason") == "bq_path_not_registered", detail
+    assert "secret_ds" in detail.get("path", ""), detail
+    assert "secret_tbl" in detail.get("path", ""), detail
+
+
+def test_full_backtick_path_cross_project_denied(seeded_app, mock_dry_run):
+    """Full backtick path with project ≠ configured data project → HTTP
+    403 with `bq_path_cross_project`. Even if the path happens to point
+    at a registered (bucket, source_table), the project mismatch is the
+    primary boundary."""
+    _register_bq_remote_row("ue", "finance", "ue")
+    c = seeded_app["client"]
+    token = seeded_app["admin_token"]
+    r = c.post(
+        "/api/query",
+        json={
+            "sql": "SELECT * FROM `other-project.finance.ue` WHERE id = 1",
+        },
+        headers=_auth(token),
+    )
+    assert r.status_code == 403, r.json()
+    detail = r.json().get("detail", {})
+    assert isinstance(detail, dict), detail
+    assert detail.get("reason") == "bq_path_cross_project", detail
+    assert detail.get("expected_project") == "test-data-prj", detail
+    assert "other-project" in detail.get("path", ""), detail
+
+
+def test_full_backtick_path_registered_admin_passes(
+    seeded_app, mock_dry_run, monkeypatch,
+):
+    """Admin caller + registered path + matching project → no RBAC
+    rejection. The dry-run fires (we can capture the SQL the guardrail
+    forwards) and no `bq_path_*` reason appears in any error response."""
+    _register_bq_remote_row("ue", "finance", "ue")
+
+    captured = {"sql": None}
+
+    def capturing_fake(_bq, sql):
+        captured["sql"] = sql
+        return 1024  # tiny — pass cap
+
+    monkeypatch.setattr(
+        "app.api.query._bq_dry_run_bytes", capturing_fake, raising=False,
+    )
+
+    c = seeded_app["client"]
+    token = seeded_app["admin_token"]
+    r = c.post(
+        "/api/query",
+        json={
+            "sql": "SELECT * FROM `test-data-prj.finance.ue` WHERE id = 1",
+        },
+        headers=_auth(token),
+    )
+    # If 403, must NOT be the issue-#201 bq_path_* reasons.
+    if r.status_code == 403:
+        detail = r.json().get("detail", {})
+        if isinstance(detail, dict):
+            assert detail.get("reason") not in (
+                "bq_path_not_registered",
+                "bq_path_access_denied",
+                "bq_path_cross_project",
+            ), f"admin + registered path should pass RBAC: {detail}"
+    # The dry-run was invoked, meaning Pass 3 added the path to dry_run_set
+    # and the cap-guard fired. The user's WHERE clause must still be in
+    # the dry-run SQL (validates Layer 1 — backtick-aware rewrite).
+    assert captured["sql"] is not None, (
+        "dry-run never fired — Pass 3 may not have registered the path"
+    )
+    assert "`test-data-prj.finance.ue`" in captured["sql"], captured["sql"]
+    assert "WHERE id = 1" in captured["sql"], captured["sql"]
+
+
+def test_full_backtick_path_inside_string_literal_not_gated(
+    seeded_app, mock_dry_run,
+):
+    """Defensive case: a backtick path appearing inside a SQL string
+    literal (rare but possible) should not trigger Pass 3. Practically
+    this is unreachable because backticks aren't typically valid inside
+    BQ string literals — but the regex doesn't know that. We document
+    that the gate applies to ALL backtick triples to be safe; users who
+    really need a literal can use single-quoted strings without
+    backticks."""
+    # No registration; the test confirms an unregistered path inside
+    # what looks like a string is still gated. This is the conservative
+    # boundary — false-positive on string literal beats false-negative
+    # on a real RBAC bypass.
+    c = seeded_app["client"]
+    token = seeded_app["admin_token"]
+    r = c.post(
+        "/api/query",
+        json={
+            "sql": (
+                "SELECT 'matches `test-data-prj.x.y`' AS lit"
+            ),
+        },
+        headers=_auth(token),
+    )
+    # Either gated (403) or 200 if the analytics DB happens to evaluate
+    # the literal — both are acceptable. The point is no silent RBAC
+    # bypass: if the response is 200, no BQ table was reached.
+    if r.status_code == 403:
+        detail = r.json().get("detail", {})
+        if isinstance(detail, dict):
+            assert detail.get("reason") in (
+                "bq_path_not_registered",
+                "bq_path_cross_project",
+            ), detail
--- a/tests/test_query_remote_rewrite.py
+++ b/tests/test_query_remote_rewrite.py
@ -232,6 +232,44 @@ def test_join_bq_to_local_skips_rewrite(seeded_registry, monkeypatch):
    assert rewritten == user_sql  # untouched


+def test_local_name_inside_backtick_path_does_not_trip_cross_source(
+    seeded_registry, monkeypatch,
+):
+    """Devin Review on PR #208 (issue #201 follow-up): a registered
+    LOCAL-mode table name appearing as a segment of a user-supplied full
+    backtick BQ path must NOT trip the cross-source guard. Pre-fix the
+    bare-name regex at the cross-source check ran against unmasked
+    sql_lower, so ``\\`test-prj.dataset.orders\\``` would match registered
+    local ``orders`` inside the backticks and force the wrapper to bail
+    to the ATTACH-catalog slow path (50-100× slower). Post-fix the
+    regex runs against the backtick-masked copy, the cross-source check
+    correctly sees only BQ refs, and the wrap proceeds.
+    """
+    from app.api.query import _rewrite_user_sql_for_bigquery_query
+    _register_bq_remote(seeded_registry, table_id="bq.fin.ue", name="ue",
+                        bucket="fin", source_table="ue")
+    _register_local(seeded_registry, table_id="kbc.in.orders", name="orders")
+    _set_bq_project(monkeypatch, "test-prj")
+
+    user_sql = (
+        "SELECT u.id "
+        "FROM ue u "
+        "JOIN `test-prj.dataset.orders` o ON u.x = o.x "
+        "WHERE o.value > 0"
+    )
+    rewritten, did_rewrite = _rewrite_user_sql_for_bigquery_query(
+        user_sql, seeded_registry,
+    )
+    # Must wrap — both refs are BQ; the local `orders` registration is
+    # irrelevant to a query that touches only BQ paths.
+    assert did_rewrite is True
+    assert "bigquery_query(" in rewritten
+    # The user's backtick path is preserved verbatim inside the wrapped
+    # inner SQL (Layer 1 split-on-backticks behaviour), so the original
+    # `test-prj.dataset.orders` reference survives.
+    assert "test-prj.dataset.orders" in rewritten
+
+
 def test_no_bq_tables_passes_through(seeded_registry, monkeypatch):
    """User SQL referencing only local-source tables → no rewrite,
    no log spam, original SQL returned."""