fix(query): rewriter respects backtick segments (#201)

`agnes query --remote` corrupted user SQL when the request contained a full BigQuery backtick path (`<project>.<dataset>.<table>`) whose table segment matched a registered bare-name alias. The bare-name rewriter used `\b` word-boundary matching against the lower-cased SQL; both `.` and `` ` `` are non-word characters, so the regex fired INSIDE the user's backtick path and produced malformed nested-backtick SQL that BigQuery rejected at parse time. Fix: - Add `_mask_backticks(sql)` helper: replace each `…` segment with spaces of equal length, preserving offsets so word-boundary searches find positions only outside backticks. - `_bq_guardrail_inputs` (bare-name pass + forbidden-table pass) searches against the masked SQL. - `_rewrite_bq_table_refs_to_native` Pass 1 splits the SQL on `(\`[^\`]*\`)` and rewrites only the outside-backtick chunks. Pass 2 (`bq."ds"."tbl"` → backtick form) is unchanged — its prefix can't appear inside backticks. Adds three regressions covering the rewrite + guardrail paths.
2026-05-06 17:20:33 +02:00 · 2026-05-06 17:20:33 +02:00 · 720a2180c0
commit 720a2180c0
parent 1b49de1568
2 changed files with 158 additions and 4 deletions
--- a/app/api/query.py
+++ b/app/api/query.py
@ -90,6 +90,29 @@ BQ_PATH = re.compile(
 )


+# Issue #201 — full backtick BQ path `<project>.<dataset>.<table>` in user
+# SQL. Used by the registry-gating pass and (via `_mask_backticks`) to keep
+# bare-name regexes from firing inside backtick-quoted segments.
+_BACKTICK_SEGMENT = re.compile(r'`[^`]*`')
+_BACKTICK_FULL_PATH = re.compile(r'`([^.`]+)\.([^.`]+)\.([^.`]+)`')
+
+
+def _mask_backticks(sql: str) -> str:
+    """Replace each `…`-quoted segment with spaces of equal length so
+    word-boundary regexes find positions outside backticks but ignore
+    everything inside. Preserves all character offsets so ``re.search``
+    on the masked string returns matches at the same positions as on the
+    original.
+
+    Issue #201: `\\b` matches inside backtick segments because both `.`
+    and `` ` `` are non-word characters. A registered bare-name like
+    ``unit_economics`` would otherwise match inside a user-supplied full
+    backtick path ``\\`<project>.<dataset>.unit_economics\\``` and get
+    falsely rewritten — corrupting the user's intended SQL.
+    """
+    return _BACKTICK_SEGMENT.sub(lambda m: ' ' * len(m.group(0)), sql)
+
+
 def _default_remote_query_cap_bytes() -> int:
    """5 GiB default cap on /api/query BQ-touching scans. Configurable via
    `data_source.bigquery.bq_max_scan_bytes` in /admin/server-config —
@ -197,11 +220,18 @@ def execute_query(
                if r.get("name") and r.get("id") in allowed_ids
            }

-            # Check if query references any forbidden tables (word-boundary match)
+            # Check if query references any forbidden tables (word-boundary
+            # match). Issue #201: mask backtick segments so `\b` doesn't
+            # falsely fire inside a user-supplied full backtick path like
+            # `<project>.<dataset>.<table>` whose final segment happens to
+            # collide with a forbidden master view name. The full-path
+            # registry-gate downstream is the proper authorization check
+            # for those.
+            sql_lower_masked = _mask_backticks(sql_lower)
            forbidden = all_views - allowed_view_names
            for table in forbidden:
                pattern = r'\b' + re.escape(table.lower()) + r'\b'
-                if re.search(pattern, sql_lower):
+                if re.search(pattern, sql_lower_masked):
                    raise HTTPException(status_code=403, detail=f"Access denied to table '{table}'")

        # ---- #160 BQ remote-row guardrail + RBAC patch -------------------
@ -467,6 +497,11 @@ def _bq_guardrail_inputs(
    name_lookups: list = []
    seen_paths: set = set()
    accessible_set = set(allowed) if allowed is not None else None
+    # Issue #201: mask backtick segments so a registered bare name like
+    # `unit_economics` doesn't false-positive on a user-supplied full
+    # backtick path `<project>.<dataset>.unit_economics`. The full-path
+    # pass below registry-gates those properly.
+    sql_lower_masked = _mask_backticks(sql_lower)
    for r in repo.list_by_source("bigquery"):
        if (r.get("query_mode") or "") != "remote":
            continue
@ -481,7 +516,7 @@ def _bq_guardrail_inputs(
            # before we get here. Defensive skip.
            continue
        pattern = r'\b' + re.escape(str(name).lower()) + r'\b'
-        if re.search(pattern, sql_lower):
+        if re.search(pattern, sql_lower_masked):
            key = (bucket.lower(), source_table.lower())
            if key not in seen_paths:
                seen_paths.add(key)
@ -579,6 +614,15 @@ def _rewrite_bq_table_refs_to_native(
    # name up in a case-insensitive dict. Single-pass means freshly
    # inserted backticked text isn't re-scanned, fixing the
    # project-ID-contains-name corruption (Devin Review on query.py:464).
+    #
+    # Issue #201: split the SQL on `…` segments and rewrite ONLY in the
+    # outside-backtick chunks. Without this, a user-supplied full backtick
+    # path like ``\\`<project>.<dataset>.unit_economics\\``` whose final
+    # segment matches a registered bare name would have the bare-name
+    # regex fire INSIDE the backticks (since `\\b` treats both `.` and
+    # `` ` `` as non-word boundaries), producing malformed nested
+    # backticks. Splitting confines the rewrite to user identifier
+    # positions where bare-name resolution is the intended behaviour.
    if name_lookups:
        # Map name (lower-cased) → backticked target. Names are
        # case-insensitive on the input side per the existing helper
@ -598,7 +642,15 @@ def _rewrite_bq_table_refs_to_native(
        def _name_repl(m: re.Match) -> str:
            return name_to_target[m.group(1).lower()]

-        out = re.sub(pattern, _name_repl, out, flags=re.IGNORECASE)
+        # `re.split` with a captured group returns: [outside, backtick,
+        # outside, backtick, …]. Even indices are outside-backtick chunks
+        # eligible for bare-name rewrite; odd indices are full backtick
+        # segments preserved verbatim.
+        parts = re.split(r'(`[^`]*`)', out)
+        for i, part in enumerate(parts):
+            if i % 2 == 0:
+                parts[i] = re.sub(pattern, _name_repl, part, flags=re.IGNORECASE)
+        out = "".join(parts)

    # Pass 2: bq."ds"."tbl" / bq.ds.tbl → `<project>.<ds>.<tbl>`.
    def _bq_path_repl(m: re.Match) -> str:
--- a/tests/test_api_query_guardrail.py
+++ b/tests/test_api_query_guardrail.py
@ -432,3 +432,105 @@ def test_rewrite_helper_is_case_insensitive_on_bare_names():
    )
    assert "`p.fin.ue` WHERE `p.fin.ue`.id" in rewritten or \
           rewritten.lower().count("`p.fin.ue`") == 2
+
+
+# ---------------------------------------------------------------------------
+# Issue #201: rewriter must NOT touch text inside `…` backtick segments.
+# A user-supplied full BQ-native path `<project>.<dataset>.<table>` whose
+# table segment matches a registered bare name was being re-substituted
+# inside the backticks, producing malformed nested-backtick SQL that BQ
+# rejected with a parse error.
+# ---------------------------------------------------------------------------
+
+
+def test_rewrite_skips_inside_backtick_path():
+    """Full backtick BQ path is preserved byte-for-byte even when its
+    final segment matches a registered bare-name alias."""
+    from app.api.query import _rewrite_user_sql_for_bq_dry_run
+
+    sql = (
+        "SELECT * FROM `my-prj.finance.unit_economics` "
+        "WHERE country = 'CZ'"
+    )
+    rewritten = _rewrite_user_sql_for_bq_dry_run(
+        sql=sql,
+        name_lookups=[("unit_economics", "finance", "unit_economics")],
+        project="my-prj",
+    )
+    # No corruption — input is already BQ-native, rewriter is a no-op here.
+    assert rewritten == sql, (
+        f"backtick path was rewritten:\n  in : {sql!r}\n  out: {rewritten!r}"
+    )
+    # Sanity: the malformed nested form must NOT appear.
+    assert "`my-prj.finance.`my-prj" not in rewritten
+
+
+def test_rewrite_skips_inside_backtick_with_outside_bare_name():
+    """Mixed SQL: a bare name outside backticks is rewritten as before,
+    but an identically-named segment inside a backtick path is left
+    alone."""
+    from app.api.query import _rewrite_user_sql_for_bq_dry_run
+
+    sql = (
+        "SELECT a.id, b.col FROM ue a "
+        "JOIN `my-prj.finance.ue` b ON a.id = b.id"
+    )
+    rewritten = _rewrite_user_sql_for_bq_dry_run(
+        sql=sql,
+        name_lookups=[("ue", "fin_alias", "ue_alias")],
+        project="my-prj",
+    )
+    # Outside-backtick `ue` rewrites to the registered alias path.
+    assert "`my-prj.fin_alias.ue_alias`" in rewritten
+    # The user-supplied backtick path is preserved verbatim.
+    assert "`my-prj.finance.ue`" in rewritten
+    # The malformed nested form must NOT appear.
+    assert "`my-prj.finance.`my-prj.fin_alias.ue_alias`" not in rewritten
+
+
+def test_guardrail_skips_bare_name_match_inside_backticks(
+    seeded_app, mock_dry_run, monkeypatch,
+):
+    """The `name_lookups` collection populated by `_bq_guardrail_inputs`
+    must not include a registered name when the only place that name
+    appears in the SQL is inside a `…` backtick segment.
+
+    Captures the rewritten SQL the guardrail forwards to the dry-run and
+    asserts the bare-name was NOT substituted inside the user's backtick
+    path.
+    """
+    _register_bq_remote_row("unit_economics", "finance", "unit_economics")
+
+    captured = {"sql": None}
+
+    def capturing_fake(_bq, sql):
+        captured["sql"] = sql
+        return 1024
+
+    monkeypatch.setattr(
+        "app.api.query._bq_dry_run_bytes", capturing_fake, raising=False,
+    )
+
+    c = seeded_app["client"]
+    token = seeded_app["admin_token"]
+    user_sql = (
+        "SELECT * FROM `test-data-prj.finance.unit_economics` "
+        "WHERE country = 'CZ'"
+    )
+    c.post("/api/query", json={"sql": user_sql}, headers=_auth(token))
+
+    sent = captured["sql"]
+    if sent is None:
+        # Guardrail decided no BQ tables were referenced — that's also
+        # an acceptable "no false-positive" outcome (Layer 3 will cover
+        # the explicit registry check for full backtick paths). We just
+        # need to ensure the bare-name regex didn't fire.
+        return
+    # The user's exact backtick path must survive verbatim — no nested
+    # backticks introduced by a stray bare-name rewrite.
+    assert "`test-data-prj.finance.unit_economics`" in sent, (
+        f"backtick path corrupted by guardrail:\n  out: {sent!r}"
+    )
+    assert "`test-data-prj.finance.`test-data-prj" not in sent, (
+        f"nested-backtick corruption signature present: {sent!r}"
+    )