fix(query): rewriter respects backtick segments (#201)

`agnes query --remote` corrupted user SQL when the request contained a
full BigQuery backtick path (`<project>.<dataset>.<table>`) whose
table segment matched a registered bare-name alias. The bare-name
rewriter used `\b` word-boundary matching against the lower-cased SQL;
both `.` and `` ` `` are non-word characters, so the regex fired
INSIDE the user's backtick path and produced malformed nested-backtick
SQL that BigQuery rejected at parse time.

Fix:

- Add `_mask_backticks(sql)` helper: replace each `…` segment with
  spaces of equal length, preserving offsets so word-boundary
  searches find positions only outside backticks.
- `_bq_guardrail_inputs` (bare-name pass + forbidden-table pass)
  searches against the masked SQL.
- `_rewrite_bq_table_refs_to_native` Pass 1 splits the SQL on
  `(\`[^\`]*\`)` and rewrites only the outside-backtick chunks. Pass
  2 (`bq."ds"."tbl"` → backtick form) is unchanged — its prefix can't
  appear inside backticks.

Adds three regressions covering the rewrite + guardrail paths.
This commit is contained in:
ZdenekSrotyr 2026-05-06 17:20:33 +02:00
parent 1b49de1568
commit 720a2180c0
2 changed files with 158 additions and 4 deletions

View file

@ -90,6 +90,29 @@ BQ_PATH = re.compile(
)
# Issue #201 — full backtick BQ path `<project>.<dataset>.<table>` in user
# SQL. Used by the registry-gating pass and (via `_mask_backticks`) to keep
# bare-name regexes from firing inside backtick-quoted segments.
_BACKTICK_SEGMENT = re.compile(r'`[^`]*`')
_BACKTICK_FULL_PATH = re.compile(r'`([^.`]+)\.([^.`]+)\.([^.`]+)`')
def _mask_backticks(sql: str) -> str:
"""Replace each `…`-quoted segment with spaces of equal length so
word-boundary regexes find positions outside backticks but ignore
everything inside. Preserves all character offsets so ``re.search``
on the masked string returns matches at the same positions as on the
original.
Issue #201: `\\b` matches inside backtick segments because both `.`
and `` ` `` are non-word characters. A registered bare-name like
``unit_economics`` would otherwise match inside a user-supplied full
backtick path ``\\`<project>.<dataset>.unit_economics\\``` and get
falsely rewritten corrupting the user's intended SQL.
"""
return _BACKTICK_SEGMENT.sub(lambda m: ' ' * len(m.group(0)), sql)
def _default_remote_query_cap_bytes() -> int:
"""5 GiB default cap on /api/query BQ-touching scans. Configurable via
`data_source.bigquery.bq_max_scan_bytes` in /admin/server-config
@ -197,11 +220,18 @@ def execute_query(
if r.get("name") and r.get("id") in allowed_ids
}
# Check if query references any forbidden tables (word-boundary match)
# Check if query references any forbidden tables (word-boundary
# match). Issue #201: mask backtick segments so `\b` doesn't
# falsely fire inside a user-supplied full backtick path like
# `<project>.<dataset>.<table>` whose final segment happens to
# collide with a forbidden master view name. The full-path
# registry-gate downstream is the proper authorization check
# for those.
sql_lower_masked = _mask_backticks(sql_lower)
forbidden = all_views - allowed_view_names
for table in forbidden:
pattern = r'\b' + re.escape(table.lower()) + r'\b'
if re.search(pattern, sql_lower):
if re.search(pattern, sql_lower_masked):
raise HTTPException(status_code=403, detail=f"Access denied to table '{table}'")
# ---- #160 BQ remote-row guardrail + RBAC patch -------------------
@ -467,6 +497,11 @@ def _bq_guardrail_inputs(
name_lookups: list = []
seen_paths: set = set()
accessible_set = set(allowed) if allowed is not None else None
# Issue #201: mask backtick segments so a registered bare name like
# `unit_economics` doesn't false-positive on a user-supplied full
# backtick path `<project>.<dataset>.unit_economics`. The full-path
# pass below registry-gates those properly.
sql_lower_masked = _mask_backticks(sql_lower)
for r in repo.list_by_source("bigquery"):
if (r.get("query_mode") or "") != "remote":
continue
@ -481,7 +516,7 @@ def _bq_guardrail_inputs(
# before we get here. Defensive skip.
continue
pattern = r'\b' + re.escape(str(name).lower()) + r'\b'
if re.search(pattern, sql_lower):
if re.search(pattern, sql_lower_masked):
key = (bucket.lower(), source_table.lower())
if key not in seen_paths:
seen_paths.add(key)
@ -579,6 +614,15 @@ def _rewrite_bq_table_refs_to_native(
# name up in a case-insensitive dict. Single-pass means freshly
# inserted backticked text isn't re-scanned, fixing the
# project-ID-contains-name corruption (Devin Review on query.py:464).
#
# Issue #201: split the SQL on `…` segments and rewrite ONLY in the
# outside-backtick chunks. Without this, a user-supplied full backtick
# path like ``\\`<project>.<dataset>.unit_economics\\``` whose final
# segment matches a registered bare name would have the bare-name
# regex fire INSIDE the backticks (since `\\b` treats both `.` and
# `` ` `` as non-word boundaries), producing malformed nested
# backticks. Splitting confines the rewrite to user identifier
# positions where bare-name resolution is the intended behaviour.
if name_lookups:
# Map name (lower-cased) → backticked target. Names are
# case-insensitive on the input side per the existing helper
@ -598,7 +642,15 @@ def _rewrite_bq_table_refs_to_native(
def _name_repl(m: re.Match) -> str:
return name_to_target[m.group(1).lower()]
out = re.sub(pattern, _name_repl, out, flags=re.IGNORECASE)
# `re.split` with a captured group returns: [outside, backtick,
# outside, backtick, …]. Even indices are outside-backtick chunks
# eligible for bare-name rewrite; odd indices are full backtick
# segments preserved verbatim.
parts = re.split(r'(`[^`]*`)', out)
for i, part in enumerate(parts):
if i % 2 == 0:
parts[i] = re.sub(pattern, _name_repl, part, flags=re.IGNORECASE)
out = "".join(parts)
# Pass 2: bq."ds"."tbl" / bq.ds.tbl → `<project>.<ds>.<tbl>`.
def _bq_path_repl(m: re.Match) -> str:

View file

@ -432,3 +432,105 @@ def test_rewrite_helper_is_case_insensitive_on_bare_names():
)
assert "`p.fin.ue` WHERE `p.fin.ue`.id" in rewritten or \
rewritten.lower().count("`p.fin.ue`") == 2
# ---------------------------------------------------------------------------
# Issue #201: rewriter must NOT touch text inside `…` backtick segments.
# A user-supplied full BQ-native path `<project>.<dataset>.<table>` whose
# table segment matches a registered bare name was being re-substituted
# inside the backticks, producing malformed nested-backtick SQL that BQ
# rejected with a parse error.
# ---------------------------------------------------------------------------
def test_rewrite_skips_inside_backtick_path():
"""Full backtick BQ path is preserved byte-for-byte even when its
final segment matches a registered bare-name alias."""
from app.api.query import _rewrite_user_sql_for_bq_dry_run
sql = (
"SELECT * FROM `my-prj.finance.unit_economics` "
"WHERE country = 'CZ'"
)
rewritten = _rewrite_user_sql_for_bq_dry_run(
sql=sql,
name_lookups=[("unit_economics", "finance", "unit_economics")],
project="my-prj",
)
# No corruption — input is already BQ-native, rewriter is a no-op here.
assert rewritten == sql, (
f"backtick path was rewritten:\n in : {sql!r}\n out: {rewritten!r}"
)
# Sanity: the malformed nested form must NOT appear.
assert "`my-prj.finance.`my-prj" not in rewritten
def test_rewrite_skips_inside_backtick_with_outside_bare_name():
"""Mixed SQL: a bare name outside backticks is rewritten as before,
but an identically-named segment inside a backtick path is left
alone."""
from app.api.query import _rewrite_user_sql_for_bq_dry_run
sql = (
"SELECT a.id, b.col FROM ue a "
"JOIN `my-prj.finance.ue` b ON a.id = b.id"
)
rewritten = _rewrite_user_sql_for_bq_dry_run(
sql=sql,
name_lookups=[("ue", "fin_alias", "ue_alias")],
project="my-prj",
)
# Outside-backtick `ue` rewrites to the registered alias path.
assert "`my-prj.fin_alias.ue_alias`" in rewritten
# The user-supplied backtick path is preserved verbatim.
assert "`my-prj.finance.ue`" in rewritten
# The malformed nested form must NOT appear.
assert "`my-prj.finance.`my-prj.fin_alias.ue_alias`" not in rewritten
def test_guardrail_skips_bare_name_match_inside_backticks(
seeded_app, mock_dry_run, monkeypatch,
):
"""The `name_lookups` collection populated by `_bq_guardrail_inputs`
must not include a registered name when the only place that name
appears in the SQL is inside a `` backtick segment.
Captures the rewritten SQL the guardrail forwards to the dry-run and
asserts the bare-name was NOT substituted inside the user's backtick
path.
"""
_register_bq_remote_row("unit_economics", "finance", "unit_economics")
captured = {"sql": None}
def capturing_fake(_bq, sql):
captured["sql"] = sql
return 1024
monkeypatch.setattr(
"app.api.query._bq_dry_run_bytes", capturing_fake, raising=False,
)
c = seeded_app["client"]
token = seeded_app["admin_token"]
user_sql = (
"SELECT * FROM `test-data-prj.finance.unit_economics` "
"WHERE country = 'CZ'"
)
c.post("/api/query", json={"sql": user_sql}, headers=_auth(token))
sent = captured["sql"]
if sent is None:
# Guardrail decided no BQ tables were referenced — that's also
# an acceptable "no false-positive" outcome (Layer 3 will cover
# the explicit registry check for full backtick paths). We just
# need to ensure the bare-name regex didn't fire.
return
# The user's exact backtick path must survive verbatim — no nested
# backticks introduced by a stray bare-name rewrite.
assert "`test-data-prj.finance.unit_economics`" in sent, (
f"backtick path corrupted by guardrail:\n out: {sent!r}"
)
assert "`test-data-prj.finance.`test-data-prj" not in sent, (
f"nested-backtick corruption signature present: {sent!r}"
)