fix(query): rewriter respects backtick segments (#201)
`agnes query --remote` corrupted user SQL when the request contained a full BigQuery backtick path (`<project>.<dataset>.<table>`) whose table segment matched a registered bare-name alias. The bare-name rewriter used `\b` word-boundary matching against the lower-cased SQL; both `.` and `` ` `` are non-word characters, so the regex fired INSIDE the user's backtick path and produced malformed nested-backtick SQL that BigQuery rejected at parse time. Fix: - Add `_mask_backticks(sql)` helper: replace each `…` segment with spaces of equal length, preserving offsets so word-boundary searches find positions only outside backticks. - `_bq_guardrail_inputs` (bare-name pass + forbidden-table pass) searches against the masked SQL. - `_rewrite_bq_table_refs_to_native` Pass 1 splits the SQL on `(\`[^\`]*\`)` and rewrites only the outside-backtick chunks. Pass 2 (`bq."ds"."tbl"` → backtick form) is unchanged — its prefix can't appear inside backticks. Adds three regressions covering the rewrite + guardrail paths.
This commit is contained in:
parent
1b49de1568
commit
720a2180c0
2 changed files with 158 additions and 4 deletions
|
|
@ -90,6 +90,29 @@ BQ_PATH = re.compile(
|
|||
)
|
||||
|
||||
|
||||
# Issue #201 — full backtick BQ path `<project>.<dataset>.<table>` in user
|
||||
# SQL. Used by the registry-gating pass and (via `_mask_backticks`) to keep
|
||||
# bare-name regexes from firing inside backtick-quoted segments.
|
||||
_BACKTICK_SEGMENT = re.compile(r'`[^`]*`')
|
||||
_BACKTICK_FULL_PATH = re.compile(r'`([^.`]+)\.([^.`]+)\.([^.`]+)`')
|
||||
|
||||
|
||||
def _mask_backticks(sql: str) -> str:
|
||||
"""Replace each `…`-quoted segment with spaces of equal length so
|
||||
word-boundary regexes find positions outside backticks but ignore
|
||||
everything inside. Preserves all character offsets so ``re.search``
|
||||
on the masked string returns matches at the same positions as on the
|
||||
original.
|
||||
|
||||
Issue #201: `\\b` matches inside backtick segments because both `.`
|
||||
and `` ` `` are non-word characters. A registered bare-name like
|
||||
``unit_economics`` would otherwise match inside a user-supplied full
|
||||
backtick path ``\\`<project>.<dataset>.unit_economics\\``` and get
|
||||
falsely rewritten — corrupting the user's intended SQL.
|
||||
"""
|
||||
return _BACKTICK_SEGMENT.sub(lambda m: ' ' * len(m.group(0)), sql)
|
||||
|
||||
|
||||
def _default_remote_query_cap_bytes() -> int:
|
||||
"""5 GiB default cap on /api/query BQ-touching scans. Configurable via
|
||||
`data_source.bigquery.bq_max_scan_bytes` in /admin/server-config —
|
||||
|
|
@ -197,11 +220,18 @@ def execute_query(
|
|||
if r.get("name") and r.get("id") in allowed_ids
|
||||
}
|
||||
|
||||
# Check if query references any forbidden tables (word-boundary match)
|
||||
# Check if query references any forbidden tables (word-boundary
|
||||
# match). Issue #201: mask backtick segments so `\b` doesn't
|
||||
# falsely fire inside a user-supplied full backtick path like
|
||||
# `<project>.<dataset>.<table>` whose final segment happens to
|
||||
# collide with a forbidden master view name. The full-path
|
||||
# registry-gate downstream is the proper authorization check
|
||||
# for those.
|
||||
sql_lower_masked = _mask_backticks(sql_lower)
|
||||
forbidden = all_views - allowed_view_names
|
||||
for table in forbidden:
|
||||
pattern = r'\b' + re.escape(table.lower()) + r'\b'
|
||||
if re.search(pattern, sql_lower):
|
||||
if re.search(pattern, sql_lower_masked):
|
||||
raise HTTPException(status_code=403, detail=f"Access denied to table '{table}'")
|
||||
|
||||
# ---- #160 BQ remote-row guardrail + RBAC patch -------------------
|
||||
|
|
@ -467,6 +497,11 @@ def _bq_guardrail_inputs(
|
|||
name_lookups: list = []
|
||||
seen_paths: set = set()
|
||||
accessible_set = set(allowed) if allowed is not None else None
|
||||
# Issue #201: mask backtick segments so a registered bare name like
|
||||
# `unit_economics` doesn't false-positive on a user-supplied full
|
||||
# backtick path `<project>.<dataset>.unit_economics`. The full-path
|
||||
# pass below registry-gates those properly.
|
||||
sql_lower_masked = _mask_backticks(sql_lower)
|
||||
for r in repo.list_by_source("bigquery"):
|
||||
if (r.get("query_mode") or "") != "remote":
|
||||
continue
|
||||
|
|
@ -481,7 +516,7 @@ def _bq_guardrail_inputs(
|
|||
# before we get here. Defensive skip.
|
||||
continue
|
||||
pattern = r'\b' + re.escape(str(name).lower()) + r'\b'
|
||||
if re.search(pattern, sql_lower):
|
||||
if re.search(pattern, sql_lower_masked):
|
||||
key = (bucket.lower(), source_table.lower())
|
||||
if key not in seen_paths:
|
||||
seen_paths.add(key)
|
||||
|
|
@ -579,6 +614,15 @@ def _rewrite_bq_table_refs_to_native(
|
|||
# name up in a case-insensitive dict. Single-pass means freshly
|
||||
# inserted backticked text isn't re-scanned, fixing the
|
||||
# project-ID-contains-name corruption (Devin Review on query.py:464).
|
||||
#
|
||||
# Issue #201: split the SQL on `…` segments and rewrite ONLY in the
|
||||
# outside-backtick chunks. Without this, a user-supplied full backtick
|
||||
# path like ``\\`<project>.<dataset>.unit_economics\\``` whose final
|
||||
# segment matches a registered bare name would have the bare-name
|
||||
# regex fire INSIDE the backticks (since `\\b` treats both `.` and
|
||||
# `` ` `` as non-word boundaries), producing malformed nested
|
||||
# backticks. Splitting confines the rewrite to user identifier
|
||||
# positions where bare-name resolution is the intended behaviour.
|
||||
if name_lookups:
|
||||
# Map name (lower-cased) → backticked target. Names are
|
||||
# case-insensitive on the input side per the existing helper
|
||||
|
|
@ -598,7 +642,15 @@ def _rewrite_bq_table_refs_to_native(
|
|||
def _name_repl(m: re.Match) -> str:
|
||||
return name_to_target[m.group(1).lower()]
|
||||
|
||||
out = re.sub(pattern, _name_repl, out, flags=re.IGNORECASE)
|
||||
# `re.split` with a captured group returns: [outside, backtick,
|
||||
# outside, backtick, …]. Even indices are outside-backtick chunks
|
||||
# eligible for bare-name rewrite; odd indices are full backtick
|
||||
# segments preserved verbatim.
|
||||
parts = re.split(r'(`[^`]*`)', out)
|
||||
for i, part in enumerate(parts):
|
||||
if i % 2 == 0:
|
||||
parts[i] = re.sub(pattern, _name_repl, part, flags=re.IGNORECASE)
|
||||
out = "".join(parts)
|
||||
|
||||
# Pass 2: bq."ds"."tbl" / bq.ds.tbl → `<project>.<ds>.<tbl>`.
|
||||
def _bq_path_repl(m: re.Match) -> str:
|
||||
|
|
|
|||
|
|
@ -432,3 +432,105 @@ def test_rewrite_helper_is_case_insensitive_on_bare_names():
|
|||
)
|
||||
assert "`p.fin.ue` WHERE `p.fin.ue`.id" in rewritten or \
|
||||
rewritten.lower().count("`p.fin.ue`") == 2
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Issue #201: rewriter must NOT touch text inside `…` backtick segments.
|
||||
# A user-supplied full BQ-native path `<project>.<dataset>.<table>` whose
|
||||
# table segment matches a registered bare name was being re-substituted
|
||||
# inside the backticks, producing malformed nested-backtick SQL that BQ
|
||||
# rejected with a parse error.
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_rewrite_skips_inside_backtick_path():
|
||||
"""Full backtick BQ path is preserved byte-for-byte even when its
|
||||
final segment matches a registered bare-name alias."""
|
||||
from app.api.query import _rewrite_user_sql_for_bq_dry_run
|
||||
|
||||
sql = (
|
||||
"SELECT * FROM `my-prj.finance.unit_economics` "
|
||||
"WHERE country = 'CZ'"
|
||||
)
|
||||
rewritten = _rewrite_user_sql_for_bq_dry_run(
|
||||
sql=sql,
|
||||
name_lookups=[("unit_economics", "finance", "unit_economics")],
|
||||
project="my-prj",
|
||||
)
|
||||
# No corruption — input is already BQ-native, rewriter is a no-op here.
|
||||
assert rewritten == sql, (
|
||||
f"backtick path was rewritten:\n in : {sql!r}\n out: {rewritten!r}"
|
||||
)
|
||||
# Sanity: the malformed nested form must NOT appear.
|
||||
assert "`my-prj.finance.`my-prj" not in rewritten
|
||||
|
||||
|
||||
def test_rewrite_skips_inside_backtick_with_outside_bare_name():
|
||||
"""Mixed SQL: a bare name outside backticks is rewritten as before,
|
||||
but an identically-named segment inside a backtick path is left
|
||||
alone."""
|
||||
from app.api.query import _rewrite_user_sql_for_bq_dry_run
|
||||
|
||||
sql = (
|
||||
"SELECT a.id, b.col FROM ue a "
|
||||
"JOIN `my-prj.finance.ue` b ON a.id = b.id"
|
||||
)
|
||||
rewritten = _rewrite_user_sql_for_bq_dry_run(
|
||||
sql=sql,
|
||||
name_lookups=[("ue", "fin_alias", "ue_alias")],
|
||||
project="my-prj",
|
||||
)
|
||||
# Outside-backtick `ue` rewrites to the registered alias path.
|
||||
assert "`my-prj.fin_alias.ue_alias`" in rewritten
|
||||
# The user-supplied backtick path is preserved verbatim.
|
||||
assert "`my-prj.finance.ue`" in rewritten
|
||||
# The malformed nested form must NOT appear.
|
||||
assert "`my-prj.finance.`my-prj.fin_alias.ue_alias`" not in rewritten
|
||||
|
||||
|
||||
def test_guardrail_skips_bare_name_match_inside_backticks(
|
||||
seeded_app, mock_dry_run, monkeypatch,
|
||||
):
|
||||
"""The `name_lookups` collection populated by `_bq_guardrail_inputs`
|
||||
must not include a registered name when the only place that name
|
||||
appears in the SQL is inside a `…` backtick segment.
|
||||
|
||||
Captures the rewritten SQL the guardrail forwards to the dry-run and
|
||||
asserts the bare-name was NOT substituted inside the user's backtick
|
||||
path.
|
||||
"""
|
||||
_register_bq_remote_row("unit_economics", "finance", "unit_economics")
|
||||
|
||||
captured = {"sql": None}
|
||||
|
||||
def capturing_fake(_bq, sql):
|
||||
captured["sql"] = sql
|
||||
return 1024
|
||||
|
||||
monkeypatch.setattr(
|
||||
"app.api.query._bq_dry_run_bytes", capturing_fake, raising=False,
|
||||
)
|
||||
|
||||
c = seeded_app["client"]
|
||||
token = seeded_app["admin_token"]
|
||||
user_sql = (
|
||||
"SELECT * FROM `test-data-prj.finance.unit_economics` "
|
||||
"WHERE country = 'CZ'"
|
||||
)
|
||||
c.post("/api/query", json={"sql": user_sql}, headers=_auth(token))
|
||||
|
||||
sent = captured["sql"]
|
||||
if sent is None:
|
||||
# Guardrail decided no BQ tables were referenced — that's also
|
||||
# an acceptable "no false-positive" outcome (Layer 3 will cover
|
||||
# the explicit registry check for full backtick paths). We just
|
||||
# need to ensure the bare-name regex didn't fire.
|
||||
return
|
||||
# The user's exact backtick path must survive verbatim — no nested
|
||||
# backticks introduced by a stray bare-name rewrite.
|
||||
assert "`test-data-prj.finance.unit_economics`" in sent, (
|
||||
f"backtick path corrupted by guardrail:\n out: {sent!r}"
|
||||
)
|
||||
assert "`test-data-prj.finance.`test-data-prj" not in sent, (
|
||||
f"nested-backtick corruption signature present: {sent!r}"
|
||||
)
|
||||
|
|
|
|||
Loading…
Reference in a new issue