fix(query): rewriter respects backtick segments (#201)
`agnes query --remote` corrupted user SQL when the request contained a full BigQuery backtick path (`<project>.<dataset>.<table>`) whose table segment matched a registered bare-name alias. The bare-name rewriter used `\b` word-boundary matching against the lower-cased SQL; both `.` and `` ` `` are non-word characters, so the regex fired INSIDE the user's backtick path and produced malformed nested-backtick SQL that BigQuery rejected at parse time. Fix: - Add `_mask_backticks(sql)` helper: replace each `…` segment with spaces of equal length, preserving offsets so word-boundary searches find positions only outside backticks. - `_bq_guardrail_inputs` (bare-name pass + forbidden-table pass) searches against the masked SQL. - `_rewrite_bq_table_refs_to_native` Pass 1 splits the SQL on `(\`[^\`]*\`)` and rewrites only the outside-backtick chunks. Pass 2 (`bq."ds"."tbl"` → backtick form) is unchanged — its prefix can't appear inside backticks. Adds three regressions covering the rewrite + guardrail paths.
This commit is contained in:
parent
1b49de1568
commit
720a2180c0
2 changed files with 158 additions and 4 deletions
|
|
@ -90,6 +90,29 @@ BQ_PATH = re.compile(
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# Issue #201 — full backtick BQ path `<project>.<dataset>.<table>` in user
|
||||||
|
# SQL. Used by the registry-gating pass and (via `_mask_backticks`) to keep
|
||||||
|
# bare-name regexes from firing inside backtick-quoted segments.
|
||||||
|
_BACKTICK_SEGMENT = re.compile(r'`[^`]*`')
|
||||||
|
_BACKTICK_FULL_PATH = re.compile(r'`([^.`]+)\.([^.`]+)\.([^.`]+)`')
|
||||||
|
|
||||||
|
|
||||||
|
def _mask_backticks(sql: str) -> str:
|
||||||
|
"""Replace each `…`-quoted segment with spaces of equal length so
|
||||||
|
word-boundary regexes find positions outside backticks but ignore
|
||||||
|
everything inside. Preserves all character offsets so ``re.search``
|
||||||
|
on the masked string returns matches at the same positions as on the
|
||||||
|
original.
|
||||||
|
|
||||||
|
Issue #201: `\\b` matches inside backtick segments because both `.`
|
||||||
|
and `` ` `` are non-word characters. A registered bare-name like
|
||||||
|
``unit_economics`` would otherwise match inside a user-supplied full
|
||||||
|
backtick path ``\\`<project>.<dataset>.unit_economics\\``` and get
|
||||||
|
falsely rewritten — corrupting the user's intended SQL.
|
||||||
|
"""
|
||||||
|
return _BACKTICK_SEGMENT.sub(lambda m: ' ' * len(m.group(0)), sql)
|
||||||
|
|
||||||
|
|
||||||
def _default_remote_query_cap_bytes() -> int:
|
def _default_remote_query_cap_bytes() -> int:
|
||||||
"""5 GiB default cap on /api/query BQ-touching scans. Configurable via
|
"""5 GiB default cap on /api/query BQ-touching scans. Configurable via
|
||||||
`data_source.bigquery.bq_max_scan_bytes` in /admin/server-config —
|
`data_source.bigquery.bq_max_scan_bytes` in /admin/server-config —
|
||||||
|
|
@ -197,11 +220,18 @@ def execute_query(
|
||||||
if r.get("name") and r.get("id") in allowed_ids
|
if r.get("name") and r.get("id") in allowed_ids
|
||||||
}
|
}
|
||||||
|
|
||||||
# Check if query references any forbidden tables (word-boundary match)
|
# Check if query references any forbidden tables (word-boundary
|
||||||
|
# match). Issue #201: mask backtick segments so `\b` doesn't
|
||||||
|
# falsely fire inside a user-supplied full backtick path like
|
||||||
|
# `<project>.<dataset>.<table>` whose final segment happens to
|
||||||
|
# collide with a forbidden master view name. The full-path
|
||||||
|
# registry-gate downstream is the proper authorization check
|
||||||
|
# for those.
|
||||||
|
sql_lower_masked = _mask_backticks(sql_lower)
|
||||||
forbidden = all_views - allowed_view_names
|
forbidden = all_views - allowed_view_names
|
||||||
for table in forbidden:
|
for table in forbidden:
|
||||||
pattern = r'\b' + re.escape(table.lower()) + r'\b'
|
pattern = r'\b' + re.escape(table.lower()) + r'\b'
|
||||||
if re.search(pattern, sql_lower):
|
if re.search(pattern, sql_lower_masked):
|
||||||
raise HTTPException(status_code=403, detail=f"Access denied to table '{table}'")
|
raise HTTPException(status_code=403, detail=f"Access denied to table '{table}'")
|
||||||
|
|
||||||
# ---- #160 BQ remote-row guardrail + RBAC patch -------------------
|
# ---- #160 BQ remote-row guardrail + RBAC patch -------------------
|
||||||
|
|
@ -467,6 +497,11 @@ def _bq_guardrail_inputs(
|
||||||
name_lookups: list = []
|
name_lookups: list = []
|
||||||
seen_paths: set = set()
|
seen_paths: set = set()
|
||||||
accessible_set = set(allowed) if allowed is not None else None
|
accessible_set = set(allowed) if allowed is not None else None
|
||||||
|
# Issue #201: mask backtick segments so a registered bare name like
|
||||||
|
# `unit_economics` doesn't false-positive on a user-supplied full
|
||||||
|
# backtick path `<project>.<dataset>.unit_economics`. The full-path
|
||||||
|
# pass below registry-gates those properly.
|
||||||
|
sql_lower_masked = _mask_backticks(sql_lower)
|
||||||
for r in repo.list_by_source("bigquery"):
|
for r in repo.list_by_source("bigquery"):
|
||||||
if (r.get("query_mode") or "") != "remote":
|
if (r.get("query_mode") or "") != "remote":
|
||||||
continue
|
continue
|
||||||
|
|
@ -481,7 +516,7 @@ def _bq_guardrail_inputs(
|
||||||
# before we get here. Defensive skip.
|
# before we get here. Defensive skip.
|
||||||
continue
|
continue
|
||||||
pattern = r'\b' + re.escape(str(name).lower()) + r'\b'
|
pattern = r'\b' + re.escape(str(name).lower()) + r'\b'
|
||||||
if re.search(pattern, sql_lower):
|
if re.search(pattern, sql_lower_masked):
|
||||||
key = (bucket.lower(), source_table.lower())
|
key = (bucket.lower(), source_table.lower())
|
||||||
if key not in seen_paths:
|
if key not in seen_paths:
|
||||||
seen_paths.add(key)
|
seen_paths.add(key)
|
||||||
|
|
@ -579,6 +614,15 @@ def _rewrite_bq_table_refs_to_native(
|
||||||
# name up in a case-insensitive dict. Single-pass means freshly
|
# name up in a case-insensitive dict. Single-pass means freshly
|
||||||
# inserted backticked text isn't re-scanned, fixing the
|
# inserted backticked text isn't re-scanned, fixing the
|
||||||
# project-ID-contains-name corruption (Devin Review on query.py:464).
|
# project-ID-contains-name corruption (Devin Review on query.py:464).
|
||||||
|
#
|
||||||
|
# Issue #201: split the SQL on `…` segments and rewrite ONLY in the
|
||||||
|
# outside-backtick chunks. Without this, a user-supplied full backtick
|
||||||
|
# path like ``\\`<project>.<dataset>.unit_economics\\``` whose final
|
||||||
|
# segment matches a registered bare name would have the bare-name
|
||||||
|
# regex fire INSIDE the backticks (since `\\b` treats both `.` and
|
||||||
|
# `` ` `` as non-word boundaries), producing malformed nested
|
||||||
|
# backticks. Splitting confines the rewrite to user identifier
|
||||||
|
# positions where bare-name resolution is the intended behaviour.
|
||||||
if name_lookups:
|
if name_lookups:
|
||||||
# Map name (lower-cased) → backticked target. Names are
|
# Map name (lower-cased) → backticked target. Names are
|
||||||
# case-insensitive on the input side per the existing helper
|
# case-insensitive on the input side per the existing helper
|
||||||
|
|
@ -598,7 +642,15 @@ def _rewrite_bq_table_refs_to_native(
|
||||||
def _name_repl(m: re.Match) -> str:
|
def _name_repl(m: re.Match) -> str:
|
||||||
return name_to_target[m.group(1).lower()]
|
return name_to_target[m.group(1).lower()]
|
||||||
|
|
||||||
out = re.sub(pattern, _name_repl, out, flags=re.IGNORECASE)
|
# `re.split` with a captured group returns: [outside, backtick,
|
||||||
|
# outside, backtick, …]. Even indices are outside-backtick chunks
|
||||||
|
# eligible for bare-name rewrite; odd indices are full backtick
|
||||||
|
# segments preserved verbatim.
|
||||||
|
parts = re.split(r'(`[^`]*`)', out)
|
||||||
|
for i, part in enumerate(parts):
|
||||||
|
if i % 2 == 0:
|
||||||
|
parts[i] = re.sub(pattern, _name_repl, part, flags=re.IGNORECASE)
|
||||||
|
out = "".join(parts)
|
||||||
|
|
||||||
# Pass 2: bq."ds"."tbl" / bq.ds.tbl → `<project>.<ds>.<tbl>`.
|
# Pass 2: bq."ds"."tbl" / bq.ds.tbl → `<project>.<ds>.<tbl>`.
|
||||||
def _bq_path_repl(m: re.Match) -> str:
|
def _bq_path_repl(m: re.Match) -> str:
|
||||||
|
|
|
||||||
|
|
@ -432,3 +432,105 @@ def test_rewrite_helper_is_case_insensitive_on_bare_names():
|
||||||
)
|
)
|
||||||
assert "`p.fin.ue` WHERE `p.fin.ue`.id" in rewritten or \
|
assert "`p.fin.ue` WHERE `p.fin.ue`.id" in rewritten or \
|
||||||
rewritten.lower().count("`p.fin.ue`") == 2
|
rewritten.lower().count("`p.fin.ue`") == 2
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Issue #201: rewriter must NOT touch text inside `…` backtick segments.
|
||||||
|
# A user-supplied full BQ-native path `<project>.<dataset>.<table>` whose
|
||||||
|
# table segment matches a registered bare name was being re-substituted
|
||||||
|
# inside the backticks, producing malformed nested-backtick SQL that BQ
|
||||||
|
# rejected with a parse error.
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def test_rewrite_skips_inside_backtick_path():
|
||||||
|
"""Full backtick BQ path is preserved byte-for-byte even when its
|
||||||
|
final segment matches a registered bare-name alias."""
|
||||||
|
from app.api.query import _rewrite_user_sql_for_bq_dry_run
|
||||||
|
|
||||||
|
sql = (
|
||||||
|
"SELECT * FROM `my-prj.finance.unit_economics` "
|
||||||
|
"WHERE country = 'CZ'"
|
||||||
|
)
|
||||||
|
rewritten = _rewrite_user_sql_for_bq_dry_run(
|
||||||
|
sql=sql,
|
||||||
|
name_lookups=[("unit_economics", "finance", "unit_economics")],
|
||||||
|
project="my-prj",
|
||||||
|
)
|
||||||
|
# No corruption — input is already BQ-native, rewriter is a no-op here.
|
||||||
|
assert rewritten == sql, (
|
||||||
|
f"backtick path was rewritten:\n in : {sql!r}\n out: {rewritten!r}"
|
||||||
|
)
|
||||||
|
# Sanity: the malformed nested form must NOT appear.
|
||||||
|
assert "`my-prj.finance.`my-prj" not in rewritten
|
||||||
|
|
||||||
|
|
||||||
|
def test_rewrite_skips_inside_backtick_with_outside_bare_name():
|
||||||
|
"""Mixed SQL: a bare name outside backticks is rewritten as before,
|
||||||
|
but an identically-named segment inside a backtick path is left
|
||||||
|
alone."""
|
||||||
|
from app.api.query import _rewrite_user_sql_for_bq_dry_run
|
||||||
|
|
||||||
|
sql = (
|
||||||
|
"SELECT a.id, b.col FROM ue a "
|
||||||
|
"JOIN `my-prj.finance.ue` b ON a.id = b.id"
|
||||||
|
)
|
||||||
|
rewritten = _rewrite_user_sql_for_bq_dry_run(
|
||||||
|
sql=sql,
|
||||||
|
name_lookups=[("ue", "fin_alias", "ue_alias")],
|
||||||
|
project="my-prj",
|
||||||
|
)
|
||||||
|
# Outside-backtick `ue` rewrites to the registered alias path.
|
||||||
|
assert "`my-prj.fin_alias.ue_alias`" in rewritten
|
||||||
|
# The user-supplied backtick path is preserved verbatim.
|
||||||
|
assert "`my-prj.finance.ue`" in rewritten
|
||||||
|
# The malformed nested form must NOT appear.
|
||||||
|
assert "`my-prj.finance.`my-prj.fin_alias.ue_alias`" not in rewritten
|
||||||
|
|
||||||
|
|
||||||
|
def test_guardrail_skips_bare_name_match_inside_backticks(
|
||||||
|
seeded_app, mock_dry_run, monkeypatch,
|
||||||
|
):
|
||||||
|
"""The `name_lookups` collection populated by `_bq_guardrail_inputs`
|
||||||
|
must not include a registered name when the only place that name
|
||||||
|
appears in the SQL is inside a `…` backtick segment.
|
||||||
|
|
||||||
|
Captures the rewritten SQL the guardrail forwards to the dry-run and
|
||||||
|
asserts the bare-name was NOT substituted inside the user's backtick
|
||||||
|
path.
|
||||||
|
"""
|
||||||
|
_register_bq_remote_row("unit_economics", "finance", "unit_economics")
|
||||||
|
|
||||||
|
captured = {"sql": None}
|
||||||
|
|
||||||
|
def capturing_fake(_bq, sql):
|
||||||
|
captured["sql"] = sql
|
||||||
|
return 1024
|
||||||
|
|
||||||
|
monkeypatch.setattr(
|
||||||
|
"app.api.query._bq_dry_run_bytes", capturing_fake, raising=False,
|
||||||
|
)
|
||||||
|
|
||||||
|
c = seeded_app["client"]
|
||||||
|
token = seeded_app["admin_token"]
|
||||||
|
user_sql = (
|
||||||
|
"SELECT * FROM `test-data-prj.finance.unit_economics` "
|
||||||
|
"WHERE country = 'CZ'"
|
||||||
|
)
|
||||||
|
c.post("/api/query", json={"sql": user_sql}, headers=_auth(token))
|
||||||
|
|
||||||
|
sent = captured["sql"]
|
||||||
|
if sent is None:
|
||||||
|
# Guardrail decided no BQ tables were referenced — that's also
|
||||||
|
# an acceptable "no false-positive" outcome (Layer 3 will cover
|
||||||
|
# the explicit registry check for full backtick paths). We just
|
||||||
|
# need to ensure the bare-name regex didn't fire.
|
||||||
|
return
|
||||||
|
# The user's exact backtick path must survive verbatim — no nested
|
||||||
|
# backticks introduced by a stray bare-name rewrite.
|
||||||
|
assert "`test-data-prj.finance.unit_economics`" in sent, (
|
||||||
|
f"backtick path corrupted by guardrail:\n out: {sent!r}"
|
||||||
|
)
|
||||||
|
assert "`test-data-prj.finance.`test-data-prj" not in sent, (
|
||||||
|
f"nested-backtick corruption signature present: {sent!r}"
|
||||||
|
)
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue