Merge pull request #208 from keboola/zs/issue-201-rewriter-backtick

fix(query): rewriter respects backtick paths; tighten cap-guard fallback (#201)
This commit is contained in:
ZdenekSrotyr 2026-05-06 23:09:43 +02:00 committed by GitHub
commit e3494607bf
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 586 additions and 87 deletions

View file

@ -10,6 +10,40 @@ CalVer image tags (`stable-YYYY.MM.N`, `dev-YYYY.MM.N`) are produced for every C
## [Unreleased]
## [0.42.0] — 2026-05-06
### Fixed
- `agnes query --remote`: full backtick BigQuery paths in user SQL are no
longer corrupted by the registered-name rewriter. Previously a query
like ``SELECT … FROM `<project>.<dataset>.<table>` WHERE …`` whose
table name happened to be registered as a bare-name alias would have
the alias re-substituted *inside* the backtick path, producing
malformed SQL that BigQuery rejected with a parse error. The cap-guard
then fell back to a filter-less `SELECT *` size estimate (often orders
of magnitude larger than the real scan), blocking the query as
`remote_scan_too_large`. Issue #201.
### Changed
- `agnes query --remote`: cap-guard fallback no longer estimates from
a synthetic `SELECT *` when the rewritten SQL fails dry-run. It first
retries the user's original SQL (handles BQ-native input cleanly), and
only when *that* also fails returns a structured `remote_estimate_failed`
HTTP 400 with a hint instead of silently over-estimating.
- **BREAKING (clients matching error kinds)**: failure to estimate
remote-query scan size now returns `kind="remote_estimate_failed"`
instead of being masked as `remote_scan_too_large` caused by
over-estimation. Operators that grep for the old kind in dashboards
should update.
### Security
- `agnes query --remote`: full backtick BigQuery paths are now
registry-gated identically to `bq."<dataset>"."<table>"` syntax.
Previously, full backtick paths bypassed Agnes RBAC entirely — only
the configured service account scope limited what users could query.
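Adds new error reasons: `bq_path_cross_project` (the path's project ≠
the configured data project), `bq_path_not_registered` (the path does
not point at a registered table), and `bq_path_access_denied` (the
caller holds no grant on the registered table). Issue #201.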
## [0.41.0] — 2026-05-06
### Fixed

View file

@ -90,6 +90,29 @@ BQ_PATH = re.compile(
)
# Issue #201 — full backtick BQ path `<project>.<dataset>.<table>` in user
# SQL. Used by the registry-gating pass and (via `_mask_backticks`) to keep
# bare-name regexes from firing inside backtick-quoted segments.
_BACKTICK_SEGMENT = re.compile(r'`[^`]*`')
_BACKTICK_FULL_PATH = re.compile(r'`([^.`]+)\.([^.`]+)\.([^.`]+)`')
def _mask_backticks(sql: str) -> str:
"""Replace each `…`-quoted segment with spaces of equal length so
word-boundary regexes find positions outside backticks but ignore
everything inside. Preserves all character offsets so ``re.search``
on the masked string returns matches at the same positions as on the
original.
Issue #201: `\\b` matches inside backtick segments because both `.`
and `` ` `` are non-word characters. A registered bare-name like
``unit_economics`` would otherwise match inside a user-supplied full
backtick path ``\\`<project>.<dataset>.unit_economics\\``` and get
falsely rewritten corrupting the user's intended SQL.
"""
return _BACKTICK_SEGMENT.sub(lambda m: ' ' * len(m.group(0)), sql)
def _default_remote_query_cap_bytes() -> int:
"""5 GiB default cap on /api/query BQ-touching scans. Configurable via
`data_source.bigquery.bq_max_scan_bytes` in /admin/server-config
@ -197,11 +220,18 @@ def execute_query(
if r.get("name") and r.get("id") in allowed_ids
}
# Check if query references any forbidden tables (word-boundary match)
# Check if query references any forbidden tables (word-boundary
# match). Issue #201: mask backtick segments so `\b` doesn't
# falsely fire inside a user-supplied full backtick path like
# `<project>.<dataset>.<table>` whose final segment happens to
# collide with a forbidden master view name. The full-path
# registry-gate downstream is the proper authorization check
# for those.
sql_lower_masked = _mask_backticks(sql_lower)
forbidden = all_views - allowed_view_names
for table in forbidden:
pattern = r'\b' + re.escape(table.lower()) + r'\b'
if re.search(pattern, sql_lower):
if re.search(pattern, sql_lower_masked):
raise HTTPException(status_code=403, detail=f"Access denied to table '{table}'")
# ---- #160 BQ remote-row guardrail + RBAC patch -------------------
@ -467,6 +497,11 @@ def _bq_guardrail_inputs(
name_lookups: list = []
seen_paths: set = set()
accessible_set = set(allowed) if allowed is not None else None
# Issue #201: mask backtick segments so a registered bare name like
# `unit_economics` doesn't false-positive on a user-supplied full
# backtick path `<project>.<dataset>.unit_economics`. The full-path
# pass below registry-gates those properly.
sql_lower_masked = _mask_backticks(sql_lower)
for r in repo.list_by_source("bigquery"):
if (r.get("query_mode") or "") != "remote":
continue
@ -481,7 +516,7 @@ def _bq_guardrail_inputs(
# before we get here. Defensive skip.
continue
pattern = r'\b' + re.escape(str(name).lower()) + r'\b'
if re.search(pattern, sql_lower):
if re.search(pattern, sql_lower_masked):
key = (bucket.lower(), source_table.lower())
if key not in seen_paths:
seen_paths.add(key)
@ -529,6 +564,66 @@ def _bq_guardrail_inputs(
seen_paths.add(key)
dry_run.append((bucket, source_table, 0))
# 3. Full backtick path `<project>.<dataset>.<table>` pass (issue #201).
# Pre-#201 these bypassed Agnes RBAC entirely — only the configured
# service account scope limited which tables a user could reach. Gate
# them identically to the `bq.<ds>.<tbl>` pass: must match the
# configured data project, must point at a registered row, and the
# caller must hold a grant on that row's id (admin bypasses the grant
# check but still requires registration + project match).
#
# Lazy `get_bq_access()` import via the module-level alias so tests
# can monkeypatch a fake. When BQ isn't configured (no data project),
# fall through silently — full backtick paths can't possibly resolve
# against this instance, so leave them to BQ to reject if a query
# somehow makes it through.
try:
bq = get_bq_access()
data_project = (bq.projects.data or "").strip()
except Exception:
data_project = ""
if data_project:
for m in _BACKTICK_FULL_PATH.finditer(sql):
proj, ds, tbl = m.group(1), m.group(2), m.group(3)
if proj.lower() != data_project.lower():
return [], [], {
"reason": "bq_path_cross_project",
"path": f"`{proj}.{ds}.{tbl}`",
"expected_project": data_project,
"hint": (
"--remote queries can only reference tables in the "
"configured BigQuery data project. Register "
"cross-project tables via `agnes admin "
"register-table` if needed."
),
}
row = repo.find_by_bq_path(ds, tbl)
if row is None:
return [], [], {
"reason": "bq_path_not_registered",
"path": f"`{proj}.{ds}.{tbl}`",
"hint": (
"Direct BigQuery paths must point to a registered "
"table. Register via `agnes admin register-table` "
"or use the registered name from `agnes catalog`."
),
}
if not is_admin:
if accessible_set is None or row["id"] not in accessible_set:
return [], [], {
"reason": "bq_path_access_denied",
"path": f"`{proj}.{ds}.{tbl}`",
"registered_as": row["name"],
}
bucket = row["bucket"]
source_table = row["source_table"]
if bucket and source_table:
key = (bucket.lower(), source_table.lower())
if key not in seen_paths:
seen_paths.add(key)
dry_run.append((bucket, source_table, 0))
return dry_run, name_lookups, None
@ -579,6 +674,15 @@ def _rewrite_bq_table_refs_to_native(
# name up in a case-insensitive dict. Single-pass means freshly
# inserted backticked text isn't re-scanned, fixing the
# project-ID-contains-name corruption (Devin Review on query.py:464).
#
# Issue #201: split the SQL on `…` segments and rewrite ONLY in the
# outside-backtick chunks. Without this, a user-supplied full backtick
# path like ``\\`<project>.<dataset>.unit_economics\\``` whose final
# segment matches a registered bare name would have the bare-name
# regex fire INSIDE the backticks (since `\\b` treats both `.` and
# `` ` `` as non-word boundaries), producing malformed nested
# backticks. Splitting confines the rewrite to user identifier
# positions where bare-name resolution is the intended behaviour.
if name_lookups:
# Map name (lower-cased) → backticked target. Names are
# case-insensitive on the input side per the existing helper
@ -598,7 +702,15 @@ def _rewrite_bq_table_refs_to_native(
def _name_repl(m: re.Match) -> str:
return name_to_target[m.group(1).lower()]
out = re.sub(pattern, _name_repl, out, flags=re.IGNORECASE)
# `re.split` with a captured group returns: [outside, backtick,
# outside, backtick, …]. Even indices are outside-backtick chunks
# eligible for bare-name rewrite; odd indices are full backtick
# segments preserved verbatim.
parts = re.split(r'(`[^`]*`)', out)
for i, part in enumerate(parts):
if i % 2 == 0:
parts[i] = re.sub(pattern, _name_repl, part, flags=re.IGNORECASE)
out = "".join(parts)
# Pass 2: bq."ds"."tbl" / bq.ds.tbl → `<project>.<ds>.<tbl>`.
def _bq_path_repl(m: re.Match) -> str:
@ -675,8 +787,17 @@ def _rewrite_user_sql_for_bigquery_query(
return user_sql, False
# Find all referenced BQ remote-mode rows (bare-name + direct bq.path).
# Mirrors the non-RBAC parts of `_bq_guardrail_inputs`.
# Mirrors the non-RBAC parts of `_bq_guardrail_inputs`. Issue #201:
# bare-name regex must run against a backtick-masked copy so a
# registered name like ``orders`` doesn't false-positive when it
# appears as the table segment of a user-supplied full backtick path
# like ``\\`<project>.<dataset>.orders\\```. Without masking, the
# cross-source check below would falsely conclude the SQL touches
# both BQ-remote and local sources, dropping every backtick-path
# query into the 50-100× slower ATTACH-catalog fallback. Devin
# Review on PR #208.
sql_lower = user_sql.lower()
sql_lower_masked = _mask_backticks(sql_lower)
name_lookups: list = []
seen_paths: set = set()
@ -715,7 +836,7 @@ def _rewrite_user_sql_for_bigquery_query(
# mix rewritten and non-rewritten BQ paths in one query.
return user_sql, False
pattern = r'\b' + re.escape(str(name).lower()) + r'\b'
if re.search(pattern, sql_lower):
if re.search(pattern, sql_lower_masked):
key = (bucket.lower(), source_table.lower())
if key not in seen_paths:
seen_paths.add(key)
@ -752,7 +873,7 @@ def _rewrite_user_sql_for_bigquery_query(
# Same name registered both BQ-remote and local? Pathological;
# skip as a safety measure.
return user_sql, False
if re.search(r'\b' + re.escape(name_lc) + r'\b', sql_lower):
if re.search(r'\b' + re.escape(name_lc) + r'\b', sql_lower_masked):
logger.info(
"rewrite_skip_cross_source: user SQL references both "
"BQ-remote and local-mode tables; falling back to "
@ -839,28 +960,32 @@ def _bq_quota_and_cap_guard(
partitioned/clustered tables and rejecting narrow queries that BQ
itself would dry-run as a few MB.
Fallback: if BQ rejects the rewritten SQL with a parse-level
``client_error`` (e.g. DuckDB-only syntax like ``::INT`` casts that
don't translate to BQ), fall back to the pre-#171 per-table
SELECT * approach so the cap-guard still functions over-estimate
is preferred over fail-open. Forbidden / upstream errors still
propagate as HTTP 502.
Issue #201 fix: when BQ rejects the rewritten SQL with a parse-level
``bq_bad_request`` (e.g. DuckDB-only syntax like ``::INT`` casts, or
a rewriter bug that broke valid BQ-native input), retry with the
user's ORIGINAL SQL — BQ-native input dry-runs cleanly. If the
original ALSO fails, return a structured `remote_estimate_failed`
HTTP 400 instead of the pre-#201 synthetic ``SELECT *`` per-table
over-estimate. The synthetic fallback threw away user filters and
routinely ballooned to "full table size", blocking legitimate narrow
queries via `remote_scan_too_large`. Forbidden / upstream errors
still propagate as HTTP 502.
Flow:
1. `check_daily_budget` → over-cap users get 429 BEFORE any BQ work.
2. `quota.acquire(user_id)` opened → concurrent-slot held throughout.
3. Single dry-run of rewritten user SQL → `total_bytes`.
On parse error, fall back to per-table SELECT * sum.
On parse error, retry with the user's original SQL.
On second parse error, raise 400 `remote_estimate_failed`.
4. If total > cap → 400 `remote_scan_too_large`.
5. Yield. Caller runs `analytics.execute(...)` + `record_bytes(...)`.
6. On exit, slot released.
Mutates `dry_run_set` in place: the third tuple element (bytes) is
populated so the caller can sum and record bytes against the user's
quota post-flight. Single-dry-run path puts `total_bytes` on the first
entry and zero on the rest (BQ doesn't expose per-table bytes for a
composite query); the caller's `sum(b for _, _, b in dry_run_set)`
still equals `total_bytes`.
quota post-flight. Pin `total_bytes` on entry 0 and zero on the rest —
BQ doesn't expose per-table bytes for a composite query — so
`sum(b for _, _, b in dry_run_set)` still equals `total_bytes`.
"""
quota = _build_quota_tracker()
try:
@ -901,61 +1026,71 @@ def _bq_quota_and_cap_guard(
sql, name_lookups, project,
)
# Try the single-dry-run path first (issue #171). Falls back
# to the per-table SELECT * approach only on BQ parse errors
# (kind="bq_bad_request" — DuckDB-only syntax that BQ can't
# translate). All other BQ errors propagate as 502 below.
# Try the single-dry-run path first (issue #171). On BQ parse
# errors (`bq_bad_request` — typically DuckDB-only syntax the
# rewriter couldn't translate, OR — pre-#201 fix — a
# rewriter-corrupted backtick path) retry the user's ORIGINAL
# SQL: when the user submitted BQ-native SQL, the rewriter is
# the only thing standing between them and a clean dry-run.
# If the original ALSO fails, this is true DuckDB-only syntax
# that BQ cannot estimate — fail fast with a structured
# `remote_estimate_failed` instead of the pre-#201 synthetic
# `SELECT *` over-estimate (which threw away user filters and
# often ballooned to "full table size", blocking legitimate
# narrow queries via `remote_scan_too_large`).
#
# All other BQ errors (forbidden, upstream) propagate as 502.
total_bytes = 0
used_fallback = False
try:
total_bytes = _bq_dry_run_bytes(bq, rewritten_sql)
except BqAccessError as exc:
if exc.kind == "bq_bad_request":
logger.warning(
"BQ dry-run rejected the rewritten SQL "
"(kind=%s, message=%s). Falling back to per-table "
"SELECT * estimate; the cap check will over-estimate "
"scan bytes for this query. Consider rewriting to "
"BQ-native syntax for a tight pre-check.",
exc.kind, exc.message,
)
used_fallback = True
else:
if exc.kind != "bq_bad_request":
raise HTTPException(status_code=502, detail={
"kind": exc.kind,
"message": exc.message,
**(exc.details or {}),
})
if used_fallback:
# Pre-#171 path: estimate per registered table from a
# synthetic SELECT *. Over-estimates partitioned scans but
# never under-estimates, so the cap still bounds risk.
for i, (bucket, source_table, _) in enumerate(dry_run_set):
fallback_sql = (
f"SELECT * FROM `{project}.{bucket}.{source_table}`"
)
try:
est = _bq_dry_run_bytes(bq, fallback_sql)
except BqAccessError as exc:
logger.warning(
"BQ dry-run rejected the rewritten SQL "
"(kind=%s, message=%s). Retrying with the user's "
"original SQL.",
exc.kind, exc.message,
)
try:
total_bytes = _bq_dry_run_bytes(bq, sql)
except BqAccessError as exc2:
if exc2.kind != "bq_bad_request":
raise HTTPException(status_code=502, detail={
"kind": exc.kind,
"message": exc.message,
**(exc.details or {}),
"kind": exc2.kind,
"message": exc2.message,
**(exc2.details or {}),
})
dry_run_set[i] = (bucket, source_table, est)
total_bytes += est
else:
# Single-dry-run path. Distribute the total to dry_run_set
# so the caller's `record_bytes(sum(...))` stays correct.
# Per-table breakdown is unavailable from a composite
# dry-run; pin total to entry 0, zero the rest.
if dry_run_set:
b0, t0, _ = dry_run_set[0]
dry_run_set[0] = (b0, t0, total_bytes)
for i in range(1, len(dry_run_set)):
bi, ti, _ = dry_run_set[i]
dry_run_set[i] = (bi, ti, 0)
raise HTTPException(status_code=400, detail={
"kind": "remote_estimate_failed",
"message": (
"Could not estimate scan size for this query."
),
"hint": (
"Use a registered table name from `agnes "
"catalog`, or write BQ-native SQL with full "
"backtick paths. Pure DuckDB-only syntax is "
"not supported for --remote queries."
),
"underlying": exc2.message,
})
# Distribute the total to dry_run_set so the caller's
# `record_bytes(sum(...))` stays correct. Per-table breakdown
# is unavailable from a composite dry-run; pin total to entry
# 0, zero the rest. (Same accounting symmetry whether the
# bytes came from the rewritten SQL or the original-SQL
# retry.)
if dry_run_set:
b0, t0, _ = dry_run_set[0]
dry_run_set[0] = (b0, t0, total_bytes)
for i in range(1, len(dry_run_set)):
bi, ti, _ = dry_run_set[i]
dry_run_set[i] = (bi, ti, 0)
if cap_bytes > 0 and total_bytes > cap_bytes:
tables = [f"{b}.{t}" for b, t, _ in dry_run_set]

View file

@ -1,6 +1,6 @@
[project]
name = "agnes-the-ai-analyst"
version = "0.41.0"
version = "0.42.0"
description = "Agnes — AI Data Analyst platform for AI analytical systems"
requires-python = ">=3.11,<3.14"
license = "MIT"

View file

@ -242,15 +242,17 @@ def test_guardrail_invokes_dry_run_exactly_once_per_request(
assert "`test-data-prj.marketing.traffic`" in state["last_sql"]
def test_guardrail_falls_back_to_per_table_estimate_on_bq_parse_error(
def test_fallback_tries_original_sql_first(
seeded_app, mock_dry_run, monkeypatch,
):
"""When BQ rejects the rewritten SQL with ``bq_bad_request`` (DuckDB-only
syntax that doesn't translate — e.g. ``::INT`` casts, ``STRPOS``, …),
the cap-guard falls back to the pre-#171 per-table SELECT * approach
so a non-portable query still gets a (loose) cap estimate instead of
fail-opening.
"""
"""Issue #201 — when the rewriter produces SQL that BQ rejects with
`bq_bad_request` but the user's ORIGINAL SQL dry-runs cleanly, the
cap-guard uses the original SQL's byte estimate. No more synthetic
`SELECT *` over-estimate.
Bare-name reference populates `dry_run_set` so the cap-guard
actually fires. Mock returns parse-error on the first call
(rewritten SQL) and small bytes on the second (original)."""
from connectors.bigquery.access import BqAccessError
_register_bq_remote_row("ue", "finance", "ue")
@ -259,10 +261,10 @@ def test_guardrail_falls_back_to_per_table_estimate_on_bq_parse_error(
def fake_dry_run(_bq, sql):
state["calls"].append(sql)
# First call (rewritten user SQL) → BQ parse error.
# First call (rewritten SQL) → BQ parse error.
if len(state["calls"]) == 1:
raise BqAccessError("bq_bad_request", "Syntax error: unexpected '::'")
# Second call (fallback per-table SELECT *) → small bytes, pass cap.
raise BqAccessError("bq_bad_request", "Syntax error: simulated")
# Second call (the user's original SQL) → small, passes cap.
return 4096
monkeypatch.setattr(
@ -271,28 +273,78 @@ def test_guardrail_falls_back_to_per_table_estimate_on_bq_parse_error(
c = seeded_app["client"]
token = seeded_app["admin_token"]
# SQL with DuckDB-only `::INT` cast that BQ would reject.
user_sql = "SELECT order_id FROM ue WHERE country = 'CZ'"
r = c.post("/api/query", json={"sql": user_sql}, headers=_auth(token))
# Two dry-runs: rewritten then original. No third synthetic-SELECT-*
# call.
assert len(state["calls"]) == 2, (
f"expected rewritten + original-SQL retry, got "
f"{len(state['calls'])}: {state['calls']}"
)
assert state["calls"][1] == user_sql, (
f"second call must be the user's ORIGINAL SQL, got "
f"{state['calls'][1]!r}"
)
# The response must NOT be remote_scan_too_large from a synthetic
# over-estimate — 4096 bytes is well under the 5 GiB cap.
if r.status_code == 400:
detail = r.json().get("detail", {})
if isinstance(detail, dict):
assert detail.get("reason") != "remote_scan_too_large", detail
def test_fallback_fails_fast_on_pure_duckdb_syntax(
seeded_app, mock_dry_run, monkeypatch,
):
"""When BOTH the rewritten and original SQL fail with `bq_bad_request`
(true DuckDB-only syntax like `::INT`), return HTTP 400
`remote_estimate_failed` — never silently over-estimate via a
synthetic `SELECT *`."""
from connectors.bigquery.access import BqAccessError
_register_bq_remote_row("ue", "finance", "ue")
state = {"calls": []}
def always_parse_error(_bq, sql):
state["calls"].append(sql)
raise BqAccessError("bq_bad_request", "Syntax error: unexpected '::'")
monkeypatch.setattr(
"app.api.query._bq_dry_run_bytes", always_parse_error, raising=False,
)
c = seeded_app["client"]
token = seeded_app["admin_token"]
r = c.post(
"/api/query",
json={"sql": "SELECT order_id::INT FROM ue WHERE country = 'CZ'"},
headers=_auth(token),
)
# Two dry-runs (rewritten + fallback per-table) before the (failed)
# execute. Status will be a downstream error from analytics.execute()
# since `::INT` doesn't work in DuckDB either against a remote view —
# but the GUARDRAIL must have completed without 5xx-ing.
# Two dry-runs (rewritten + original retry). NO synthetic SELECT * fallback.
assert len(state["calls"]) == 2, (
f"expected 1 rewritten + 1 fallback dry-run, got {len(state['calls'])}: "
f"{state['calls']}"
f"expected 1 rewritten + 1 original-retry, got "
f"{len(state['calls'])}: {state['calls']}"
)
assert "::" in state["calls"][0], "first call should be the rewritten user SQL"
assert state["calls"][1].startswith("SELECT * FROM"), (
"second call should be the per-table fallback"
)
# Whatever HTTP status comes back must NOT be 502 from the guard's
# transport-error path — fallback must absorb the bq_bad_request.
assert r.status_code != 502, r.json()
# No call should be a synthetic ``SELECT * FROM `<project>...```. The
# original-SQL retry contains the user's SELECT clause.
for c_sql in state["calls"]:
# If a call is just a synthetic ``SELECT * FROM `<project>.<bucket>.<table>```
# the user's `WHERE country = 'CZ'` would be missing.
if c_sql.startswith("SELECT * FROM `") and "WHERE" not in c_sql:
raise AssertionError(
f"synthetic SELECT * fallback was used: {c_sql!r}"
)
assert r.status_code == 400, r.json()
detail = r.json().get("detail", {})
assert isinstance(detail, dict), detail
assert detail.get("kind") == "remote_estimate_failed", detail
assert "underlying" in detail, detail
assert "agnes catalog" in detail.get("hint", "").lower() or \
"backtick" in detail.get("hint", "").lower(), detail
def test_guardrail_propagates_502_on_non_parse_bq_errors(
@ -432,3 +484,243 @@ def test_rewrite_helper_is_case_insensitive_on_bare_names():
)
assert "`p.fin.ue` WHERE `p.fin.ue`.id" in rewritten or \
rewritten.lower().count("`p.fin.ue`") == 2
# ---------------------------------------------------------------------------
# Issue #201: rewriter must NOT touch text inside `…` backtick segments.
# A user-supplied full BQ-native path `<project>.<dataset>.<table>` whose
# table segment matches a registered bare name was being re-substituted
# inside the backticks, producing malformed nested-backtick SQL that BQ
# rejected with a parse error.
# ---------------------------------------------------------------------------
def test_rewrite_skips_inside_backtick_path():
    """A full backtick BQ path must come out of the rewriter unchanged,
    even when its last segment equals a registered bare-name alias."""
    from app.api.query import _rewrite_user_sql_for_bq_dry_run

    original_sql = (
        "SELECT * FROM `my-prj.finance.unit_economics` "
        "WHERE country = 'CZ'"
    )
    lookups = [("unit_economics", "finance", "unit_economics")]

    result = _rewrite_user_sql_for_bq_dry_run(
        sql=original_sql,
        name_lookups=lookups,
        project="my-prj",
    )

    # The input is already BQ-native, so the rewrite must be a no-op.
    assert result == original_sql, (
        f"backtick path was rewritten:\n in : {original_sql!r}\n out: {result!r}"
    )
    # The nested-backtick corruption signature must be absent.
    assert "`my-prj.finance.`my-prj" not in result
def test_rewrite_skips_inside_backtick_with_outside_bare_name():
    """Mixed SQL: the bare name outside backticks is still rewritten,
    while the same name appearing as a segment of a backtick path is
    left untouched."""
    from app.api.query import _rewrite_user_sql_for_bq_dry_run

    mixed_sql = (
        "SELECT a.id, b.col FROM ue a "
        "JOIN `my-prj.finance.ue` b ON a.id = b.id"
    )
    lookups = [("ue", "fin_alias", "ue_alias")]

    result = _rewrite_user_sql_for_bq_dry_run(
        sql=mixed_sql,
        name_lookups=lookups,
        project="my-prj",
    )

    # The bare `ue` outside backticks resolves to the registered path.
    assert "`my-prj.fin_alias.ue_alias`" in result
    # The backtick path supplied by the user survives verbatim.
    assert "`my-prj.finance.ue`" in result
    # No nested-backtick corruption.
    assert "`my-prj.finance.`my-prj.fin_alias.ue_alias`" not in result
def test_guardrail_skips_bare_name_match_inside_backticks(
    seeded_app, mock_dry_run, monkeypatch,
):
    """A registered name that appears ONLY inside a backtick segment of
    the SQL must not be picked up as a bare-name reference.

    The test records the SQL handed to the dry-run and checks that the
    user's backtick path arrived without a bare-name substitution inside
    it.
    """
    _register_bq_remote_row("unit_economics", "finance", "unit_economics")

    seen = {"sql": None}

    def record_dry_run(_bq, sql):
        seen["sql"] = sql
        return 1024

    monkeypatch.setattr(
        "app.api.query._bq_dry_run_bytes", record_dry_run, raising=False,
    )

    client = seeded_app["client"]
    headers = _auth(seeded_app["admin_token"])
    query = (
        "SELECT * FROM `test-data-prj.finance.unit_economics` "
        "WHERE country = 'CZ'"
    )
    client.post("/api/query", json={"sql": query}, headers=headers)

    forwarded = seen["sql"]
    # A dry-run that never fired is also accepted here: it means the
    # bare-name regex did not fire. The explicit registry check for full
    # backtick paths is exercised by the Layer 3 tests.
    if forwarded is not None:
        # The exact backtick path must survive, with no nested backticks
        # introduced by a stray bare-name rewrite.
        assert "`test-data-prj.finance.unit_economics`" in forwarded, (
            f"backtick path corrupted by guardrail:\n out: {forwarded!r}"
        )
        assert "`test-data-prj.finance.`test-data-prj" not in forwarded, (
            f"nested-backtick corruption signature present: {forwarded!r}"
        )
# ---------------------------------------------------------------------------
# Issue #201 Layer 3: full backtick BigQuery paths are registry-gated.
# Pre-fix these bypassed Agnes RBAC entirely — only the configured service
# account scope limited which tables a user could reach. Post-fix, they're
# treated identically to `bq."<dataset>"."<table>"` syntax.
# ---------------------------------------------------------------------------
def test_full_backtick_path_unregistered_denied(seeded_app, mock_dry_run):
    """A full backtick path whose project matches the configured data
    project but whose `<dataset>.<table>` is not registered must be
    rejected with HTTP 403 and reason `bq_path_not_registered`."""
    client = seeded_app["client"]
    headers = _auth(seeded_app["admin_token"])
    payload = {
        "sql": (
            "SELECT * FROM `test-data-prj.secret_ds.secret_tbl` "
            "WHERE country = 'CZ'"
        ),
    }

    response = client.post("/api/query", json=payload, headers=headers)

    assert response.status_code == 403, response.json()
    detail = response.json().get("detail", {})
    assert isinstance(detail, dict), detail
    assert detail.get("reason") == "bq_path_not_registered", detail
    # The rejected path is echoed back so the caller can see what failed.
    reported_path = detail.get("path", "")
    assert "secret_ds" in reported_path, detail
    assert "secret_tbl" in reported_path, detail
def test_full_backtick_path_cross_project_denied(seeded_app, mock_dry_run):
    """A full backtick path naming a project other than the configured
    data project must be rejected with HTTP 403 and reason
    `bq_path_cross_project`, even though its dataset and table match a
    registered (bucket, source_table) pair."""
    _register_bq_remote_row("ue", "finance", "ue")

    client = seeded_app["client"]
    headers = _auth(seeded_app["admin_token"])
    payload = {
        "sql": "SELECT * FROM `other-project.finance.ue` WHERE id = 1",
    }

    response = client.post("/api/query", json=payload, headers=headers)

    assert response.status_code == 403, response.json()
    detail = response.json().get("detail", {})
    assert isinstance(detail, dict), detail
    assert detail.get("reason") == "bq_path_cross_project", detail
    assert detail.get("expected_project") == "test-data-prj", detail
    assert "other-project" in detail.get("path", ""), detail
def test_full_backtick_path_registered_admin_passes(
    seeded_app, mock_dry_run, monkeypatch,
):
    """Admin caller, registered path, matching project: none of the
    `bq_path_*` rejections may appear, and the dry-run must receive SQL
    that still contains the user's backtick path and WHERE clause."""
    _register_bq_remote_row("ue", "finance", "ue")

    seen = {"sql": None}

    def record_dry_run(_bq, sql):
        seen["sql"] = sql
        return 1024  # small enough to pass the cap

    monkeypatch.setattr(
        "app.api.query._bq_dry_run_bytes", record_dry_run, raising=False,
    )

    client = seeded_app["client"]
    headers = _auth(seeded_app["admin_token"])
    payload = {
        "sql": "SELECT * FROM `test-data-prj.finance.ue` WHERE id = 1",
    }
    response = client.post("/api/query", json=payload, headers=headers)

    # A 403 is tolerated only if it is not one of the issue-#201 reasons.
    if response.status_code == 403:
        detail = response.json().get("detail", {})
        if isinstance(detail, dict):
            rbac_reasons = (
                "bq_path_not_registered",
                "bq_path_access_denied",
                "bq_path_cross_project",
            )
            assert detail.get("reason") not in rbac_reasons, (
                f"admin + registered path should pass RBAC: {detail}"
            )

    # A dry-run call means the path reached the cap-guard. The SQL it
    # received must still carry the user's path and filter.
    forwarded = seen["sql"]
    assert forwarded is not None, (
        "dry-run never fired — Pass 3 may not have registered the path"
    )
    assert "`test-data-prj.finance.ue`" in forwarded, forwarded
    assert "WHERE id = 1" in forwarded, forwarded
def test_full_backtick_path_inside_string_literal_not_gated(
    seeded_app, mock_dry_run,
):
    """Defensive case: a backtick path that sits inside a SQL string
    literal.

    The gate is documented as applying to ALL backtick triples, since
    the regex cannot tell a literal from an identifier. Being gated on a
    string literal (a false positive) is preferred over missing a real
    RBAC bypass (a false negative). Users who need such a literal can
    write a single-quoted string without backticks.
    """
    # Nothing is registered: the path inside the literal is unknown to
    # the registry, so if the gate fires it must be with one of the
    # `bq_path_*` rejection reasons checked below.
    client = seeded_app["client"]
    headers = _auth(seeded_app["admin_token"])
    payload = {
        "sql": (
            "SELECT 'matches `test-data-prj.x.y`' AS lit"
        ),
    }

    response = client.post("/api/query", json=payload, headers=headers)

    # Any status is tolerated here; only the reason on a 403 is checked.
    if response.status_code == 403:
        detail = response.json().get("detail", {})
        if isinstance(detail, dict):
            gate_reasons = (
                "bq_path_not_registered",
                "bq_path_cross_project",
            )
            assert detail.get("reason") in gate_reasons, detail

View file

@ -232,6 +232,44 @@ def test_join_bq_to_local_skips_rewrite(seeded_registry, monkeypatch):
assert rewritten == user_sql # untouched
def test_local_name_inside_backtick_path_does_not_trip_cross_source(
    seeded_registry, monkeypatch,
):
    """Devin Review on PR #208 (issue #201 follow-up): a registered
    LOCAL-mode table name that appears only as a segment of a
    user-supplied full backtick BQ path must NOT trip the cross-source
    guard.

    Before the fix the cross-source bare-name regex ran against the
    unmasked SQL, so a path ending in ``orders`` matched the registered
    local ``orders`` and the wrapper bailed out to the ATTACH-catalog
    slow path (50-100× slower). With the backtick-masked copy the check
    sees only BQ references and the wrap proceeds.
    """
    from app.api.query import _rewrite_user_sql_for_bigquery_query

    _register_bq_remote(
        seeded_registry, table_id="bq.fin.ue", name="ue",
        bucket="fin", source_table="ue",
    )
    _register_local(seeded_registry, table_id="kbc.in.orders", name="orders")
    _set_bq_project(monkeypatch, "test-prj")

    query = (
        "SELECT u.id "
        "FROM ue u "
        "JOIN `test-prj.dataset.orders` o ON u.x = o.x "
        "WHERE o.value > 0"
    )
    wrapped_sql, was_rewritten = _rewrite_user_sql_for_bigquery_query(
        query, seeded_registry,
    )

    # Both references are BQ, so the query must be wrapped. The local
    # `orders` registration is irrelevant to it.
    assert was_rewritten is True
    assert "bigquery_query(" in wrapped_sql
    # The user's backtick path is kept inside the wrapped inner SQL.
    assert "test-prj.dataset.orders" in wrapped_sql
def test_no_bq_tables_passes_through(seeded_registry, monkeypatch):
"""User SQL referencing only local-source tables → no rewrite,
no log spam, original SQL returned."""