feat(query): registry-gate full backtick BigQuery paths (#201)

Adds Pass 3 to `_bq_guardrail_inputs` that scans user SQL for full
backtick paths `<project>.<dataset>.<table>` and gates them
identically to the `bq."<dataset>"."<table>"` pass:

- Project must match the configured BigQuery data project
  (`get_bq_access().projects.data`). Mismatch → HTTP 403
  `bq_path_cross_project`.
- Path must point at a registered row. Unregistered → HTTP 403
  `bq_path_not_registered`.
- Non-admin caller must hold a grant on the registered row's id.
  Missing grant → HTTP 403 `bq_path_access_denied`.

Pre-fix, full backtick paths bypassed Agnes RBAC entirely — only the
service account scope limited reach. Post-fix the boundary matches
what `agnes catalog`-driven flows already enforce. Admin still
bypasses the per-id grant check but cannot bypass registration or
project match.

Pass 3 also seeds `dry_run_set` for resolved registered paths so the
cost-cap dry-run runs against the same physical table the user named
— composing cleanly with the Layer 2 fail-fast fallback.
This commit is contained in:
ZdenekSrotyr 2026-05-06 17:54:07 +02:00
parent c32be3fe96
commit 824e3cb636
2 changed files with 198 additions and 0 deletions

View file

@ -564,6 +564,66 @@ def _bq_guardrail_inputs(
seen_paths.add(key)
dry_run.append((bucket, source_table, 0))
# 3. Full backtick path `<project>.<dataset>.<table>` pass (issue #201).
# Pre-#201 these bypassed Agnes RBAC entirely — only the configured
# service account scope limited which tables a user could reach. Gate
# them identically to the `bq.<ds>.<tbl>` pass: must match the
# configured data project, must point at a registered row, and the
# caller must hold a grant on that row's id (admin bypasses the grant
# check but still requires registration + project match).
#
# Lazy `get_bq_access()` import via the module-level alias so tests
# can monkeypatch a fake. When BQ isn't configured (no data project),
# fall through silently — full backtick paths can't possibly resolve
# against this instance, so leave them to BQ to reject if a query
# somehow makes it through.
try:
bq = get_bq_access()
data_project = (bq.projects.data or "").strip()
except Exception:
data_project = ""
if data_project:
for m in _BACKTICK_FULL_PATH.finditer(sql):
proj, ds, tbl = m.group(1), m.group(2), m.group(3)
if proj.lower() != data_project.lower():
return [], [], {
"reason": "bq_path_cross_project",
"path": f"`{proj}.{ds}.{tbl}`",
"expected_project": data_project,
"hint": (
"--remote queries can only reference tables in the "
"configured BigQuery data project. Register "
"cross-project tables via `agnes admin "
"register-table` if needed."
),
}
row = repo.find_by_bq_path(ds, tbl)
if row is None:
return [], [], {
"reason": "bq_path_not_registered",
"path": f"`{proj}.{ds}.{tbl}`",
"hint": (
"Direct BigQuery paths must point to a registered "
"table. Register via `agnes admin register-table` "
"or use the registered name from `agnes catalog`."
),
}
if not is_admin:
if accessible_set is None or row["id"] not in accessible_set:
return [], [], {
"reason": "bq_path_access_denied",
"path": f"`{proj}.{ds}.{tbl}`",
"registered_as": row["name"],
}
bucket = row["bucket"]
source_table = row["source_table"]
if bucket and source_table:
key = (bucket.lower(), source_table.lower())
if key not in seen_paths:
seen_paths.add(key)
dry_run.append((bucket, source_table, 0))
return dry_run, name_lookups, None

View file

@ -586,3 +586,141 @@ def test_guardrail_skips_bare_name_match_inside_backticks(
assert "`test-data-prj.finance.`test-data-prj" not in sent, (
f"nested-backtick corruption signature present: {sent!r}"
)
# ---------------------------------------------------------------------------
# Issue #201 Layer 3: full backtick BigQuery paths are registry-gated.
# Pre-fix these bypassed Agnes RBAC entirely — only the configured service
# account scope limited which tables a user could reach. Post-fix, they're
# treated identically to `bq."<dataset>"."<table>"` syntax.
# ---------------------------------------------------------------------------
def test_full_backtick_path_unregistered_denied(seeded_app, mock_dry_run):
"""Full backtick path to an unregistered `<dataset>.<table>` (project
matches the configured data project) HTTP 403 with
`bq_path_not_registered`."""
c = seeded_app["client"]
token = seeded_app["admin_token"]
r = c.post(
"/api/query",
json={
"sql": (
"SELECT * FROM `test-data-prj.secret_ds.secret_tbl` "
"WHERE country = 'CZ'"
),
},
headers=_auth(token),
)
assert r.status_code == 403, r.json()
detail = r.json().get("detail", {})
assert isinstance(detail, dict), detail
assert detail.get("reason") == "bq_path_not_registered", detail
assert "secret_ds" in detail.get("path", ""), detail
assert "secret_tbl" in detail.get("path", ""), detail
def test_full_backtick_path_cross_project_denied(seeded_app, mock_dry_run):
"""Full backtick path with project ≠ configured data project → HTTP
403 with `bq_path_cross_project`. Even if the path happens to point
at a registered (bucket, source_table), the project mismatch is the
primary boundary."""
_register_bq_remote_row("ue", "finance", "ue")
c = seeded_app["client"]
token = seeded_app["admin_token"]
r = c.post(
"/api/query",
json={
"sql": "SELECT * FROM `other-project.finance.ue` WHERE id = 1",
},
headers=_auth(token),
)
assert r.status_code == 403, r.json()
detail = r.json().get("detail", {})
assert isinstance(detail, dict), detail
assert detail.get("reason") == "bq_path_cross_project", detail
assert detail.get("expected_project") == "test-data-prj", detail
assert "other-project" in detail.get("path", ""), detail
def test_full_backtick_path_registered_admin_passes(
seeded_app, mock_dry_run, monkeypatch,
):
"""Admin caller + registered path + matching project → no RBAC
rejection. The dry-run fires (we can capture the SQL the guardrail
forwards) and no `bq_path_*` reason appears in any error response."""
_register_bq_remote_row("ue", "finance", "ue")
captured = {"sql": None}
def capturing_fake(_bq, sql):
captured["sql"] = sql
return 1024 # tiny — pass cap
monkeypatch.setattr(
"app.api.query._bq_dry_run_bytes", capturing_fake, raising=False,
)
c = seeded_app["client"]
token = seeded_app["admin_token"]
r = c.post(
"/api/query",
json={
"sql": "SELECT * FROM `test-data-prj.finance.ue` WHERE id = 1",
},
headers=_auth(token),
)
# If 403, must NOT be the issue-#201 bq_path_* reasons.
if r.status_code == 403:
detail = r.json().get("detail", {})
if isinstance(detail, dict):
assert detail.get("reason") not in (
"bq_path_not_registered",
"bq_path_access_denied",
"bq_path_cross_project",
), f"admin + registered path should pass RBAC: {detail}"
# The dry-run was invoked, meaning Pass 3 added the path to dry_run_set
# and the cap-guard fired. The user's WHERE clause must still be in
# the dry-run SQL (validates Layer 1 — backtick-aware rewrite).
assert captured["sql"] is not None, (
"dry-run never fired — Pass 3 may not have registered the path"
)
assert "`test-data-prj.finance.ue`" in captured["sql"], captured["sql"]
assert "WHERE id = 1" in captured["sql"], captured["sql"]
def test_full_backtick_path_inside_string_literal_not_gated(
seeded_app, mock_dry_run,
):
"""Defensive case: a backtick path appearing inside a SQL string
literal (rare but possible) should not trigger Pass 3. Practically
this is unreachable because backticks aren't typically valid inside
BQ string literals but the regex doesn't know that. We document
that the gate applies to ALL backtick triples to be safe; users who
really need a literal can use single-quoted strings without
backticks."""
# No registration; the test confirms an unregistered path inside
# what looks like a string is still gated. This is the conservative
# boundary — false-positive on string literal beats false-negative
# on a real RBAC bypass.
c = seeded_app["client"]
token = seeded_app["admin_token"]
r = c.post(
"/api/query",
json={
"sql": (
"SELECT 'matches `test-data-prj.x.y`' AS lit"
),
},
headers=_auth(token),
)
# Either gated (403) or 200 if the analytics DB happens to evaluate
# the literal — both are acceptable. The point is no silent RBAC
# bypass: if the response is 200, no BQ table was reached.
if r.status_code == 403:
detail = r.json().get("detail", {})
if isinstance(detail, dict):
assert detail.get("reason") in (
"bq_path_not_registered",
"bq_path_cross_project",
), detail