feat(query): registry-gate full backtick BigQuery paths (#201)
Adds Pass 3 to `_bq_guardrail_inputs` that scans user SQL for full backtick paths `<project>.<dataset>.<table>` and gates them identically to the `bq."<dataset>"."<table>"` pass: - Project must match the configured BigQuery data project (`get_bq_access().projects.data`). Mismatch → HTTP 403 `bq_path_cross_project`. - Path must point at a registered row. Unregistered → HTTP 403 `bq_path_not_registered`. - Non-admin caller must hold a grant on the registered row's id. Missing grant → HTTP 403 `bq_path_access_denied`. Pre-fix, full backtick paths bypassed Agnes RBAC entirely — only the service account scope limited reach. Post-fix the boundary matches what `agnes catalog`-driven flows already enforce. Admin still bypasses the per-id grant check but cannot bypass registration or project match. Pass 3 also seeds `dry_run_set` for resolved registered paths so the cost-cap dry-run runs against the same physical table the user named — composing cleanly with the Layer 2 fail-fast fallback.
This commit is contained in:
parent
c32be3fe96
commit
824e3cb636
2 changed files with 198 additions and 0 deletions
|
|
@ -564,6 +564,66 @@ def _bq_guardrail_inputs(
|
|||
seen_paths.add(key)
|
||||
dry_run.append((bucket, source_table, 0))
|
||||
|
||||
# 3. Full backtick path `<project>.<dataset>.<table>` pass (issue #201).
|
||||
# Pre-#201 these bypassed Agnes RBAC entirely — only the configured
|
||||
# service account scope limited which tables a user could reach. Gate
|
||||
# them identically to the `bq.<ds>.<tbl>` pass: must match the
|
||||
# configured data project, must point at a registered row, and the
|
||||
# caller must hold a grant on that row's id (admin bypasses the grant
|
||||
# check but still requires registration + project match).
|
||||
#
|
||||
# Lazy `get_bq_access()` import via the module-level alias so tests
|
||||
# can monkeypatch a fake. When BQ isn't configured (no data project),
|
||||
# fall through silently — full backtick paths can't possibly resolve
|
||||
# against this instance, so leave them to BQ to reject if a query
|
||||
# somehow makes it through.
|
||||
try:
|
||||
bq = get_bq_access()
|
||||
data_project = (bq.projects.data or "").strip()
|
||||
except Exception:
|
||||
data_project = ""
|
||||
|
||||
if data_project:
|
||||
for m in _BACKTICK_FULL_PATH.finditer(sql):
|
||||
proj, ds, tbl = m.group(1), m.group(2), m.group(3)
|
||||
if proj.lower() != data_project.lower():
|
||||
return [], [], {
|
||||
"reason": "bq_path_cross_project",
|
||||
"path": f"`{proj}.{ds}.{tbl}`",
|
||||
"expected_project": data_project,
|
||||
"hint": (
|
||||
"--remote queries can only reference tables in the "
|
||||
"configured BigQuery data project. Register "
|
||||
"cross-project tables via `agnes admin "
|
||||
"register-table` if needed."
|
||||
),
|
||||
}
|
||||
row = repo.find_by_bq_path(ds, tbl)
|
||||
if row is None:
|
||||
return [], [], {
|
||||
"reason": "bq_path_not_registered",
|
||||
"path": f"`{proj}.{ds}.{tbl}`",
|
||||
"hint": (
|
||||
"Direct BigQuery paths must point to a registered "
|
||||
"table. Register via `agnes admin register-table` "
|
||||
"or use the registered name from `agnes catalog`."
|
||||
),
|
||||
}
|
||||
if not is_admin:
|
||||
if accessible_set is None or row["id"] not in accessible_set:
|
||||
return [], [], {
|
||||
"reason": "bq_path_access_denied",
|
||||
"path": f"`{proj}.{ds}.{tbl}`",
|
||||
"registered_as": row["name"],
|
||||
}
|
||||
bucket = row["bucket"]
|
||||
source_table = row["source_table"]
|
||||
if bucket and source_table:
|
||||
key = (bucket.lower(), source_table.lower())
|
||||
if key not in seen_paths:
|
||||
seen_paths.add(key)
|
||||
dry_run.append((bucket, source_table, 0))
|
||||
|
||||
return dry_run, name_lookups, None
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -586,3 +586,141 @@ def test_guardrail_skips_bare_name_match_inside_backticks(
|
|||
assert "`test-data-prj.finance.`test-data-prj" not in sent, (
|
||||
f"nested-backtick corruption signature present: {sent!r}"
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Issue #201 Layer 3: full backtick BigQuery paths are registry-gated.
|
||||
# Pre-fix these bypassed Agnes RBAC entirely — only the configured service
|
||||
# account scope limited which tables a user could reach. Post-fix, they're
|
||||
# treated identically to `bq."<dataset>"."<table>"` syntax.
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_full_backtick_path_unregistered_denied(seeded_app, mock_dry_run):
|
||||
"""Full backtick path to an unregistered `<dataset>.<table>` (project
|
||||
matches the configured data project) → HTTP 403 with
|
||||
`bq_path_not_registered`."""
|
||||
c = seeded_app["client"]
|
||||
token = seeded_app["admin_token"]
|
||||
r = c.post(
|
||||
"/api/query",
|
||||
json={
|
||||
"sql": (
|
||||
"SELECT * FROM `test-data-prj.secret_ds.secret_tbl` "
|
||||
"WHERE country = 'CZ'"
|
||||
),
|
||||
},
|
||||
headers=_auth(token),
|
||||
)
|
||||
assert r.status_code == 403, r.json()
|
||||
detail = r.json().get("detail", {})
|
||||
assert isinstance(detail, dict), detail
|
||||
assert detail.get("reason") == "bq_path_not_registered", detail
|
||||
assert "secret_ds" in detail.get("path", ""), detail
|
||||
assert "secret_tbl" in detail.get("path", ""), detail
|
||||
|
||||
|
||||
def test_full_backtick_path_cross_project_denied(seeded_app, mock_dry_run):
|
||||
"""Full backtick path with project ≠ configured data project → HTTP
|
||||
403 with `bq_path_cross_project`. Even if the path happens to point
|
||||
at a registered (bucket, source_table), the project mismatch is the
|
||||
primary boundary."""
|
||||
_register_bq_remote_row("ue", "finance", "ue")
|
||||
c = seeded_app["client"]
|
||||
token = seeded_app["admin_token"]
|
||||
r = c.post(
|
||||
"/api/query",
|
||||
json={
|
||||
"sql": "SELECT * FROM `other-project.finance.ue` WHERE id = 1",
|
||||
},
|
||||
headers=_auth(token),
|
||||
)
|
||||
assert r.status_code == 403, r.json()
|
||||
detail = r.json().get("detail", {})
|
||||
assert isinstance(detail, dict), detail
|
||||
assert detail.get("reason") == "bq_path_cross_project", detail
|
||||
assert detail.get("expected_project") == "test-data-prj", detail
|
||||
assert "other-project" in detail.get("path", ""), detail
|
||||
|
||||
|
||||
def test_full_backtick_path_registered_admin_passes(
|
||||
seeded_app, mock_dry_run, monkeypatch,
|
||||
):
|
||||
"""Admin caller + registered path + matching project → no RBAC
|
||||
rejection. The dry-run fires (we can capture the SQL the guardrail
|
||||
forwards) and no `bq_path_*` reason appears in any error response."""
|
||||
_register_bq_remote_row("ue", "finance", "ue")
|
||||
|
||||
captured = {"sql": None}
|
||||
|
||||
def capturing_fake(_bq, sql):
|
||||
captured["sql"] = sql
|
||||
return 1024 # tiny — pass cap
|
||||
|
||||
monkeypatch.setattr(
|
||||
"app.api.query._bq_dry_run_bytes", capturing_fake, raising=False,
|
||||
)
|
||||
|
||||
c = seeded_app["client"]
|
||||
token = seeded_app["admin_token"]
|
||||
r = c.post(
|
||||
"/api/query",
|
||||
json={
|
||||
"sql": "SELECT * FROM `test-data-prj.finance.ue` WHERE id = 1",
|
||||
},
|
||||
headers=_auth(token),
|
||||
)
|
||||
# If 403, must NOT be the issue-#201 bq_path_* reasons.
|
||||
if r.status_code == 403:
|
||||
detail = r.json().get("detail", {})
|
||||
if isinstance(detail, dict):
|
||||
assert detail.get("reason") not in (
|
||||
"bq_path_not_registered",
|
||||
"bq_path_access_denied",
|
||||
"bq_path_cross_project",
|
||||
), f"admin + registered path should pass RBAC: {detail}"
|
||||
# The dry-run was invoked, meaning Pass 3 added the path to dry_run_set
|
||||
# and the cap-guard fired. The user's WHERE clause must still be in
|
||||
# the dry-run SQL (validates Layer 1 — backtick-aware rewrite).
|
||||
assert captured["sql"] is not None, (
|
||||
"dry-run never fired — Pass 3 may not have registered the path"
|
||||
)
|
||||
assert "`test-data-prj.finance.ue`" in captured["sql"], captured["sql"]
|
||||
assert "WHERE id = 1" in captured["sql"], captured["sql"]
|
||||
|
||||
|
||||
def test_full_backtick_path_inside_string_literal_not_gated(
|
||||
seeded_app, mock_dry_run,
|
||||
):
|
||||
"""Defensive case: a backtick path appearing inside a SQL string
|
||||
literal (rare but possible) should not trigger Pass 3. Practically
|
||||
this is unreachable because backticks aren't typically valid inside
|
||||
BQ string literals — but the regex doesn't know that. We document
|
||||
that the gate applies to ALL backtick triples to be safe; users who
|
||||
really need a literal can use single-quoted strings without
|
||||
backticks."""
|
||||
# No registration; the test confirms an unregistered path inside
|
||||
# what looks like a string is still gated. This is the conservative
|
||||
# boundary — false-positive on string literal beats false-negative
|
||||
# on a real RBAC bypass.
|
||||
c = seeded_app["client"]
|
||||
token = seeded_app["admin_token"]
|
||||
r = c.post(
|
||||
"/api/query",
|
||||
json={
|
||||
"sql": (
|
||||
"SELECT 'matches `test-data-prj.x.y`' AS lit"
|
||||
),
|
||||
},
|
||||
headers=_auth(token),
|
||||
)
|
||||
# Either gated (403) or 200 if the analytics DB happens to evaluate
|
||||
# the literal — both are acceptable. The point is no silent RBAC
|
||||
# bypass: if the response is 200, no BQ table was reached.
|
||||
if r.status_code == 403:
|
||||
detail = r.json().get("detail", {})
|
||||
if isinstance(detail, dict):
|
||||
assert detail.get("reason") in (
|
||||
"bq_path_not_registered",
|
||||
"bq_path_cross_project",
|
||||
), detail
|
||||
|
|
|
|||
Loading…
Reference in a new issue