feat(query): registry-gate full backtick BigQuery paths (#201)
Adds Pass 3 to `_bq_guardrail_inputs` that scans user SQL for full backtick paths `<project>.<dataset>.<table>` and gates them identically to the `bq."<dataset>"."<table>"` pass: - Project must match the configured BigQuery data project (`get_bq_access().projects.data`). Mismatch → HTTP 403 `bq_path_cross_project`. - Path must point at a registered row. Unregistered → HTTP 403 `bq_path_not_registered`. - Non-admin caller must hold a grant on the registered row's id. Missing grant → HTTP 403 `bq_path_access_denied`. Pre-fix, full backtick paths bypassed Agnes RBAC entirely — only the service account scope limited reach. Post-fix the boundary matches what `agnes catalog`-driven flows already enforce. Admin still bypasses the per-id grant check but cannot bypass registration or project match. Pass 3 also seeds `dry_run_set` for resolved registered paths so the cost-cap dry-run runs against the same physical table the user named — composing cleanly with the Layer 2 fail-fast fallback.
This commit is contained in:
parent
c32be3fe96
commit
824e3cb636
2 changed files with 198 additions and 0 deletions
|
|
@ -564,6 +564,66 @@ def _bq_guardrail_inputs(
|
||||||
seen_paths.add(key)
|
seen_paths.add(key)
|
||||||
dry_run.append((bucket, source_table, 0))
|
dry_run.append((bucket, source_table, 0))
|
||||||
|
|
||||||
|
# 3. Full backtick path `<project>.<dataset>.<table>` pass (issue #201).
|
||||||
|
# Pre-#201 these bypassed Agnes RBAC entirely — only the configured
|
||||||
|
# service account scope limited which tables a user could reach. Gate
|
||||||
|
# them identically to the `bq.<ds>.<tbl>` pass: must match the
|
||||||
|
# configured data project, must point at a registered row, and the
|
||||||
|
# caller must hold a grant on that row's id (admin bypasses the grant
|
||||||
|
# check but still requires registration + project match).
|
||||||
|
#
|
||||||
|
# Lazy `get_bq_access()` import via the module-level alias so tests
|
||||||
|
# can monkeypatch a fake. When BQ isn't configured (no data project),
|
||||||
|
# fall through silently — full backtick paths can't possibly resolve
|
||||||
|
# against this instance, so leave them to BQ to reject if a query
|
||||||
|
# somehow makes it through.
|
||||||
|
try:
|
||||||
|
bq = get_bq_access()
|
||||||
|
data_project = (bq.projects.data or "").strip()
|
||||||
|
except Exception:
|
||||||
|
data_project = ""
|
||||||
|
|
||||||
|
if data_project:
|
||||||
|
for m in _BACKTICK_FULL_PATH.finditer(sql):
|
||||||
|
proj, ds, tbl = m.group(1), m.group(2), m.group(3)
|
||||||
|
if proj.lower() != data_project.lower():
|
||||||
|
return [], [], {
|
||||||
|
"reason": "bq_path_cross_project",
|
||||||
|
"path": f"`{proj}.{ds}.{tbl}`",
|
||||||
|
"expected_project": data_project,
|
||||||
|
"hint": (
|
||||||
|
"--remote queries can only reference tables in the "
|
||||||
|
"configured BigQuery data project. Register "
|
||||||
|
"cross-project tables via `agnes admin "
|
||||||
|
"register-table` if needed."
|
||||||
|
),
|
||||||
|
}
|
||||||
|
row = repo.find_by_bq_path(ds, tbl)
|
||||||
|
if row is None:
|
||||||
|
return [], [], {
|
||||||
|
"reason": "bq_path_not_registered",
|
||||||
|
"path": f"`{proj}.{ds}.{tbl}`",
|
||||||
|
"hint": (
|
||||||
|
"Direct BigQuery paths must point to a registered "
|
||||||
|
"table. Register via `agnes admin register-table` "
|
||||||
|
"or use the registered name from `agnes catalog`."
|
||||||
|
),
|
||||||
|
}
|
||||||
|
if not is_admin:
|
||||||
|
if accessible_set is None or row["id"] not in accessible_set:
|
||||||
|
return [], [], {
|
||||||
|
"reason": "bq_path_access_denied",
|
||||||
|
"path": f"`{proj}.{ds}.{tbl}`",
|
||||||
|
"registered_as": row["name"],
|
||||||
|
}
|
||||||
|
bucket = row["bucket"]
|
||||||
|
source_table = row["source_table"]
|
||||||
|
if bucket and source_table:
|
||||||
|
key = (bucket.lower(), source_table.lower())
|
||||||
|
if key not in seen_paths:
|
||||||
|
seen_paths.add(key)
|
||||||
|
dry_run.append((bucket, source_table, 0))
|
||||||
|
|
||||||
return dry_run, name_lookups, None
|
return dry_run, name_lookups, None
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -586,3 +586,141 @@ def test_guardrail_skips_bare_name_match_inside_backticks(
|
||||||
assert "`test-data-prj.finance.`test-data-prj" not in sent, (
|
assert "`test-data-prj.finance.`test-data-prj" not in sent, (
|
||||||
f"nested-backtick corruption signature present: {sent!r}"
|
f"nested-backtick corruption signature present: {sent!r}"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Issue #201 Layer 3: full backtick BigQuery paths are registry-gated.
|
||||||
|
# Pre-fix these bypassed Agnes RBAC entirely — only the configured service
|
||||||
|
# account scope limited which tables a user could reach. Post-fix, they're
|
||||||
|
# treated identically to `bq."<dataset>"."<table>"` syntax.
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def test_full_backtick_path_unregistered_denied(seeded_app, mock_dry_run):
|
||||||
|
"""Full backtick path to an unregistered `<dataset>.<table>` (project
|
||||||
|
matches the configured data project) → HTTP 403 with
|
||||||
|
`bq_path_not_registered`."""
|
||||||
|
c = seeded_app["client"]
|
||||||
|
token = seeded_app["admin_token"]
|
||||||
|
r = c.post(
|
||||||
|
"/api/query",
|
||||||
|
json={
|
||||||
|
"sql": (
|
||||||
|
"SELECT * FROM `test-data-prj.secret_ds.secret_tbl` "
|
||||||
|
"WHERE country = 'CZ'"
|
||||||
|
),
|
||||||
|
},
|
||||||
|
headers=_auth(token),
|
||||||
|
)
|
||||||
|
assert r.status_code == 403, r.json()
|
||||||
|
detail = r.json().get("detail", {})
|
||||||
|
assert isinstance(detail, dict), detail
|
||||||
|
assert detail.get("reason") == "bq_path_not_registered", detail
|
||||||
|
assert "secret_ds" in detail.get("path", ""), detail
|
||||||
|
assert "secret_tbl" in detail.get("path", ""), detail
|
||||||
|
|
||||||
|
|
||||||
|
def test_full_backtick_path_cross_project_denied(seeded_app, mock_dry_run):
|
||||||
|
"""Full backtick path with project ≠ configured data project → HTTP
|
||||||
|
403 with `bq_path_cross_project`. Even if the path happens to point
|
||||||
|
at a registered (bucket, source_table), the project mismatch is the
|
||||||
|
primary boundary."""
|
||||||
|
_register_bq_remote_row("ue", "finance", "ue")
|
||||||
|
c = seeded_app["client"]
|
||||||
|
token = seeded_app["admin_token"]
|
||||||
|
r = c.post(
|
||||||
|
"/api/query",
|
||||||
|
json={
|
||||||
|
"sql": "SELECT * FROM `other-project.finance.ue` WHERE id = 1",
|
||||||
|
},
|
||||||
|
headers=_auth(token),
|
||||||
|
)
|
||||||
|
assert r.status_code == 403, r.json()
|
||||||
|
detail = r.json().get("detail", {})
|
||||||
|
assert isinstance(detail, dict), detail
|
||||||
|
assert detail.get("reason") == "bq_path_cross_project", detail
|
||||||
|
assert detail.get("expected_project") == "test-data-prj", detail
|
||||||
|
assert "other-project" in detail.get("path", ""), detail
|
||||||
|
|
||||||
|
|
||||||
|
def test_full_backtick_path_registered_admin_passes(
|
||||||
|
seeded_app, mock_dry_run, monkeypatch,
|
||||||
|
):
|
||||||
|
"""Admin caller + registered path + matching project → no RBAC
|
||||||
|
rejection. The dry-run fires (we can capture the SQL the guardrail
|
||||||
|
forwards) and no `bq_path_*` reason appears in any error response."""
|
||||||
|
_register_bq_remote_row("ue", "finance", "ue")
|
||||||
|
|
||||||
|
captured = {"sql": None}
|
||||||
|
|
||||||
|
def capturing_fake(_bq, sql):
|
||||||
|
captured["sql"] = sql
|
||||||
|
return 1024 # tiny — pass cap
|
||||||
|
|
||||||
|
monkeypatch.setattr(
|
||||||
|
"app.api.query._bq_dry_run_bytes", capturing_fake, raising=False,
|
||||||
|
)
|
||||||
|
|
||||||
|
c = seeded_app["client"]
|
||||||
|
token = seeded_app["admin_token"]
|
||||||
|
r = c.post(
|
||||||
|
"/api/query",
|
||||||
|
json={
|
||||||
|
"sql": "SELECT * FROM `test-data-prj.finance.ue` WHERE id = 1",
|
||||||
|
},
|
||||||
|
headers=_auth(token),
|
||||||
|
)
|
||||||
|
# If 403, must NOT be the issue-#201 bq_path_* reasons.
|
||||||
|
if r.status_code == 403:
|
||||||
|
detail = r.json().get("detail", {})
|
||||||
|
if isinstance(detail, dict):
|
||||||
|
assert detail.get("reason") not in (
|
||||||
|
"bq_path_not_registered",
|
||||||
|
"bq_path_access_denied",
|
||||||
|
"bq_path_cross_project",
|
||||||
|
), f"admin + registered path should pass RBAC: {detail}"
|
||||||
|
# The dry-run was invoked, meaning Pass 3 added the path to dry_run_set
|
||||||
|
# and the cap-guard fired. The user's WHERE clause must still be in
|
||||||
|
# the dry-run SQL (validates Layer 1 — backtick-aware rewrite).
|
||||||
|
assert captured["sql"] is not None, (
|
||||||
|
"dry-run never fired — Pass 3 may not have registered the path"
|
||||||
|
)
|
||||||
|
assert "`test-data-prj.finance.ue`" in captured["sql"], captured["sql"]
|
||||||
|
assert "WHERE id = 1" in captured["sql"], captured["sql"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_full_backtick_path_inside_string_literal_not_gated(
|
||||||
|
seeded_app, mock_dry_run,
|
||||||
|
):
|
||||||
|
"""Defensive case: a backtick path appearing inside a SQL string
|
||||||
|
literal (rare but possible) should not trigger Pass 3. Practically
|
||||||
|
this is unreachable because backticks aren't typically valid inside
|
||||||
|
BQ string literals — but the regex doesn't know that. We document
|
||||||
|
that the gate applies to ALL backtick triples to be safe; users who
|
||||||
|
really need a literal can use single-quoted strings without
|
||||||
|
backticks."""
|
||||||
|
# No registration; the test confirms an unregistered path inside
|
||||||
|
# what looks like a string is still gated. This is the conservative
|
||||||
|
# boundary — false-positive on string literal beats false-negative
|
||||||
|
# on a real RBAC bypass.
|
||||||
|
c = seeded_app["client"]
|
||||||
|
token = seeded_app["admin_token"]
|
||||||
|
r = c.post(
|
||||||
|
"/api/query",
|
||||||
|
json={
|
||||||
|
"sql": (
|
||||||
|
"SELECT 'matches `test-data-prj.x.y`' AS lit"
|
||||||
|
),
|
||||||
|
},
|
||||||
|
headers=_auth(token),
|
||||||
|
)
|
||||||
|
# Either gated (403) or 200 if the analytics DB happens to evaluate
|
||||||
|
# the literal — both are acceptable. The point is no silent RBAC
|
||||||
|
# bypass: if the response is 200, no BQ table was reached.
|
||||||
|
if r.status_code == 403:
|
||||||
|
detail = r.json().get("detail", {})
|
||||||
|
if isinstance(detail, dict):
|
||||||
|
assert detail.get("reason") in (
|
||||||
|
"bq_path_not_registered",
|
||||||
|
"bq_path_cross_project",
|
||||||
|
), detail
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue