From 824e3cb636136a45174f98e33a02a1540db16ae2 Mon Sep 17 00:00:00 2001 From: ZdenekSrotyr Date: Wed, 6 May 2026 17:54:07 +0200 Subject: [PATCH] feat(query): registry-gate full backtick BigQuery paths (#201) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds Pass 3 to `_bq_guardrail_inputs` that scans user SQL for full backtick paths `..` and gates them identically to the `bq.""."
"` pass: - Project must match the configured BigQuery data project (`get_bq_access().projects.data`). Mismatch → HTTP 403 `bq_path_cross_project`. - Path must point at a registered row. Unregistered → HTTP 403 `bq_path_not_registered`. - Non-admin caller must hold a grant on the registered row's id. Missing grant → HTTP 403 `bq_path_access_denied`. Pre-fix, full backtick paths bypassed Agnes RBAC entirely — only the service account scope limited reach. Post-fix the boundary matches what `agnes catalog`-driven flows already enforce. Admin still bypasses the per-id grant check but cannot bypass registration or project match. Pass 3 also seeds `dry_run_set` for resolved registered paths so the cost-cap dry-run runs against the same physical table the user named — composing cleanly with the Layer 2 fail-fast fallback. --- app/api/query.py | 60 +++++++++++++ tests/test_api_query_guardrail.py | 138 ++++++++++++++++++++++++++++++ 2 files changed, 198 insertions(+) diff --git a/app/api/query.py b/app/api/query.py index 0da45d4..3a751a8 100644 --- a/app/api/query.py +++ b/app/api/query.py @@ -564,6 +564,66 @@ def _bq_guardrail_inputs( seen_paths.add(key) dry_run.append((bucket, source_table, 0)) + # 3. Full backtick path `..
` pass (issue #201). + # Pre-#201 these bypassed Agnes RBAC entirely — only the configured + # service account scope limited which tables a user could reach. Gate + # them identically to the `bq..` pass: must match the + # configured data project, must point at a registered row, and the + # caller must hold a grant on that row's id (admin bypasses the grant + # check but still requires registration + project match). + # + # Lazy `get_bq_access()` import via the module-level alias so tests + # can monkeypatch a fake. When BQ isn't configured (no data project), + # fall through silently — full backtick paths can't possibly resolve + # against this instance, so leave them to BQ to reject if a query + # somehow makes it through. + try: + bq = get_bq_access() + data_project = (bq.projects.data or "").strip() + except Exception: + data_project = "" + + if data_project: + for m in _BACKTICK_FULL_PATH.finditer(sql): + proj, ds, tbl = m.group(1), m.group(2), m.group(3) + if proj.lower() != data_project.lower(): + return [], [], { + "reason": "bq_path_cross_project", + "path": f"`{proj}.{ds}.{tbl}`", + "expected_project": data_project, + "hint": ( + "--remote queries can only reference tables in the " + "configured BigQuery data project. Register " + "cross-project tables via `agnes admin " + "register-table` if needed." + ), + } + row = repo.find_by_bq_path(ds, tbl) + if row is None: + return [], [], { + "reason": "bq_path_not_registered", + "path": f"`{proj}.{ds}.{tbl}`", + "hint": ( + "Direct BigQuery paths must point to a registered " + "table. Register via `agnes admin register-table` " + "or use the registered name from `agnes catalog`." + ), + } + if not is_admin: + if accessible_set is None or row["id"] not in accessible_set: + return [], [], { + "reason": "bq_path_access_denied", + "path": f"`{proj}.{ds}.{tbl}`", + "registered_as": row["name"], + } + bucket = row["bucket"] + source_table = row["source_table"] + if bucket and source_table: + key = (bucket.lower(), source_table.lower()) + if key not in seen_paths: + seen_paths.add(key) + dry_run.append((bucket, source_table, 0)) + return dry_run, name_lookups, None diff --git a/tests/test_api_query_guardrail.py b/tests/test_api_query_guardrail.py index 05f8143..67e09dc 100644 --- a/tests/test_api_query_guardrail.py +++ b/tests/test_api_query_guardrail.py @@ -586,3 +586,141 @@ def test_guardrail_skips_bare_name_match_inside_backticks( assert "`test-data-prj.finance.`test-data-prj" not in sent, ( f"nested-backtick corruption signature present: {sent!r}" ) + + +# --------------------------------------------------------------------------- +# Issue #201 Layer 3: full backtick BigQuery paths are registry-gated. +# Pre-fix these bypassed Agnes RBAC entirely — only the configured service +# account scope limited which tables a user could reach. Post-fix, they're +# treated identically to `bq.""."
"` syntax. +# --------------------------------------------------------------------------- + + +def test_full_backtick_path_unregistered_denied(seeded_app, mock_dry_run): + """Full backtick path to an unregistered `.
` (project + matches the configured data project) → HTTP 403 with + `bq_path_not_registered`.""" + c = seeded_app["client"] + token = seeded_app["admin_token"] + r = c.post( + "/api/query", + json={ + "sql": ( + "SELECT * FROM `test-data-prj.secret_ds.secret_tbl` " + "WHERE country = 'CZ'" + ), + }, + headers=_auth(token), + ) + assert r.status_code == 403, r.json() + detail = r.json().get("detail", {}) + assert isinstance(detail, dict), detail + assert detail.get("reason") == "bq_path_not_registered", detail + assert "secret_ds" in detail.get("path", ""), detail + assert "secret_tbl" in detail.get("path", ""), detail + + +def test_full_backtick_path_cross_project_denied(seeded_app, mock_dry_run): + """Full backtick path with project ≠ configured data project → HTTP + 403 with `bq_path_cross_project`. Even if the path happens to point + at a registered (bucket, source_table), the project mismatch is the + primary boundary.""" + _register_bq_remote_row("ue", "finance", "ue") + c = seeded_app["client"] + token = seeded_app["admin_token"] + r = c.post( + "/api/query", + json={ + "sql": "SELECT * FROM `other-project.finance.ue` WHERE id = 1", + }, + headers=_auth(token), + ) + assert r.status_code == 403, r.json() + detail = r.json().get("detail", {}) + assert isinstance(detail, dict), detail + assert detail.get("reason") == "bq_path_cross_project", detail + assert detail.get("expected_project") == "test-data-prj", detail + assert "other-project" in detail.get("path", ""), detail + + +def test_full_backtick_path_registered_admin_passes( + seeded_app, mock_dry_run, monkeypatch, +): + """Admin caller + registered path + matching project → no RBAC + rejection. The dry-run fires (we can capture the SQL the guardrail + forwards) and no `bq_path_*` reason appears in any error response.""" + _register_bq_remote_row("ue", "finance", "ue") + + captured = {"sql": None} + + def capturing_fake(_bq, sql): + captured["sql"] = sql + return 1024 # tiny — pass cap + + monkeypatch.setattr( + "app.api.query._bq_dry_run_bytes", capturing_fake, raising=False, + ) + + c = seeded_app["client"] + token = seeded_app["admin_token"] + r = c.post( + "/api/query", + json={ + "sql": "SELECT * FROM `test-data-prj.finance.ue` WHERE id = 1", + }, + headers=_auth(token), + ) + # If 403, must NOT be the issue-#201 bq_path_* reasons. + if r.status_code == 403: + detail = r.json().get("detail", {}) + if isinstance(detail, dict): + assert detail.get("reason") not in ( + "bq_path_not_registered", + "bq_path_access_denied", + "bq_path_cross_project", + ), f"admin + registered path should pass RBAC: {detail}" + # The dry-run was invoked, meaning Pass 3 added the path to dry_run_set + # and the cap-guard fired. The user's WHERE clause must still be in + # the dry-run SQL (validates Layer 1 — backtick-aware rewrite). + assert captured["sql"] is not None, ( + "dry-run never fired — Pass 3 may not have registered the path" + ) + assert "`test-data-prj.finance.ue`" in captured["sql"], captured["sql"] + assert "WHERE id = 1" in captured["sql"], captured["sql"] + + +def test_full_backtick_path_inside_string_literal_not_gated( + seeded_app, mock_dry_run, +): + """Defensive case: a backtick path appearing inside a SQL string + literal (rare but possible) should not trigger Pass 3. Practically + this is unreachable because backticks aren't typically valid inside + BQ string literals — but the regex doesn't know that. We document + that the gate applies to ALL backtick triples to be safe; users who + really need a literal can use single-quoted strings without + backticks.""" + # No registration; the test confirms an unregistered path inside + # what looks like a string is still gated. This is the conservative + # boundary — false-positive on string literal beats false-negative + # on a real RBAC bypass. + c = seeded_app["client"] + token = seeded_app["admin_token"] + r = c.post( + "/api/query", + json={ + "sql": ( + "SELECT 'matches `test-data-prj.x.y`' AS lit" + ), + }, + headers=_auth(token), + ) + # Either gated (403) or 200 if the analytics DB happens to evaluate + # the literal — both are acceptable. The point is no silent RBAC + # bypass: if the response is 200, no BQ table was reached. + if r.status_code == 403: + detail = r.json().get("detail", {}) + if isinstance(detail, dict): + assert detail.get("reason") in ( + "bq_path_not_registered", + "bq_path_cross_project", + ), detail