"""POST /api/query cost guardrail for query_mode='remote' BigQuery rows. When user SQL references a registered remote-BQ name (or a direct `bq."".""` path), run a BQ dry-run before execute. If the estimated scan exceeds the configured cap, reject with 400 + `remote_scan_too_large` so the operator pivots to `da fetch`. Default cap: 5 GiB per request. Configurable via `api.query.bq_max_scan_bytes` in /admin/server-config (#160 ยง4.4). """ from __future__ import annotations import pytest def _auth(token: str) -> dict: return {"Authorization": f"Bearer {token}"} def _register_bq_remote_row(name: str, bucket: str, source_table: str) -> None: from src.db import get_system_db from src.repositories.table_registry import TableRegistryRepository sys_conn = get_system_db() try: TableRegistryRepository(sys_conn).register( id=f"bq.{bucket}.{source_table}", name=name, source_type="bigquery", bucket=bucket, source_table=source_table, query_mode="remote", ) finally: sys_conn.close() @pytest.fixture def mock_dry_run(monkeypatch): """Replace `_bq_dry_run_bytes` with a controllable stub. Each test sets `mock_dry_run.bytes_to_return` to control what /api/query sees.""" state = {"bytes": 0} def fake(*args, **kwargs): return state["bytes"] monkeypatch.setattr("app.api.query._bq_dry_run_bytes", fake, raising=False) return state def test_query_under_cap_calls_dry_run(seeded_app, mock_dry_run, monkeypatch): """Dry-run is invoked when SQL references a registered remote BQ row. Use a sentinel side-effect to confirm: the mock records call counts.""" _register_bq_remote_row("ue", "finance", "ue") state = mock_dry_run state["bytes"] = 1 * 1024 * 1024 # 1 MiB state["call_count"] = 0 def counting_fake(*args, **kwargs): state["call_count"] += 1 return state["bytes"] monkeypatch.setattr("app.api.query._bq_dry_run_bytes", counting_fake, raising=False) c = seeded_app["client"] token = seeded_app["admin_token"] c.post( "/api/query", json={"sql": "SELECT count(*) FROM ue"}, headers=_auth(token), ) assert state["call_count"] >= 1, \ "guardrail must invoke _bq_dry_run_bytes when SQL references a registered remote BQ row" def test_query_over_cap_rejected_400(seeded_app, mock_dry_run, monkeypatch): """Dry-run reports 10 GiB; default cap (5 GiB) is exceeded โ†’ 400 with structured detail naming bytes + tables + suggestion.""" _register_bq_remote_row("ue", "finance", "ue") mock_dry_run["bytes"] = 10 * 1024 * 1024 * 1024 # 10 GiB c = seeded_app["client"] token = seeded_app["admin_token"] r = c.post( "/api/query", json={"sql": "SELECT * FROM ue"}, headers=_auth(token), ) assert r.status_code == 400, r.json() detail = r.json().get("detail", {}) if isinstance(detail, dict): assert detail.get("reason") == "remote_scan_too_large", detail assert detail.get("scan_bytes") >= 10 * 1024 * 1024 * 1024 assert "da fetch" in detail.get("suggestion", "").lower() or \ "fetch" in detail.get("suggestion", "").lower() assert "ue" in detail.get("tables", []) or \ any("ue" in t for t in detail.get("tables", [])) def test_no_bq_row_reference_skips_dry_run(seeded_app, monkeypatch): """A query that doesn't touch any registered BQ remote row must NOT invoke `_bq_dry_run_bytes` โ€” guardrail incurs zero new latency on plain non-BQ queries.""" state = {"calls": 0} def counting_fake(*args, **kwargs): state["calls"] += 1 return 100 * 1024 * 1024 * 1024 # 100 GiB โ€” irrelevant if not called monkeypatch.setattr("app.api.query._bq_dry_run_bytes", counting_fake, raising=False) c = seeded_app["client"] token = seeded_app["admin_token"] c.post( "/api/query", json={"sql": "SELECT 1 AS x"}, headers=_auth(token), ) assert state["calls"] == 0, \ f"guardrail must skip dry-run on non-BQ queries; got {state['calls']} calls"