agnes-the-ai-analyst/tests/test_api_query_quota.py
ZdenekSrotyr 896c43c7a2 feat(query): #160 cost guardrail + bq.* RBAC + quota integration on /api/query
The headline implementation for issue #160. POST /api/query now gates
direct `bq."<dataset>"."<source_table>"` references behind the registry
and bounds the BQ scan cost behind a configurable cap. Wired through
the same singleton QuotaTracker as /api/v2/scan so daily-byte budgets
are shared across both BQ-touching paths.

Changes in app/api/query.py:

- Add module-level `BQ_PATH` regex matching the 16 syntax variants
  verified empirically (fully-quoted, unquoted, mixed quoting,
  case-insensitive, inside CTE bodies, multi-path, …).
- Add `bigquery_query` to the SQL keyword blocklist. Closes the
  pre-existing function-call backdoor where a user could run an
  arbitrary BQ jobs API call against any reachable dataset, bypassing
  the registry and RBAC. Wrap views internal to the BQ extractor still
  use bigquery_query() — but those run via DuckDB view resolution at
  query time, not via user-submitted SQL, so the blocklist doesn't
  break them.
- Add `_bq_guardrail_inputs` helper: walks user SQL twice — once for
  bare-name matches against accessible registered remote-BQ names
  (contributes to dry_run_set), once for direct `bq.X.Y` matches
  (gated against `find_by_bq_path` lookups, returns 403 with
  structured detail on miss or grant violation).
- Add `_enforce_remote_bq_quota_and_cap` helper: pre-flight
  `check_daily_budget` (over-cap → 429), then `with quota.acquire(...)`
  wraps a per-path BQ dry-run, sums bytes, raises 400
  `remote_scan_too_large` when total > cap.
- Cap default 5 GiB; configurable via `api.query.bq_max_scan_bytes`
  in /admin/server-config (next phase wires the UI).
- Post-flight `record_bytes` against the user's daily counter.
- Module-level imports of `_bq_dry_run_bytes`, `_build_quota_tracker`,
  `get_bq_access` so tests can monkeypatch via `app.api.query.<name>`.

Tests:
- All 23 RED tests from the previous commit now pass (regex matrix,
  blocklist with detail-string assertion, RBAC unregistered/admin-bypass,
  guardrail dry-run-called/over-cap-rejected, quota pre-flight 429).
- mock_dry_run fixture stubs both `_bq_dry_run_bytes` and `get_bq_access`
  so guardrail tests don't require a live BQ project.
- Quota test uses `admin1` (the seeded_app fixture's actual user id, not
  `admin`).

Smoke: 887 passed across query/bq/admin/extractor/registry/quota
domains. No regressions.
2026-05-04 10:31:35 +02:00

126 lines
4.4 KiB
Python

"""POST /api/query enforces the same per-user quota as /api/v2/scan.
Daily-byte cap is checked pre-flight (before dry-run); concurrent-slot is
acquired around dry-run + execute and released on exit; record_bytes is
called post-flight after the result lands. The quota tracker is the
process-local singleton in app/api/v2_quota.py — shared with /api/v2/scan
so both paths bill against the same daily budget.
Closes part of #160 §4.3.3.
"""
from __future__ import annotations
import pytest
def _auth(token: str) -> dict:
return {"Authorization": f"Bearer {token}"}
def _register_bq_remote_row(name: str, bucket: str, source_table: str) -> None:
    """Register a remote-mode BigQuery table through TableRegistryRepository.

    The row id is ``bq.<bucket>.<source_table>``. The system DB connection
    opened for the call is closed afterwards, whether or not ``register``
    raises.
    """
    # Imported lazily, inside the helper, exactly as the original did.
    from src.db import get_system_db
    from src.repositories.table_registry import TableRegistryRepository

    connection = get_system_db()
    try:
        registry = TableRegistryRepository(connection)
        row_fields = {
            "id": f"bq.{bucket}.{source_table}",
            "name": name,
            "source_type": "bigquery",
            "bucket": bucket,
            "source_table": source_table,
            "query_mode": "remote",
        }
        registry.register(**row_fields)
    finally:
        connection.close()
@pytest.fixture
def fresh_quota(monkeypatch):
    """Clear the process-local quota singleton and return the quota module.

    ``app.api.v2_quota._quota_singleton`` is patched to ``None`` for the
    duration of the test, so usage recorded by earlier tests does not leak
    into the daily-bytes counter. The fixture value is the ``v2_quota``
    module itself; tests obtain a tracker from it via
    ``_build_quota_tracker()``.
    """
    import app.api.v2_quota as quota_module

    # raising=False: do not fail if the attribute is absent on the module.
    monkeypatch.setattr(quota_module, "_quota_singleton", None, raising=False)
    return quota_module
@pytest.fixture
def mock_dry_run(monkeypatch):
    """Replace ``app.api.query._bq_dry_run_bytes`` with a tunable stub.

    Returns the mutable dict the stub reads from. Assign to its ``"bytes"``
    key to change what later dry-run calls report (1024 initially).
    """
    knobs = {"bytes": 1024}

    def _stub_dry_run_bytes(*_args, **_kwargs):
        # Looked up at call time so a test can retune the value after setup.
        return knobs["bytes"]

    monkeypatch.setattr(
        "app.api.query._bq_dry_run_bytes",
        _stub_dry_run_bytes,
        raising=False,
    )
    return knobs
def test_query_records_bytes_against_shared_quota(seeded_app, fresh_quota, mock_dry_run):
    """A successful BQ-touching query bumps the user's daily-byte counter
    on the SAME singleton tracker that /api/v2/scan uses — so a user who
    has consumed daily budget via /api/v2/scan can't dodge the cap by
    routing through /api/query.

    The response status must be 200 or 400. The byte-counter assertion is
    only made on 200: this module's docstring describes ``record_bytes`` as
    post-flight, so a failed execution is not required to have recorded
    anything.
    """
    _register_bq_remote_row("ue", "finance", "ue")
    mock_dry_run["bytes"] = 4096  # 4 KiB

    c = seeded_app["client"]
    token = seeded_app["admin_token"]

    # Baseline usage for this user on the tracker built from the reset module.
    tracker = fresh_quota._build_quota_tracker()
    user_id = "admin1"  # seeded_app's admin user id
    before = tracker.bytes_used_today(user_id)

    r = c.post(
        "/api/query",
        json={"sql": "SELECT count(*) FROM ue"},
        headers=_auth(token),
    )

    # No real BQ backs this test, so the query itself may fail with 400.
    # Any other status (auth/RBAC/quota rejection, server error) means the
    # request never exercised the path under test; previously that case
    # passed silently because the only assertion was behind the 200 check.
    assert r.status_code in (200, 400), \
        f"unexpected status {r.status_code}: {r.text}"

    after = tracker.bytes_used_today(user_id)
    if r.status_code == 200:
        assert after - before >= 4096, \
            f"successful BQ-touching query must record bytes; before={before} after={after}"
def test_query_pre_flight_rejects_user_over_daily_cap(seeded_app, fresh_quota, mock_dry_run):
    """An over-budget user is turned away by /api/query with 429.

    The user's daily counter on the shared tracker is pushed one byte past
    the tracker's cap before the request is sent, so over-cap users get no
    BQ work through this endpoint. Only the 429 status is asserted here;
    the test does not itself observe whether the dry-run stub was invoked.
    """
    _register_bq_remote_row("ue", "finance", "ue")

    # Plant usage just beyond the tracker's configured daily maximum
    # (the original comment notes a 50 GiB default).
    quota_tracker = fresh_quota._build_quota_tracker()
    over_cap_bytes = quota_tracker._max_daily_bytes + 1
    quota_tracker.record_bytes("admin1", over_cap_bytes)

    client = seeded_app["client"]
    admin_token = seeded_app["admin_token"]
    payload = {"sql": "SELECT count(*) FROM ue"}
    response = client.post("/api/query", json=payload, headers=_auth(admin_token))

    assert response.status_code == 429, response.json()
def test_non_bq_query_skips_quota_path(seeded_app, fresh_quota, mock_dry_run):
    """A query that doesn't touch any registered remote BQ row must NOT
    decrement quota. Quota wiring runs only when dry_run_set is non-empty.

    The request is also required to succeed: a request rejected before the
    handler runs would leave the counter untouched as well, and would make
    the counter comparison pass without exercising the non-BQ path.
    """
    tracker = fresh_quota._build_quota_tracker()
    user_id = "admin1"
    before = tracker.bytes_used_today(user_id)

    c = seeded_app["client"]
    token = seeded_app["admin_token"]
    r = c.post(
        "/api/query",
        json={"sql": "SELECT 1 AS x"},
        headers=_auth(token),
    )

    # NOTE(review): assumes a plain `SELECT 1` returns 200 for the seeded
    # admin — confirm against the endpoint's other tests.
    assert r.status_code == 200, \
        f"non-BQ query should succeed; got {r.status_code}: {r.text}"

    after = tracker.bytes_used_today(user_id)
    assert after == before, \
        f"non-BQ query must not record bytes; before={before} after={after}"