agnes-the-ai-analyst/tests/test_table_registry_find_by_bq_path.py
ZdenekSrotyr 91aaeb9194 feat(repo): #160 add find_by_bq_path lookup for direct bq.* RBAC enforcement
The upcoming /api/query RBAC patch (next phase) gates direct
`bq."<dataset>"."<source_table>"` references in user SQL — every such path
must point at a registered query_mode='remote' BigQuery row, otherwise the
caller has stepped around the registry and around RBAC.

Add `TableRegistryRepository.find_by_bq_path(bucket, source_table)` to
support that lookup. Returns None if no row matches, the row dict if
exactly one matches, or the oldest-by-`registered_at` row when 2+ match
(no UNIQUE constraint on `(source_type, bucket, source_table)` — admins
can in principle register a BQ table twice with different ids/names).

Match is case-insensitive on bucket+source_table so user SQL `SELECT FROM
bq.Finance.UE` resolves to a `(finance, ue)` registry row. NULL values in
either column are excluded so a legacy NULL-bucket row never masks a
legitimate non-NULL lookup.

6 RED tests cover: empty registry, non-BQ source rejected, single match,
oldest-of-many tie-breaker, case-insensitive match, NULL-column exclusion.
All initially failed with AttributeError; pass after the ~30 LOC method
addition.
2026-05-04 10:31:35 +02:00

125 lines
4.4 KiB
Python

"""Repository lookup of registry rows by their BigQuery dataset+source_table.
Used by /api/query's RBAC patch to gate direct `bq."<dataset>"."<source_table>"`
references — every such reference must point at a registered row, otherwise
the caller has bypassed the registry and bypassed RBAC.
Closes part of #160.
"""
import time
import duckdb
import pytest
from src.db import _ensure_schema
from src.repositories.table_registry import TableRegistryRepository
@pytest.fixture
def repo(tmp_path):
    """Yield a TableRegistryRepository backed by a fresh on-disk DuckDB file.

    The database file is created under pytest's per-test ``tmp_path`` and the
    schema is applied with ``_ensure_schema`` before the repository is built.
    The connection is closed on teardown (and also when schema setup raises)
    so the open file handle does not outlive the test.
    """
    conn = duckdb.connect(str(tmp_path / "system.duckdb"))
    try:
        _ensure_schema(conn)
        yield TableRegistryRepository(conn)
    finally:
        conn.close()
def test_find_returns_none_when_no_match(repo):
    """With nothing registered, a bucket/table lookup yields None."""
    assert repo.find_by_bq_path("finance", "unit_economics") is None
def test_find_returns_none_when_not_bigquery(repo):
    """Rows of another source type are never returned by the BQ lookup.

    A keboola row is registered and then looked up with exactly its own
    bucket and source_table; the lookup must still come back empty.
    """
    keboola_row = {
        "id": "kbc.in.c-finance.ue",
        "name": "ue_kbc",
        "source_type": "keboola",
        "bucket": "in.c-finance",
        "source_table": "ue",
        "query_mode": "local",
    }
    repo.register(**keboola_row)

    # The path strings are identical to the stored ones; only the source
    # type differs, and that alone must prevent a match.
    found = repo.find_by_bq_path(
        keboola_row["bucket"], keboola_row["source_table"]
    )
    assert found is None
def test_find_returns_single_match(repo):
    """A single matching bigquery row is returned, indexable by column name."""
    repo.register(
        id="bq.finance.unit_economics",
        name="unit_economics",
        source_type="bigquery",
        bucket="finance",
        source_table="unit_economics",
        query_mode="remote",
    )

    match = repo.find_by_bq_path("finance", "unit_economics")
    assert match is not None

    # Spot-check the identifying columns of the returned row.
    expected = {
        "id": "bq.finance.unit_economics",
        "name": "unit_economics",
        "source_type": "bigquery",
    }
    for column, value in expected.items():
        assert match[column] == value
def test_find_oldest_when_multiple_match(repo):
    """Return the oldest row by ``registered_at`` when several rows match.

    There is no unique constraint on (source_type, bucket, source_table), so
    duplicate registrations are possible and the lookup has to pick one
    deterministically.

    The newer row is registered first on purpose: if insertion order
    coincided with timestamp order, an implementation that just returned the
    first stored row (no ordering on ``registered_at``) would pass this test
    without honoring the tie-breaker.
    """
    from datetime import datetime, timezone, timedelta

    base = datetime(2026, 1, 1, tzinfo=timezone.utc)
    repo.register(
        id="bq.finance.ue.v2",
        name="ue_v2",
        source_type="bigquery",
        bucket="finance",
        source_table="ue",
        query_mode="remote",
        registered_at=base + timedelta(days=30),  # newer, inserted first
    )
    repo.register(
        id="bq.finance.ue.v1",
        name="ue_v1",
        source_type="bigquery",
        bucket="finance",
        source_table="ue",
        query_mode="remote",
        registered_at=base,  # older, inserted last
    )
    row = repo.find_by_bq_path("finance", "ue")
    assert row is not None
    assert row["id"] == "bq.finance.ue.v1", \
        f"expected oldest (ue_v1) to win; got {row['id']}"
def test_find_case_insensitive(repo):
    """Lookups match regardless of the casing of bucket and source_table.

    The row is registered in lowercase as (finance, unit_economics); upper,
    mixed and lower case spellings of that path must all find a row.
    """
    repo.register(
        id="bq.finance.unit_economics",
        name="unit_economics",
        source_type="bigquery",
        bucket="finance",
        source_table="unit_economics",
        query_mode="remote",
    )

    # Casings a path could arrive in from user SQL.
    spellings = (
        ("FINANCE", "UNIT_ECONOMICS"),
        ("Finance", "Unit_Economics"),
        ("finance", "unit_economics"),
    )
    for bucket, source_table in spellings:
        assert repo.find_by_bq_path(bucket, source_table) is not None
def test_find_excludes_null_bucket_or_source_table(repo):
    """A registry row with NULL bucket/source_table never matches a lookup.

    Defensive guard: a lookup with concrete values must not be satisfied by a
    row whose path columns are NULL.
    """
    # The row is written with raw SQL instead of register(), because
    # register() defaults source_table to table_name and so cannot
    # produce a NULL there.
    insert_null_path_row = """INSERT INTO table_registry (id, name, source_type, bucket, source_table, query_mode, registered_at)
VALUES ('bq.weird', 'weird', 'bigquery', NULL, NULL, 'remote', current_timestamp)"""
    repo.conn.execute(insert_null_path_row)

    # Whatever `lower(NULL)=lower('x')` evaluates to in DuckDB, a lookup with
    # real values must not come back with the NULL row.
    lookup = repo.find_by_bq_path("foo", "bar")
    assert lookup is None