agnes-the-ai-analyst/tests/test_v2_catalog_remote_metadata.py
ZdenekSrotyr b6cdd68e8d feat(catalog): entity_type + validated where_examples + view-aware cost-guard + scheduler hygiene
Three behavioural improvements driven by the sub-agent end-to-end test
findings, plus scheduler tweaks to prevent the post-deploy contention
burst we measured.

CATALOG (catalog-side bugs the test agents tripped on):
  - new entity_type field per remote row (BASE TABLE / VIEW /
    MATERIALIZED VIEW). For views, rows + size_bytes return null
    instead of the misleading 0 that __TABLES__ reports.
  - where_examples now validates against the table's actual schema
    (cached known_columns from refresh). The pre-fix behavior
    blindly advertised `country_code = 'CZ'` on tables with no
    country_code column — the sub-agent tests reliably hit this on
    unit_economics.
  - new known_columns + entity_type columns on bq_metadata_cache;
    populated by bq_metadata_refresh.refresh_one from the same
    fetch_bq_columns_full call (no extra BQ roundtrip) plus a
    cheap INFORMATION_SCHEMA.TABLES lookup for table_type.

QUERY COST-GUARD:
  - remote_scan_too_large suggestion now names views explicitly:
    `Target(s) <ids> are VIEW or MATERIALIZED VIEW. BigQuery does
    not push LIMIT into the view body — SELECT * FROM <view>
    LIMIT 1 still runs the full underlying scan.` Programmatic
    consumers get a new view_targets field on the error detail.

SCHEDULER HYGIENE (the post-deploy 1-minute window where
concurrent parquet downloads dropped to ~1 MB/s):
  - SCHEDULER_STARTUP_GRACE_SECONDS (default 60) holds the first
    tick so the burst doesn't overlap cache_warmup writes.
  - SCHEDULER_BQ_METADATA_INITIAL_OFFSET_MAX_SECONDS (default 900)
    randomises bq-metadata-refresh's first-fire offset.

TESTS:
  - test_bq_metadata_cache_repo: entity_type + known_columns round-trip
  - test_v2_catalog_remote_metadata: where_examples validation, views
    return null rows/size_bytes, cold rows have empty examples
  - test_api_query_guardrail: VIEW-aware suggestion text + view_targets
  - test_connectors_bigquery_metadata: entity_type lookup mock + new
    fields in TableMetadata expectations
  - test_scheduler_sidecar: grace + jitter env-var resolution
2026-05-12 10:37:35 +02:00

293 lines
10 KiB
Python

"""Catalog endpoint integration: per-table metadata enrichment for
remote rows.
Post-0.50 the catalog endpoint reads enrichment fields exclusively from
the persistent ``bq_metadata_cache`` table (populated by the scheduler-
driven refresh in ``app/api/bq_metadata_refresh.py``). These tests
pre-seed cache rows and verify the catalog response shape; they do NOT
stub ``connectors.bigquery.metadata.fetch`` to supply metadata, because
that path is no longer reachable from the catalog request. The only
patches of it are guards asserting that a request never calls it.
"""
from unittest.mock import patch
def _register_table(seeded_app, **kwargs):
    """Add one table to the system DB through ``TableRegistryRepository``.

    ``kwargs`` are forwarded to ``register``. When no ``name`` is given, the
    value of ``id`` (if any) is used as the name. ``seeded_app`` is accepted
    but not read here — presumably it ties the call to the fixture that
    prepares the DB; verify against the fixture.
    """
    from src.db import get_system_db
    from src.repositories.table_registry import TableRegistryRepository

    # Fall back to the table id when the caller did not name the table.
    table_name = kwargs.pop("name", kwargs.get("id"))
    db = get_system_db()
    try:
        TableRegistryRepository(db).register(name=table_name, **kwargs)
    finally:
        # Always release the connection, even if registration fails.
        db.close()
def _seed_cache_row(
    table_id: str,
    *,
    rows=None,
    size_bytes=None,
    partition_by=None,
    clustered_by=None,
    entity_type=None,
    known_columns=None,
):
    """Write a successful-refresh entry for ``table_id`` to bq_metadata_cache.

    All keyword arguments are handed unchanged to
    ``BqMetadataCacheRepository.upsert_success``.
    """
    from src.db import get_system_db
    from src.repositories.bq_metadata_cache import BqMetadataCacheRepository

    cache_fields = {
        "rows": rows,
        "size_bytes": size_bytes,
        "partition_by": partition_by,
        "clustered_by": clustered_by,
        "entity_type": entity_type,
        "known_columns": known_columns,
    }
    db = get_system_db()
    try:
        BqMetadataCacheRepository(db).upsert_success(table_id, **cache_fields)
    finally:
        # Always release the connection, even if the upsert fails.
        db.close()
def _reset_catalog_caches():
    """Clear ``v2_catalog._table_rows_cache`` before a test hits the catalog."""
    # Called at the start of every test — presumably so values cached by an
    # earlier test cannot leak into the next one.
    from app.api import v2_catalog as catalog_module

    catalog_module._table_rows_cache.clear()
def test_remote_row_includes_metadata_fields(seeded_app):
    """A cached ``query_mode='remote'`` BQ row exposes its enrichment fields.

    Seeds a ``bq_metadata_cache`` row and checks that the catalog response
    returns the seeded rows, size_bytes, partition_by, clustered_by and
    entity_type, reports ``metadata_freshness == 'fresh'``, and only
    advertises where_examples whose columns appear in ``known_columns``.
    """
    _reset_catalog_caches()
    client = seeded_app["client"]
    token = seeded_app["admin_token"]
    _register_table(
        seeded_app,
        id="orders", source_type="bigquery", bucket="dwh_base",
        source_table="orders_2024", query_mode="remote",
    )
    _seed_cache_row(
        "orders",
        rows=10000, size_bytes=2_000_000,
        partition_by="event_date", clustered_by=["country", "platform"],
        entity_type="BASE TABLE",
        known_columns=["event_date", "country", "platform", "amount"],
    )
    response = client.get(
        "/api/v2/catalog",
        headers={"Authorization": f"Bearer {token}"},
    )
    assert response.status_code == 200, response.text
    tables = response.json()["tables"]
    orders = next(t for t in tables if t["id"] == "orders")
    assert orders["rows"] == 10000
    assert orders["size_bytes"] == 2_000_000
    assert orders["partition_by"] == "event_date"
    assert orders["clustered_by"] == ["country", "platform"]
    assert orders["query_mode"] == "remote"
    assert orders["metadata_freshness"] == "fresh"
    assert orders["entity_type"] == "BASE TABLE"
    # Only the event_date template applies. The table has `country` and
    # `platform` columns but no `country_code`, so the country_code/platform
    # template must be filtered out.
    assert "event_date > DATE '2026-01-01'" in orders["where_examples"]
    assert "country_code = 'CZ' AND platform = 'web'" not in orders["where_examples"]
def test_where_examples_filtered_against_real_columns(seeded_app):
    """Example templates that name columns the table lacks are left out.

    The seeded table mimics unit_economics: its known columns include
    ``event_date`` but not ``country_code``, so the event_date example is
    kept and nothing mentioning country_code may be advertised. This is the
    pre-fix bug the suite is meant to catch.
    """
    _reset_catalog_caches()
    client, admin_token = seeded_app["client"], seeded_app["admin_token"]
    _register_table(
        seeded_app,
        id="ue_like",
        source_type="bigquery",
        bucket="dwh_base",
        source_table="unit_economics",
        query_mode="remote",
    )
    _seed_cache_row(
        "ue_like",
        rows=None,
        size_bytes=None,
        partition_by="event_date",
        clustered_by=[],
        entity_type="VIEW",
        # Actual schema: has event_date, lacks country_code.
        known_columns=["event_date", "order_event_id", "merchant_country"],
    )
    auth_headers = {"Authorization": f"Bearer {admin_token}"}
    response = client.get("/api/v2/catalog", headers=auth_headers)
    assert response.status_code == 200, response.text
    entry = next(t for t in response.json()["tables"] if t["id"] == "ue_like")
    examples = entry["where_examples"]
    # The event_date column exists, so its example survives validation.
    assert "event_date > DATE '2026-01-01'" in examples
    # country_code is missing, so no example may reference it.
    assert not any("country_code" in example for example in examples)
def test_view_returns_null_rows_and_size_bytes(seeded_app):
    """A VIEW row reports null rows, size_bytes and rough_size_hint.

    The cache row is seeded with ``rows=None`` / ``size_bytes=None``, which
    mirrors what the provider stores for views. Cache rows written before
    the entity_type field existed are expected to correct themselves on the
    next refresh; that path is not exercised here.
    """
    _reset_catalog_caches()
    client, admin_token = seeded_app["client"], seeded_app["admin_token"]
    _register_table(
        seeded_app,
        id="ue_view",
        source_type="bigquery",
        bucket="dwh_base",
        source_table="ue_view",
        query_mode="remote",
    )
    # Same contract as the provider: views carry no row count or byte size.
    _seed_cache_row(
        "ue_view",
        rows=None,
        size_bytes=None,
        partition_by=None,
        clustered_by=[],
        entity_type="VIEW",
        known_columns=["event_date"],
    )
    auth_headers = {"Authorization": f"Bearer {admin_token}"}
    response = client.get("/api/v2/catalog", headers=auth_headers)
    assert response.status_code == 200, response.text
    entry = next(t for t in response.json()["tables"] if t["id"] == "ue_view")
    assert entry["entity_type"] == "VIEW"
    assert entry["rows"] is None
    assert entry["size_bytes"] is None
    assert entry["rough_size_hint"] is None
def test_where_examples_empty_when_columns_unknown(seeded_app):
    """A remote row without any cache entry advertises no where_examples.

    With the schema unknown (``never_fetched``) there is nothing to validate
    examples against, so the list must be empty and entity_type null.
    """
    _reset_catalog_caches()
    client, admin_token = seeded_app["client"], seeded_app["admin_token"]
    # Register only; deliberately no cache row is seeded for this table.
    _register_table(
        seeded_app,
        id="unfetched",
        source_type="bigquery",
        bucket="dwh_base",
        source_table="unfetched",
        query_mode="remote",
    )
    auth_headers = {"Authorization": f"Bearer {admin_token}"}
    response = client.get("/api/v2/catalog", headers=auth_headers)
    assert response.status_code == 200, response.text
    entry = next(t for t in response.json()["tables"] if t["id"] == "unfetched")
    assert entry["metadata_freshness"] == "never_fetched"
    assert entry["where_examples"] == []
    assert entry["entity_type"] is None
def test_remote_row_with_no_cache_returns_null_fields(seeded_app):
    """A remote row with no cache entry yields null enrichment fields.

    This is the first-boot state before any scheduler tick. The request
    must still answer 200, report ``metadata_freshness='never_fetched'``,
    and must not call the BigQuery metadata provider.
    """
    _reset_catalog_caches()
    client, admin_token = seeded_app["client"], seeded_app["admin_token"]
    _register_table(
        seeded_app,
        id="cold_t",
        source_type="bigquery",
        bucket="dwh_base",
        source_table="cold_t",
        query_mode="remote",
    )
    auth_headers = {"Authorization": f"Bearer {admin_token}"}
    # The provider is patched purely as a tripwire: any call to it during the
    # request would be recorded on the mock.
    with patch("connectors.bigquery.metadata.fetch") as mock_fetch:
        response = client.get("/api/v2/catalog", headers=auth_headers)
    assert response.status_code == 200, response.text
    mock_fetch.assert_not_called()
    entry = next(t for t in response.json()["tables"] if t["id"] == "cold_t")
    assert entry["rows"] is None
    assert entry["size_bytes"] is None
    assert entry["partition_by"] is None
    assert entry["clustered_by"] == []
    assert entry["metadata_freshness"] == "never_fetched"
def test_local_row_metadata_freshness_is_not_applicable(seeded_app):
    """A ``query_mode='local'`` row reports freshness 'not_applicable'.

    Local rows go through the parquet-stat path, so the BQ metadata cache
    concept does not apply to them.
    """
    _reset_catalog_caches()
    client, admin_token = seeded_app["client"], seeded_app["admin_token"]
    _register_table(
        seeded_app,
        id="users",
        source_type="keboola",
        bucket="in.c-crm",
        source_table="users",
        query_mode="local",
    )
    auth_headers = {"Authorization": f"Bearer {admin_token}"}
    response = client.get("/api/v2/catalog", headers=auth_headers)
    assert response.status_code == 200, response.text
    entry = next(t for t in response.json()["tables"] if t["id"] == "users")
    assert entry["metadata_freshness"] == "not_applicable"
def test_zero_size_bytes_reports_small_not_unknown(seeded_app):
    """A cached ``size_bytes=0`` maps to rough_size_hint 'small', not None.

    Regression guard carried over from Devin Review #1 across the refactor.
    """
    _reset_catalog_caches()
    client, admin_token = seeded_app["client"], seeded_app["admin_token"]
    _register_table(
        seeded_app,
        id="empty_t",
        source_type="bigquery",
        bucket="dwh_base",
        source_table="empty_t",
        query_mode="remote",
    )
    # A genuinely empty base table: zero rows and zero bytes, both non-null.
    _seed_cache_row(
        "empty_t",
        rows=0,
        size_bytes=0,
        clustered_by=[],
        entity_type="BASE TABLE",
        known_columns=["event_date"],
    )
    auth_headers = {"Authorization": f"Bearer {admin_token}"}
    response = client.get("/api/v2/catalog", headers=auth_headers)
    assert response.status_code == 200, response.text
    entry = next(t for t in response.json()["tables"] if t["id"] == "empty_t")
    assert entry["size_bytes"] == 0
    assert entry["rough_size_hint"] == "small"
def test_catalog_request_never_calls_bq(seeded_app):
    """GET /api/v2/catalog must not touch the BQ provider, even when cold.

    With a remote BQ row registered and no cache entry, two consecutive
    catalog requests are made while the provider is patched. Both must
    succeed and the provider must never be called; regressing this
    re-introduces the >90 s hang.
    """
    _reset_catalog_caches()
    client = seeded_app["client"]
    token = seeded_app["admin_token"]
    headers = {"Authorization": f"Bearer {token}"}
    _register_table(
        seeded_app,
        id="orders", source_type="bigquery", bucket="dwh_base",
        source_table="orders_2024", query_mode="remote",
    )
    with patch("connectors.bigquery.metadata.fetch") as mock_fetch:
        # Two requests in a row — presumably so a repeat request is covered
        # as well as the very first one.
        responses = [
            client.get("/api/v2/catalog", headers=headers) for _ in range(2)
        ]
    # Without these checks a request that failed before reaching any code
    # that could call BQ would let the test pass vacuously.
    for response in responses:
        assert response.status_code == 200, response.text
    mock_fetch.assert_not_called()