agnes-the-ai-analyst/tests/test_v2_catalog_remote_metadata.py
ZdenekSrotyr b3841f5b6c release: 0.50.0 — persistent BQ metadata cache + scheduled refresh; catalog never blocks on BigQuery
Since 0.47.0 GET /api/v2/catalog enriched each remote BigQuery row by
fetching INFORMATION_SCHEMA.TABLE_STORAGE + COLUMNS through the DuckDB
BigQuery extension *inside the request*. On cold caches that fanned out
to O(N) sequential BQ jobs-API roundtrips — easily 90 s+ on partitioned
/ view-backed tables — and reliably blew the CLI's 30 s httpx
ReadTimeout. Reproduced with py-spy: three AnyIO worker threads stuck
inside connectors/bigquery/metadata._fetch_via_legacy_tables.

Refactor: enrichment is read exclusively from a new persistent
bq_metadata_cache DuckDB table (schema v40), populated by a scheduler-
driven refresh job at SCHEDULER_BQ_METADATA_REFRESH_INTERVAL (default
4 h). Cold catalog response on a fresh container is now tens of
milliseconds with metadata_freshness=never_fetched for unwarmed rows.

New surface:
  - POST /api/admin/run-bq-metadata-refresh (scheduler-driven, full)
  - POST /api/v2/metadata-cache/refresh?table=<id> (admin, single)
  - GET  /api/v2/metadata-cache/status (auth, non-admin)
  - metadata_freshness field per catalog row

Removed (internal API): v2_catalog._size_hint_for_row,
_resolve_remote_metadata, _metadata_provider_for,
_build_metadata_request, _materialized_size_hint, in-memory
_metadata_cache. Response shape unchanged for external consumers.

991 tests passing; 2 pre-existing failures (test_db v3→v4 ladder,
test_cli_binary_rename) unrelated to this change.
2026-05-11 20:37:17 +02:00

190 lines
6.2 KiB
Python

"""Catalog endpoint integration: per-table metadata enrichment for
remote rows.
Post-0.50 the catalog endpoint reads enrichment fields exclusively from
the persistent ``bq_metadata_cache`` table (populated by the scheduler-
driven refresh in ``app/api/bq_metadata_refresh.py``). These tests
pre-seed cache rows and verify the catalog response shape; they do NOT
stub ``connectors.bigquery.metadata.fetch`` to supply data, because that
path is no longer reachable from the catalog request. Two tests patch it
solely to assert that it is never called.
"""
from unittest.mock import patch
def _register_table(seeded_app, **kwargs):
    """Register a table in the test system DB via TableRegistryRepository.

    ``seeded_app`` is accepted but not read by this helper (presumably it
    only guarantees the fixture has been set up -- confirm against the
    fixture). The registry ``name`` is taken from the ``name`` keyword when
    given; otherwise it falls back to the value of ``id``, which is still
    forwarded to ``register`` alongside the remaining keywords.
    """
    from src.db import get_system_db
    from src.repositories.table_registry import TableRegistryRepository

    connection = get_system_db()
    try:
        registry = TableRegistryRepository(connection)
        if "name" in kwargs:
            table_name = kwargs.pop("name")
        else:
            table_name = kwargs.get("id")
        registry.register(name=table_name, **kwargs)
    finally:
        # Always release the DB handle, even when registration raises.
        connection.close()
def _seed_cache_row(
    table_id: str,
    *,
    rows=None,
    size_bytes=None,
    partition_by=None,
    clustered_by=None,
):
    """Insert a successful refresh row into bq_metadata_cache.

    All enrichment values are keyword-only and default to ``None``; they
    are forwarded unchanged to ``BqMetadataCacheRepository.upsert_success``.
    """
    from src.db import get_system_db
    from src.repositories.bq_metadata_cache import BqMetadataCacheRepository

    enrichment = {
        "rows": rows,
        "size_bytes": size_bytes,
        "partition_by": partition_by,
        "clustered_by": clustered_by,
    }
    connection = get_system_db()
    try:
        cache_repo = BqMetadataCacheRepository(connection)
        cache_repo.upsert_success(table_id, **enrichment)
    finally:
        # Always release the DB handle, even when the upsert raises.
        connection.close()
def _reset_catalog_caches():
    """Empty the catalog module's in-memory ``_table_rows_cache``."""
    from app.api import v2_catalog

    rows_cache = v2_catalog._table_rows_cache
    rows_cache.clear()
def test_remote_row_includes_metadata_fields(seeded_app):
    """A query_mode='remote' BQ row exposes the enrichment fields stored in
    the persistent cache, and reports metadata_freshness='fresh'."""
    _reset_catalog_caches()
    client = seeded_app["client"]
    admin_token = seeded_app["admin_token"]

    _register_table(
        seeded_app,
        id="orders",
        source_type="bigquery",
        bucket="dwh_base",
        source_table="orders_2024",
        query_mode="remote",
    )
    _seed_cache_row(
        "orders",
        rows=10000,
        size_bytes=2_000_000,
        partition_by="event_date",
        clustered_by=["country", "platform"],
    )

    auth_headers = {"Authorization": f"Bearer {admin_token}"}
    response = client.get("/api/v2/catalog", headers=auth_headers)
    assert response.status_code == 200, response.text

    catalog_tables = response.json()["tables"]
    entry = next(t for t in catalog_tables if t["id"] == "orders")

    # Field -> value the catalog entry must carry for the seeded row.
    expected = {
        "rows": 10000,
        "size_bytes": 2_000_000,
        "partition_by": "event_date",
        "clustered_by": ["country", "platform"],
        "query_mode": "remote",
        "metadata_freshness": "fresh",
    }
    for field, wanted in expected.items():
        assert entry[field] == wanted
def test_remote_row_with_no_cache_returns_null_fields(seeded_app):
    """A remote row without any cache entry (first boot, before a scheduler
    tick) yields null enrichment fields and
    metadata_freshness='never_fetched'. The response MUST be 200 and the
    BQ provider MUST NOT be called."""
    _reset_catalog_caches()
    client = seeded_app["client"]
    admin_token = seeded_app["admin_token"]

    _register_table(
        seeded_app,
        id="cold_t",
        source_type="bigquery",
        bucket="dwh_base",
        source_table="cold_t",
        query_mode="remote",
    )

    auth_headers = {"Authorization": f"Bearer {admin_token}"}
    # Replace the BQ provider with a mock so any call from the request path
    # would be recorded and fail the assertion below.
    with patch("connectors.bigquery.metadata.fetch") as bq_fetch:
        response = client.get("/api/v2/catalog", headers=auth_headers)
        assert response.status_code == 200, response.text
        bq_fetch.assert_not_called()

    catalog_tables = response.json()["tables"]
    entry = next(t for t in catalog_tables if t["id"] == "cold_t")

    for null_field in ("rows", "size_bytes", "partition_by"):
        assert entry[null_field] is None
    assert entry["clustered_by"] == []
    assert entry["metadata_freshness"] == "never_fetched"
def test_local_row_metadata_freshness_is_not_applicable(seeded_app):
    """A query_mode='local' row reports metadata_freshness='not_applicable',
    signalling that the BQ cache concept doesn't apply to it."""
    _reset_catalog_caches()
    client = seeded_app["client"]
    admin_token = seeded_app["admin_token"]

    _register_table(
        seeded_app,
        id="users",
        source_type="keboola",
        bucket="in.c-crm",
        source_table="users",
        query_mode="local",
    )

    auth_headers = {"Authorization": f"Bearer {admin_token}"}
    response = client.get("/api/v2/catalog", headers=auth_headers)
    assert response.status_code == 200, response.text

    catalog_tables = response.json()["tables"]
    entry = next(t for t in catalog_tables if t["id"] == "users")
    assert entry["metadata_freshness"] == "not_applicable"
def test_zero_size_bytes_reports_small_not_unknown(seeded_app):
    """Regression guard (Devin Review #1), kept across the refactor: a
    cache row whose size_bytes is 0 must produce rough_size_hint='small'
    rather than None.
    """
    _reset_catalog_caches()
    client = seeded_app["client"]
    admin_token = seeded_app["admin_token"]

    _register_table(
        seeded_app,
        id="empty_t",
        source_type="bigquery",
        bucket="dwh_base",
        source_table="empty_t",
        query_mode="remote",
    )
    _seed_cache_row("empty_t", rows=0, size_bytes=0, clustered_by=[])

    auth_headers = {"Authorization": f"Bearer {admin_token}"}
    response = client.get("/api/v2/catalog", headers=auth_headers)
    assert response.status_code == 200, response.text

    catalog_tables = response.json()["tables"]
    entry = next(t for t in catalog_tables if t["id"] == "empty_t")
    # A zero size is a known value; it must be reported as 0, not dropped.
    assert entry["size_bytes"] == 0
    assert entry["rough_size_hint"] == "small"
def test_catalog_request_never_calls_bq(seeded_app):
    """The whole point of the refactor: even with a cold cache and a
    remote BQ row in the registry, GET /api/v2/catalog MUST NOT touch
    the BQ provider. Regressing this re-introduces the >90 s hang.

    Each response is also required to be 200. Without that check a request
    that fails early (auth error, server error) would never reach the
    provider and the "not called" assertion would pass vacuously.
    """
    _reset_catalog_caches()
    client = seeded_app["client"]
    admin_token = seeded_app["admin_token"]

    _register_table(
        seeded_app,
        id="orders",
        source_type="bigquery",
        bucket="dwh_base",
        source_table="orders_2024",
        query_mode="remote",
    )

    auth_headers = {"Authorization": f"Bearer {admin_token}"}
    with patch("connectors.bigquery.metadata.fetch") as bq_fetch:
        # Issue the request twice -- presumably to cover both a cold and a
        # warm in-process row cache (confirm against v2_catalog).
        for _ in range(2):
            response = client.get("/api/v2/catalog", headers=auth_headers)
            assert response.status_code == 200, response.text
        bq_fetch.assert_not_called()