"""Catalog endpoint integration: per-table metadata enrichment for remote rows. Post-0.50 the catalog endpoint reads enrichment fields exclusively from the persistent ``bq_metadata_cache`` table (populated by the scheduler- driven refresh in ``app/api/bq_metadata_refresh.py``). These tests pre-seed cache rows and verify the catalog response shape; they do NOT mock ``connectors.bigquery.metadata.fetch`` because that path is no longer reachable from the catalog request. """ from unittest.mock import patch def _register_table(seeded_app, **kwargs): """Register a table into the test DB using TableRegistryRepository.""" from src.db import get_system_db from src.repositories.table_registry import TableRegistryRepository conn = get_system_db() try: repo = TableRegistryRepository(conn) name = kwargs.pop("name", kwargs.get("id")) repo.register(name=name, **kwargs) finally: conn.close() def _seed_cache_row( table_id: str, *, rows=None, size_bytes=None, partition_by=None, clustered_by=None, ): """Insert a successful refresh row into bq_metadata_cache.""" from src.db import get_system_db from src.repositories.bq_metadata_cache import BqMetadataCacheRepository conn = get_system_db() try: BqMetadataCacheRepository(conn).upsert_success( table_id, rows=rows, size_bytes=size_bytes, partition_by=partition_by, clustered_by=clustered_by, ) finally: conn.close() def _reset_catalog_caches(): from app.api import v2_catalog v2_catalog._table_rows_cache.clear() def test_remote_row_includes_metadata_fields(seeded_app): """Catalog response for a query_mode='remote' BQ row carries the four enrichment fields read from the persistent cache.""" _reset_catalog_caches() c = seeded_app["client"] token = seeded_app["admin_token"] _register_table( seeded_app, id="orders", source_type="bigquery", bucket="dwh_base", source_table="orders_2024", query_mode="remote", ) _seed_cache_row( "orders", rows=10000, size_bytes=2_000_000, partition_by="event_date", clustered_by=["country", "platform"], ) r = c.get( "/api/v2/catalog", headers={"Authorization": f"Bearer {token}"}, ) assert r.status_code == 200, r.text tables = r.json()["tables"] orders = next(t for t in tables if t["id"] == "orders") assert orders["rows"] == 10000 assert orders["size_bytes"] == 2_000_000 assert orders["partition_by"] == "event_date" assert orders["clustered_by"] == ["country", "platform"] assert orders["query_mode"] == "remote" assert orders["metadata_freshness"] == "fresh" def test_remote_row_with_no_cache_returns_null_fields(seeded_app): """Catalog response for a remote row with no cache entry — first boot before scheduler tick — returns null enrichment fields and metadata_freshness='never_fetched'. MUST stay 200; MUST NOT call BQ.""" _reset_catalog_caches() c = seeded_app["client"] token = seeded_app["admin_token"] _register_table( seeded_app, id="cold_t", source_type="bigquery", bucket="dwh_base", source_table="cold_t", query_mode="remote", ) # Patch the BQ provider so we can prove the request path never reaches it. with patch("connectors.bigquery.metadata.fetch") as mock_fetch: r = c.get( "/api/v2/catalog", headers={"Authorization": f"Bearer {token}"}, ) assert r.status_code == 200, r.text mock_fetch.assert_not_called() tables = r.json()["tables"] cold = next(t for t in tables if t["id"] == "cold_t") assert cold["rows"] is None assert cold["size_bytes"] is None assert cold["partition_by"] is None assert cold["clustered_by"] == [] assert cold["metadata_freshness"] == "never_fetched" def test_local_row_metadata_freshness_is_not_applicable(seeded_app): """query_mode='local' rows take the parquet-stat path; the freshness field signals that the BQ cache concept doesn't apply.""" _reset_catalog_caches() c = seeded_app["client"] token = seeded_app["admin_token"] _register_table( seeded_app, id="users", source_type="keboola", bucket="in.c-crm", source_table="users", query_mode="local", ) r = c.get( "/api/v2/catalog", headers={"Authorization": f"Bearer {token}"}, ) assert r.status_code == 200, r.text tables = r.json()["tables"] users = next(t for t in tables if t["id"] == "users") assert users["metadata_freshness"] == "not_applicable" def test_zero_size_bytes_reports_small_not_unknown(seeded_app): """Devin Review #1 regression preserved across the refactor: a cache row with size_bytes=0 must surface rough_size_hint='small', not None. """ _reset_catalog_caches() c = seeded_app["client"] token = seeded_app["admin_token"] _register_table( seeded_app, id="empty_t", source_type="bigquery", bucket="dwh_base", source_table="empty_t", query_mode="remote", ) _seed_cache_row("empty_t", rows=0, size_bytes=0, clustered_by=[]) r = c.get( "/api/v2/catalog", headers={"Authorization": f"Bearer {token}"}, ) assert r.status_code == 200, r.text tables = r.json()["tables"] empty = next(t for t in tables if t["id"] == "empty_t") assert empty["size_bytes"] == 0 assert empty["rough_size_hint"] == "small" def test_catalog_request_never_calls_bq(seeded_app): """The whole point of the refactor: even with a cold cache and a remote BQ row in the registry, GET /api/v2/catalog MUST NOT touch the BQ provider. Regressing this re-introduces the >90 s hang.""" _reset_catalog_caches() c = seeded_app["client"] token = seeded_app["admin_token"] _register_table( seeded_app, id="orders", source_type="bigquery", bucket="dwh_base", source_table="orders_2024", query_mode="remote", ) with patch("connectors.bigquery.metadata.fetch") as mock_fetch: c.get("/api/v2/catalog", headers={"Authorization": f"Bearer {token}"}) c.get("/api/v2/catalog", headers={"Authorization": f"Bearer {token}"}) mock_fetch.assert_not_called()