Since 0.47.0 GET /api/v2/catalog enriched each remote BigQuery row by fetching INFORMATION_SCHEMA.TABLE_STORAGE + COLUMNS through the DuckDB BigQuery extension *inside the request*. On cold caches that fanned out to O(N) sequential BQ jobs-API roundtrips — easily 90 s+ on partitioned / view-backed tables — and reliably blew the CLI's 30 s httpx ReadTimeout. Reproduced with py-spy: three AnyIO worker threads stuck inside connectors/bigquery/metadata._fetch_via_legacy_tables.

Refactor: enrichment is read exclusively from a new persistent bq_metadata_cache DuckDB table (schema v40), populated by a scheduler-driven refresh job at SCHEDULER_BQ_METADATA_REFRESH_INTERVAL (default 4 h). Cold catalog response on a fresh container is now tens of milliseconds with metadata_freshness=never_fetched for unwarmed rows.

New surface:
- POST /api/admin/run-bq-metadata-refresh (scheduler-driven, full)
- POST /api/v2/metadata-cache/refresh?table=<id> (admin, single)
- GET /api/v2/metadata-cache/status (auth, non-admin)
- metadata_freshness field per catalog row

Removed (internal API): v2_catalog._size_hint_for_row, _resolve_remote_metadata, _metadata_provider_for, _build_metadata_request, _materialized_size_hint, in-memory _metadata_cache. Response shape unchanged for external consumers.

991 tests passing; 2 pre-existing failures (test_db v3→v4 ladder, test_cli_binary_rename) unrelated to this change.
190 lines
6.2 KiB
Python
190 lines
6.2 KiB
Python
"""Catalog endpoint integration: per-table metadata enrichment for
|
|
remote rows.
|
|
|
|
Post-0.50 the catalog endpoint reads enrichment fields exclusively from
|
|
the persistent ``bq_metadata_cache`` table (populated by the scheduler-
|
|
driven refresh in ``app/api/bq_metadata_refresh.py``). These tests
|
|
pre-seed cache rows and verify the catalog response shape; they do NOT
stub ``connectors.bigquery.metadata.fetch`` to supply data because that
path is no longer reachable from the catalog request. Where a test does
patch it, the patch exists only to assert that it is never called.
|
|
"""
|
|
|
|
from unittest.mock import patch
|
|
|
|
|
|
def _register_table(seeded_app, **kwargs):
    """Add one table to the test registry through TableRegistryRepository.

    ``seeded_app`` is accepted but not read here. All remaining keyword
    arguments are forwarded to ``TableRegistryRepository.register``. When
    no explicit ``name`` is given, the value of ``id`` (if any) is used
    as the name.
    """
    from src.db import get_system_db
    from src.repositories.table_registry import TableRegistryRepository

    connection = get_system_db()
    try:
        registry = TableRegistryRepository(connection)
        # ``id`` stays in kwargs; only ``name`` is pulled out so that it
        # can fall back to the id when the caller did not provide one.
        table_name = kwargs.pop("name", kwargs.get("id"))
        registry.register(name=table_name, **kwargs)
    finally:
        # Always release the connection, even when registration fails.
        connection.close()
|
|
|
|
|
|
def _seed_cache_row(
    table_id: str,
    *,
    rows=None,
    size_bytes=None,
    partition_by=None,
    clustered_by=None,
):
    """Write a successful-refresh entry for ``table_id`` into bq_metadata_cache.

    The keyword-only arguments are passed unchanged to
    ``BqMetadataCacheRepository.upsert_success``; each defaults to ``None``.
    """
    from src.db import get_system_db
    from src.repositories.bq_metadata_cache import BqMetadataCacheRepository

    enrichment = {
        "rows": rows,
        "size_bytes": size_bytes,
        "partition_by": partition_by,
        "clustered_by": clustered_by,
    }

    connection = get_system_db()
    try:
        cache_repo = BqMetadataCacheRepository(connection)
        cache_repo.upsert_success(table_id, **enrichment)
    finally:
        # Always release the connection, even when the upsert fails.
        connection.close()
|
|
|
|
|
|
def _reset_catalog_caches():
    """Empty ``v2_catalog._table_rows_cache`` before a test runs."""
    from app.api import v2_catalog as catalog_module

    catalog_module._table_rows_cache.clear()
|
|
|
|
|
|
def test_remote_row_includes_metadata_fields(seeded_app):
    """A query_mode='remote' BQ row exposes the seeded cache values.

    The row's ``rows``, ``size_bytes``, ``partition_by`` and
    ``clustered_by`` must match what was written to the persistent cache,
    and ``metadata_freshness`` must be ``"fresh"``.
    """
    _reset_catalog_caches()

    client = seeded_app["client"]
    admin_token = seeded_app["admin_token"]

    registration = dict(
        id="orders",
        source_type="bigquery",
        bucket="dwh_base",
        source_table="orders_2024",
        query_mode="remote",
    )
    _register_table(seeded_app, **registration)
    _seed_cache_row(
        "orders",
        rows=10000,
        size_bytes=2_000_000,
        partition_by="event_date",
        clustered_by=["country", "platform"],
    )

    auth_headers = {"Authorization": f"Bearer {admin_token}"}
    response = client.get("/api/v2/catalog", headers=auth_headers)
    assert response.status_code == 200, response.text

    catalog_tables = response.json()["tables"]
    orders = next(entry for entry in catalog_tables if entry["id"] == "orders")

    # Checked in a fixed order: seeded enrichment first, then the
    # registry-derived and freshness fields.
    expected_fields = (
        ("rows", 10000),
        ("size_bytes", 2_000_000),
        ("partition_by", "event_date"),
        ("clustered_by", ["country", "platform"]),
        ("query_mode", "remote"),
        ("metadata_freshness", "fresh"),
    )
    for field, expected in expected_fields:
        assert orders[field] == expected
|
|
|
|
|
|
def test_remote_row_with_no_cache_returns_null_fields(seeded_app):
    """A remote row without a cache entry yields empty enrichment.

    This models first boot before the scheduler has ticked: the request
    MUST still answer 200, MUST NOT call BQ, and the row reports null
    fields with metadata_freshness='never_fetched'.
    """
    _reset_catalog_caches()

    client = seeded_app["client"]
    admin_token = seeded_app["admin_token"]
    _register_table(
        seeded_app,
        id="cold_t",
        source_type="bigquery",
        bucket="dwh_base",
        source_table="cold_t",
        query_mode="remote",
    )

    auth_headers = {"Authorization": f"Bearer {admin_token}"}

    # The BQ provider is patched purely as a tripwire: any call made while
    # serving the request is recorded and fails the assertion below.
    with patch("connectors.bigquery.metadata.fetch") as bq_fetch:
        response = client.get("/api/v2/catalog", headers=auth_headers)
        assert response.status_code == 200, response.text
        bq_fetch.assert_not_called()

    catalog_tables = response.json()["tables"]
    cold = next(entry for entry in catalog_tables if entry["id"] == "cold_t")

    for nullable_field in ("rows", "size_bytes", "partition_by"):
        assert cold[nullable_field] is None
    # clustered_by is expected as an empty list rather than null.
    assert cold["clustered_by"] == []
    assert cold["metadata_freshness"] == "never_fetched"
|
|
|
|
|
|
def test_local_row_metadata_freshness_is_not_applicable(seeded_app):
    """A query_mode='local' row reports metadata_freshness='not_applicable'.

    Local rows take the parquet-stat path, so the freshness field signals
    that the BQ cache concept does not apply to them.
    """
    _reset_catalog_caches()

    client = seeded_app["client"]
    admin_token = seeded_app["admin_token"]
    _register_table(
        seeded_app,
        id="users",
        source_type="keboola",
        bucket="in.c-crm",
        source_table="users",
        query_mode="local",
    )

    auth_headers = {"Authorization": f"Bearer {admin_token}"}
    response = client.get("/api/v2/catalog", headers=auth_headers)
    assert response.status_code == 200, response.text

    catalog_tables = response.json()["tables"]
    users = next(entry for entry in catalog_tables if entry["id"] == "users")
    assert users["metadata_freshness"] == "not_applicable"
|
|
|
|
|
|
def test_zero_size_bytes_reports_small_not_unknown(seeded_app):
    """A cached size_bytes of 0 still maps to rough_size_hint='small'.

    Regression from Devin Review #1, kept across the refactor: a zero
    size must not be treated as missing and surfaced as None.
    """
    _reset_catalog_caches()

    client = seeded_app["client"]
    admin_token = seeded_app["admin_token"]
    _register_table(
        seeded_app,
        id="empty_t",
        source_type="bigquery",
        bucket="dwh_base",
        source_table="empty_t",
        query_mode="remote",
    )
    # Zero rows and zero bytes: the case this regression test exists for.
    _seed_cache_row("empty_t", rows=0, size_bytes=0, clustered_by=[])

    auth_headers = {"Authorization": f"Bearer {admin_token}"}
    response = client.get("/api/v2/catalog", headers=auth_headers)
    assert response.status_code == 200, response.text

    catalog_tables = response.json()["tables"]
    empty = next(entry for entry in catalog_tables if entry["id"] == "empty_t")
    assert empty["size_bytes"] == 0
    assert empty["rough_size_hint"] == "small"
|
|
|
|
|
|
def test_catalog_request_never_calls_bq(seeded_app):
    """GET /api/v2/catalog MUST NOT touch the BQ provider.

    The whole point of the refactor: even with a cold cache and a remote
    BQ row in the registry, serving the catalog must never reach
    ``connectors.bigquery.metadata.fetch``. Regressing this re-introduces
    the >90 s hang.

    Each response is also checked for a 200 status and for the presence
    of the remote row. Without that, a request rejected or failed before
    the enrichment step would let ``assert_not_called()`` pass vacuously.
    """
    _reset_catalog_caches()

    c = seeded_app["client"]
    token = seeded_app["admin_token"]
    _register_table(
        seeded_app,
        id="orders", source_type="bigquery", bucket="dwh_base",
        source_table="orders_2024", query_mode="remote",
    )

    headers = {"Authorization": f"Bearer {token}"}
    with patch("connectors.bigquery.metadata.fetch") as mock_fetch:
        # Two consecutive requests, as in the original test — presumably to
        # cover both an empty and a populated _table_rows_cache (TODO confirm).
        for _ in range(2):
            r = c.get("/api/v2/catalog", headers=headers)
            assert r.status_code == 200, r.text
            # Prove the remote row was actually served, so the "never
            # called" assertion below covers the enrichment path.
            assert any(t["id"] == "orders" for t in r.json()["tables"])

    mock_fetch.assert_not_called()
|