Three behavioural improvements driven by the sub-agent end-to-end test
findings, plus scheduler tweaks to prevent the post-deploy contention
burst we measured.
CATALOG (catalog-side bugs the test agents tripped on):
- new entity_type field per remote row (BASE TABLE / VIEW /
MATERIALIZED VIEW). For views, rows + size_bytes return null
instead of the misleading 0 that __TABLES__ reports.
- where_examples now validates against the table's actual schema
(cached known_columns from refresh). The pre-fix behavior
blindly advertised `country_code = 'CZ'` on tables with no
country_code column — the sub-agent tests reliably hit this on
unit_economics.
- new known_columns + entity_type columns on bq_metadata_cache;
populated by bq_metadata_refresh.refresh_one from the same
fetch_bq_columns_full call (no extra BQ roundtrip) plus a
cheap INFORMATION_SCHEMA.TABLES lookup for table_type.
QUERY COST-GUARD:
- remote_scan_too_large suggestion now names views explicitly:
`Target(s) <ids> are VIEW or MATERIALIZED VIEW. BigQuery does
not push LIMIT into the view body — SELECT * FROM <view>
LIMIT 1 still runs the full underlying scan.` Programmatic
consumers get a new view_targets field on the error detail.
SCHEDULER HYGIENE (the post-deploy 1-minute window where
concurrent parquet downloads dropped to ~1 MB/s):
- SCHEDULER_STARTUP_GRACE_SECONDS (default 60) holds the first
tick so the burst doesn't overlap cache_warmup writes.
- SCHEDULER_BQ_METADATA_INITIAL_OFFSET_MAX_SECONDS (default 900)
randomises bq-metadata-refresh's first-fire offset.
TESTS:
- test_bq_metadata_cache_repo: entity_type + known_columns round-trip
- test_v2_catalog_remote_metadata: where_examples validation, views
return null rows/size_bytes, cold rows have empty examples
- test_api_query_guardrail: VIEW-aware suggestion text + view_targets
- test_connectors_bigquery_metadata: entity_type lookup mock + new
fields in TableMetadata expectations
- test_scheduler_sidecar: grace + jitter env-var resolution
293 lines
10 KiB
Python
293 lines
10 KiB
Python
"""Catalog endpoint integration: per-table metadata enrichment for
remote rows.

Post-0.50 the catalog endpoint reads enrichment fields exclusively from
the persistent ``bq_metadata_cache`` table (populated by the scheduler-
driven refresh in ``app/api/bq_metadata_refresh.py``). These tests
pre-seed cache rows and verify the catalog response shape; they do NOT
mock ``connectors.bigquery.metadata.fetch`` because that path is no
longer reachable from the catalog request.
"""
|
|
|
|
from unittest.mock import patch
|
|
|
|
|
|
def _register_table(seeded_app, **kwargs):
    """Add one row to the table registry of the test system DB.

    ``seeded_app`` is accepted but never read by this helper (presumably it
    only ties the call to the fixture's lifetime -- confirm against the
    fixture definition). All remaining keyword arguments are forwarded to
    ``TableRegistryRepository.register``. When no explicit ``name`` is
    supplied, the value passed as ``id`` (if any) doubles as the name; ``id``
    itself is still forwarded unchanged.
    """
    from src.db import get_system_db
    from src.repositories.table_registry import TableRegistryRepository

    connection = get_system_db()
    try:
        registry = TableRegistryRepository(connection)
        if "name" in kwargs:
            table_name = kwargs.pop("name")
        else:
            # Fall back to the id without removing it from the forwarded kwargs.
            table_name = kwargs.get("id")
        registry.register(name=table_name, **kwargs)
    finally:
        # Always release the connection, even when registration raises.
        connection.close()
|
|
|
|
|
|
def _seed_cache_row(
    table_id: str,
    *,
    rows=None,
    size_bytes=None,
    partition_by=None,
    clustered_by=None,
    entity_type=None,
    known_columns=None,
):
    """Write a successful-refresh entry for ``table_id`` into bq_metadata_cache.

    Every keyword argument is handed through verbatim to
    ``BqMetadataCacheRepository.upsert_success``; anything left out is sent
    as ``None``.
    """
    from src.db import get_system_db
    from src.repositories.bq_metadata_cache import BqMetadataCacheRepository

    cache_fields = {
        "rows": rows,
        "size_bytes": size_bytes,
        "partition_by": partition_by,
        "clustered_by": clustered_by,
        "entity_type": entity_type,
        "known_columns": known_columns,
    }

    connection = get_system_db()
    try:
        cache_repo = BqMetadataCacheRepository(connection)
        cache_repo.upsert_success(table_id, **cache_fields)
    finally:
        # Always release the connection, even when the upsert raises.
        connection.close()
|
|
|
|
|
|
def _reset_catalog_caches():
    """Empty the catalog module's ``_table_rows_cache`` before a test runs."""
    from app.api import v2_catalog as catalog_module

    catalog_module._table_rows_cache.clear()
|
|
|
|
|
|
def test_remote_row_includes_metadata_fields(seeded_app):
    """A remote BigQuery row with a seeded cache entry exposes the cached
    enrichment fields (rows, size_bytes, partition_by, clustered_by,
    entity_type), a 'fresh' metadata_freshness, and where_examples limited
    to columns the table actually has."""
    _reset_catalog_caches()

    client = seeded_app["client"]
    admin_token = seeded_app["admin_token"]

    _register_table(
        seeded_app,
        id="orders",
        source_type="bigquery",
        bucket="dwh_base",
        source_table="orders_2024",
        query_mode="remote",
    )
    _seed_cache_row(
        "orders",
        rows=10000,
        size_bytes=2_000_000,
        partition_by="event_date",
        clustered_by=["country", "platform"],
        entity_type="BASE TABLE",
        known_columns=["event_date", "country", "platform", "amount"],
    )

    response = client.get(
        "/api/v2/catalog",
        headers={"Authorization": f"Bearer {admin_token}"},
    )
    assert response.status_code == 200, response.text

    catalog_tables = response.json()["tables"]
    orders = next(entry for entry in catalog_tables if entry["id"] == "orders")

    expected_fields = {
        "rows": 10000,
        "size_bytes": 2_000_000,
        "partition_by": "event_date",
        "clustered_by": ["country", "platform"],
        "query_mode": "remote",
        "metadata_freshness": "fresh",
        "entity_type": "BASE TABLE",
    }
    for field_name, expected_value in expected_fields.items():
        assert orders[field_name] == expected_value, field_name

    examples = orders["where_examples"]
    # The event_date template applies because event_date is a known column.
    assert "event_date > DATE '2026-01-01'" in examples
    # The country_code template must be dropped: the seeded schema has
    # `country`, not `country_code`.
    assert "country_code = 'CZ' AND platform = 'web'" not in examples
|
|
|
|
|
|
def test_where_examples_filtered_against_real_columns(seeded_app):
    """where_examples that mention a column missing from the cached schema
    are not advertised. The seeded unit_economics-style row has event_date
    but no country_code, so only the event_date example may appear."""
    _reset_catalog_caches()

    client = seeded_app["client"]
    admin_token = seeded_app["admin_token"]

    _register_table(
        seeded_app,
        id="ue_like",
        source_type="bigquery",
        bucket="dwh_base",
        source_table="unit_economics",
        query_mode="remote",
    )
    _seed_cache_row(
        "ue_like",
        rows=None,
        size_bytes=None,
        partition_by="event_date",
        clustered_by=[],
        entity_type="VIEW",
        # Seeded schema: event_date is present, country_code is absent.
        known_columns=["event_date", "order_event_id", "merchant_country"],
    )

    response = client.get(
        "/api/v2/catalog",
        headers={"Authorization": f"Bearer {admin_token}"},
    )
    assert response.status_code == 200, response.text

    catalog_tables = response.json()["tables"]
    row = next(entry for entry in catalog_tables if entry["id"] == "ue_like")
    examples = row["where_examples"]

    # The event_date example survives because the column exists.
    assert "event_date > DATE '2026-01-01'" in examples
    # No surviving example may reference the missing country_code column.
    assert not any("country_code" in example for example in examples)
|
|
|
|
|
|
def test_view_returns_null_rows_and_size_bytes(seeded_app):
    """A VIEW whose cache row carries null rows/size_bytes is reported with
    entity_type 'VIEW' and null rows, size_bytes and rough_size_hint.

    NOTE(review): this test does not cover a VIEW cache row that still has
    rows/size_bytes populated (e.g. one written before entity_type
    existed) -- confirm whether the catalog masks those values or relies
    on the next refresh to overwrite them.
    """
    _reset_catalog_caches()

    client = seeded_app["client"]
    admin_token = seeded_app["admin_token"]

    _register_table(
        seeded_app,
        id="ue_view",
        source_type="bigquery",
        bucket="dwh_base",
        source_table="ue_view",
        query_mode="remote",
    )
    # Seed the row the way the provider is expected to write it for a view:
    # rows and size_bytes are None.
    _seed_cache_row(
        "ue_view",
        rows=None,
        size_bytes=None,
        partition_by=None,
        clustered_by=[],
        entity_type="VIEW",
        known_columns=["event_date"],
    )

    response = client.get(
        "/api/v2/catalog",
        headers={"Authorization": f"Bearer {admin_token}"},
    )
    assert response.status_code == 200, response.text

    catalog_tables = response.json()["tables"]
    row = next(entry for entry in catalog_tables if entry["id"] == "ue_view")

    assert row["entity_type"] == "VIEW"
    for null_field in ("rows", "size_bytes", "rough_size_hint"):
        assert row[null_field] is None, null_field
|
|
|
|
|
|
def test_where_examples_empty_when_columns_unknown(seeded_app):
    """A remote row without any cache entry (never_fetched) advertises no
    where_examples and a null entity_type: with the schema unknown there
    is nothing to validate examples against."""
    _reset_catalog_caches()

    client = seeded_app["client"]
    admin_token = seeded_app["admin_token"]

    # Register the table only; deliberately seed no cache row.
    _register_table(
        seeded_app,
        id="unfetched",
        source_type="bigquery",
        bucket="dwh_base",
        source_table="unfetched",
        query_mode="remote",
    )

    response = client.get(
        "/api/v2/catalog",
        headers={"Authorization": f"Bearer {admin_token}"},
    )
    assert response.status_code == 200, response.text

    catalog_tables = response.json()["tables"]
    row = next(entry for entry in catalog_tables if entry["id"] == "unfetched")

    assert row["metadata_freshness"] == "never_fetched"
    assert row["where_examples"] == []
    assert row["entity_type"] is None
|
|
|
|
|
|
def test_remote_row_with_no_cache_returns_null_fields(seeded_app):
    """A remote row with no cache entry (first boot, before any scheduler
    tick) yields null enrichment fields and
    metadata_freshness='never_fetched'. The request must still return 200
    and must not call the BQ metadata provider."""
    _reset_catalog_caches()

    client = seeded_app["client"]
    admin_token = seeded_app["admin_token"]

    _register_table(
        seeded_app,
        id="cold_t",
        source_type="bigquery",
        bucket="dwh_base",
        source_table="cold_t",
        query_mode="remote",
    )

    # Replace the BQ provider with a mock so any call from the request path
    # would be recorded.
    with patch("connectors.bigquery.metadata.fetch") as mock_fetch:
        response = client.get(
            "/api/v2/catalog",
            headers={"Authorization": f"Bearer {admin_token}"},
        )
        assert response.status_code == 200, response.text
        mock_fetch.assert_not_called()

    catalog_tables = response.json()["tables"]
    cold = next(entry for entry in catalog_tables if entry["id"] == "cold_t")

    for null_field in ("rows", "size_bytes", "partition_by"):
        assert cold[null_field] is None, null_field
    assert cold["clustered_by"] == []
    assert cold["metadata_freshness"] == "never_fetched"
|
|
|
|
|
|
def test_local_row_metadata_freshness_is_not_applicable(seeded_app):
    """A query_mode='local' row reports metadata_freshness
    'not_applicable', signalling that the BQ metadata cache does not apply
    to it."""
    _reset_catalog_caches()

    client = seeded_app["client"]
    admin_token = seeded_app["admin_token"]

    _register_table(
        seeded_app,
        id="users",
        source_type="keboola",
        bucket="in.c-crm",
        source_table="users",
        query_mode="local",
    )

    response = client.get(
        "/api/v2/catalog",
        headers={"Authorization": f"Bearer {admin_token}"},
    )
    assert response.status_code == 200, response.text

    catalog_tables = response.json()["tables"]
    users = next(entry for entry in catalog_tables if entry["id"] == "users")

    assert users["metadata_freshness"] == "not_applicable"
|
|
|
|
|
|
def test_zero_size_bytes_reports_small_not_unknown(seeded_app):
    """Regression guard (Devin Review #1): a cache row whose size_bytes is
    0 must be reported with rough_size_hint='small' rather than None.
    """
    _reset_catalog_caches()

    client = seeded_app["client"]
    admin_token = seeded_app["admin_token"]

    _register_table(
        seeded_app,
        id="empty_t",
        source_type="bigquery",
        bucket="dwh_base",
        source_table="empty_t",
        query_mode="remote",
    )
    _seed_cache_row(
        "empty_t",
        rows=0,
        size_bytes=0,
        clustered_by=[],
        entity_type="BASE TABLE",
        known_columns=["event_date"],
    )

    response = client.get(
        "/api/v2/catalog",
        headers={"Authorization": f"Bearer {admin_token}"},
    )
    assert response.status_code == 200, response.text

    catalog_tables = response.json()["tables"]
    empty = next(entry for entry in catalog_tables if entry["id"] == "empty_t")

    # A zero size is a known value and must be reported as 0.
    assert empty["size_bytes"] == 0
    assert empty["rough_size_hint"] == "small"
|
|
|
|
|
|
def test_catalog_request_never_calls_bq(seeded_app):
    """GET /api/v2/catalog must never reach the BQ metadata provider, even
    with a cold cache and a remote BQ row in the registry. Regressing this
    re-introduces the >90 s hang.

    The endpoint is requested twice in a row, and each response must be a
    200. Without the status check, a request rejected before it reaches the
    catalog logic (auth failure, server error) would leave the provider
    untouched and let this test pass without proving anything.
    """
    _reset_catalog_caches()

    client = seeded_app["client"]
    admin_token = seeded_app["admin_token"]

    # Register a remote BQ row but seed no cache entry: the cold-cache case.
    _register_table(
        seeded_app,
        id="orders",
        source_type="bigquery",
        bucket="dwh_base",
        source_table="orders_2024",
        query_mode="remote",
    )

    auth_headers = {"Authorization": f"Bearer {admin_token}"}
    with patch("connectors.bigquery.metadata.fetch") as mock_fetch:
        for attempt in ("first", "second"):
            response = client.get("/api/v2/catalog", headers=auth_headers)
            assert response.status_code == 200, (
                f"{attempt} catalog request failed: {response.text}"
            )

    mock_fetch.assert_not_called()
|