Since 0.47.0, GET /api/v2/catalog enriched each remote BigQuery row by fetching INFORMATION_SCHEMA.TABLE_STORAGE + COLUMNS through the DuckDB BigQuery extension *inside the request*. On cold caches that fanned out to O(N) sequential BQ jobs-API roundtrips — easily 90 s+ on partitioned / view-backed tables — and reliably blew the CLI's 30 s httpx ReadTimeout. Reproduced with py-spy: three AnyIO worker threads stuck inside connectors/bigquery/metadata._fetch_via_legacy_tables. Refactor: enrichment is read exclusively from a new persistent bq_metadata_cache DuckDB table (schema v40), populated by a scheduler-driven refresh job at SCHEDULER_BQ_METADATA_REFRESH_INTERVAL (default 4 h). Cold catalog response on a fresh container is now tens of milliseconds with metadata_freshness=never_fetched for unwarmed rows. New surface: - POST /api/admin/run-bq-metadata-refresh (scheduler-driven, full) - POST /api/v2/metadata-cache/refresh?table=<id> (admin, single) - GET /api/v2/metadata-cache/status (auth, non-admin) - metadata_freshness field per catalog row. Removed (internal API): v2_catalog._size_hint_for_row, _resolve_remote_metadata, _metadata_provider_for, _build_metadata_request, _materialized_size_hint, in-memory _metadata_cache. Response shape unchanged for external consumers. 991 tests passing; 2 pre-existing failures (test_db v3→v4 ladder, test_cli_binary_rename) unrelated to this change.
160 lines
5.3 KiB
Python
160 lines
5.3 KiB
Python
"""Repository + freshness tests for the persistent BQ metadata cache."""
|
|
|
|
from datetime import datetime, timedelta, timezone
|
|
|
|
import pytest
|
|
|
|
from src.repositories.bq_metadata_cache import BqMetadataCacheRepository
|
|
|
|
|
|
def test_upsert_success_inserts_then_updates(seeded_app):
    """Insert a cache row via ``upsert_success`` and then overwrite it.

    The first call must create the row with every supplied field and no
    error marker. A second call for the same table id must replace the
    stored values in place, including ``size_bytes``.
    """
    from src.db import get_system_db

    conn = get_system_db()
    try:
        repo = BqMetadataCacheRepository(conn)
        repo.upsert_success(
            "orders", rows=10, size_bytes=2048,
            partition_by="event_date", clustered_by=["country"],
        )
        row = repo.get("orders")
        assert row is not None
        assert row["rows"] == 10
        assert row["size_bytes"] == 2048
        assert row["partition_by"] == "event_date"
        assert row["clustered_by"] == ["country"]
        assert row["refreshed_at"] is not None
        assert row["error_at"] is None

        # Update with new numbers; refreshed_at must not move backwards.
        first_refresh = row["refreshed_at"]
        repo.upsert_success(
            "orders", rows=20, size_bytes=4096,
            partition_by=None, clustered_by=[],
        )
        row2 = repo.get("orders")
        # Fail with a clear assertion (not a TypeError) if the row vanished.
        assert row2 is not None
        assert row2["rows"] == 20
        # size_bytes is part of the update payload, so verify it was
        # overwritten too rather than left at the value from the insert.
        assert row2["size_bytes"] == 4096
        assert row2["partition_by"] is None
        assert row2["clustered_by"] == []
        assert row2["refreshed_at"] >= first_refresh
    finally:
        conn.close()
|
|
|
|
|
|
def test_mark_error_preserves_prior_success(seeded_app):
    """A failure recorded after a successful refresh must leave the
    rows/size_bytes values as they were, so the last-known-good numbers
    stay available until a later refresh recovers. The next successful
    upsert must then clear the recorded error again."""
    from src.db import get_system_db

    db = get_system_db()
    try:
        cache = BqMetadataCacheRepository(db)

        # Establish a known-good row, then record a failure on top of it.
        cache.upsert_success(
            "orders",
            rows=100,
            size_bytes=1000,
            partition_by=None,
            clustered_by=None,
        )
        cache.mark_error("orders", "BQ timeout")

        after_error = cache.get("orders")
        assert after_error["rows"] == 100, "prior success must be preserved across error"
        assert after_error["size_bytes"] == 1000
        assert after_error["error_at"] is not None
        assert after_error["error_msg"] == "BQ timeout"

        # A later successful refresh replaces the numbers and wipes the error.
        cache.upsert_success(
            "orders",
            rows=200,
            size_bytes=2000,
            partition_by=None,
            clustered_by=None,
        )

        after_recovery = cache.get("orders")
        assert after_recovery["rows"] == 200
        assert after_recovery["error_at"] is None
        assert after_recovery["error_msg"] is None
    finally:
        db.close()
|
|
|
|
|
|
def test_mark_error_truncates_long_messages(seeded_app):
    """A 2000-character error message must be stored as exactly 512
    characters."""
    from src.db import get_system_db

    db = get_system_db()
    try:
        cache = BqMetadataCacheRepository(db)
        oversized_message = "x" * 2000
        cache.mark_error("orders", oversized_message)

        stored = cache.get("orders")
        assert len(stored["error_msg"]) == 512
    finally:
        db.close()
|
|
|
|
|
|
def test_list_all_orders_by_table_id(seeded_app):
    """``list_all`` must return the cached rows sorted by ``table_id``.

    Two rows are inserted in reverse alphabetical order ("zeta" before
    "alpha") so that insertion order and sorted order differ.
    """
    from src.db import get_system_db

    conn = get_system_db()
    try:
        repo = BqMetadataCacheRepository(conn)
        repo.upsert_success(
            "zeta", rows=1, size_bytes=1, partition_by=None, clustered_by=None,
        )
        repo.upsert_success(
            "alpha", rows=2, size_bytes=2, partition_by=None, clustered_by=None,
        )
        rows = repo.list_all()
        ids = [r["table_id"] for r in rows]
        # An empty result, or one missing the rows inserted above, is
        # trivially "sorted"; require both ids so the ordering check
        # below cannot pass vacuously.
        assert "alpha" in ids
        assert "zeta" in ids
        assert ids == sorted(ids)
    finally:
        conn.close()
|
|
|
|
|
|
def test_delete_removes_row(seeded_app):
    """``delete`` must remove a previously stored cache row so that a
    following ``get`` for the same table id returns ``None``."""
    from src.db import get_system_db

    conn = get_system_db()
    try:
        repo = BqMetadataCacheRepository(conn)
        repo.upsert_success(
            "orders", rows=1, size_bytes=1, partition_by=None, clustered_by=None,
        )
        # Precondition: the row must exist before the delete, otherwise the
        # final assertion would pass even if nothing was ever stored.
        assert repo.get("orders") is not None
        repo.delete("orders")
        assert repo.get("orders") is None
    finally:
        conn.close()
|
|
|
|
|
|
# ─── compute_freshness ────────────────────────────────────────────────────
|
|
|
|
|
|
def test_freshness_never_fetched_for_missing_row():
    """With no cache row at all (``None``), freshness is ``never_fetched``."""
    from app.api.bq_metadata_refresh import compute_freshness

    status = compute_freshness(None)
    assert status == "never_fetched"
|
|
|
|
|
|
def test_freshness_never_fetched_for_no_refresh_no_error():
    """A row with neither a refresh timestamp nor an error timestamp is
    reported as ``never_fetched``."""
    from app.api.bq_metadata_refresh import compute_freshness

    untouched_row = dict(refreshed_at=None, error_at=None)
    status = compute_freshness(untouched_row)
    assert status == "never_fetched"
|
|
|
|
|
|
def test_freshness_error_when_only_error_present():
    """A row that has an error timestamp but was never successfully
    refreshed is reported as ``error``."""
    from app.api.bq_metadata_refresh import compute_freshness

    failed_at = datetime.now(timezone.utc)
    failed_row = {"refreshed_at": None, "error_at": failed_at}
    status = compute_freshness(failed_row)
    assert status == "error"
|
|
|
|
|
|
def test_freshness_fresh_within_threshold():
    """A row refreshed more recently than the threshold is ``fresh``."""
    from app.api.bq_metadata_refresh import compute_freshness

    now = datetime.now(timezone.utc)
    one_minute_ago = now - timedelta(seconds=60)
    recent_row = {"refreshed_at": one_minute_ago, "error_at": None}

    # Refreshed one minute ago against a one-hour threshold, so still fresh.
    status = compute_freshness(recent_row, now=now, fresh_threshold=3600)
    assert status == "fresh"
|
|
|
|
|
|
def test_freshness_stale_beyond_threshold():
    """A row whose last refresh is older than the threshold is ``stale``."""
    from app.api.bq_metadata_refresh import compute_freshness

    now = datetime.now(timezone.utc)
    ten_hours_ago = now - timedelta(hours=10)
    old_row = {"refreshed_at": ten_hours_ago, "error_at": None}

    # Refreshed ten hours ago against a 3600 threshold, so stale.
    status = compute_freshness(old_row, now=now, fresh_threshold=3600)
    assert status == "stale"
|