Since 0.47.0, GET /api/v2/catalog enriched each remote BigQuery row by fetching INFORMATION_SCHEMA.TABLE_STORAGE + COLUMNS through the DuckDB BigQuery extension *inside the request*. On cold caches that fanned out to O(N) sequential BQ jobs-API roundtrips — easily 90 s+ on partitioned / view-backed tables — and reliably blew the CLI's 30 s httpx ReadTimeout. Reproduced with py-spy: three AnyIO worker threads stuck inside connectors/bigquery/metadata._fetch_via_legacy_tables.

Refactor: enrichment is read exclusively from a new persistent bq_metadata_cache DuckDB table (schema v40), populated by a scheduler-driven refresh job at SCHEDULER_BQ_METADATA_REFRESH_INTERVAL (default 4 h). Cold catalog response on a fresh container is now tens of milliseconds with metadata_freshness=never_fetched for unwarmed rows.

New surface:
- POST /api/admin/run-bq-metadata-refresh (scheduler-driven, full)
- POST /api/v2/metadata-cache/refresh?table=<id> (admin, single)
- GET /api/v2/metadata-cache/status (auth, non-admin)
- metadata_freshness field per catalog row

Removed (internal API): v2_catalog._size_hint_for_row, _resolve_remote_metadata, _metadata_provider_for, _build_metadata_request, _materialized_size_hint, in-memory _metadata_cache.

Response shape unchanged for external consumers.

991 tests passing; 2 pre-existing failures (test_db v3→v4 ladder, test_cli_binary_rename) unrelated to this change.
110 lines
3.8 KiB
Python
110 lines
3.8 KiB
Python
"""Unified cache flush across the three in-memory catalog/schema/sample
caches on registry write.

Post-0.50: the persistent ``bq_metadata_cache`` is intentionally NOT
invalidated here. That table's lifecycle is owned by the scheduler-
driven refresh — admins who need an immediate refresh after editing a
remote row hit ``POST /api/v2/metadata-cache/refresh?table=<id>``
explicitly. Auto-invalidation on every registry edit would re-introduce
the request-path BQ fan-out the refactor exists to avoid.
"""
|
|
|
|
from src.db import get_system_db
|
|
from src.repositories.bq_metadata_cache import BqMetadataCacheRepository
|
|
|
|
|
|
def test_invalidate_flushes_three_in_memory_caches():
    """One ``invalidate_for_table`` call empties the catalog-rows, schema,
    and sample caches."""
    from app.api import v2_catalog, v2_schema, v2_sample

    # (owning module, cache attribute, key, seeded value) — one per cache.
    seeded_entries = (
        (v2_catalog, "_table_rows_cache", "all", ["fake_row"]),
        (v2_schema, "_schema_cache", "orders", {"columns": []}),
        # Sample cache is cleared whole (there is no prefix-invalidation),
        # so any key under it must disappear.
        (v2_sample, "_sample_cache", "orders|10", [{"row": 1}]),
    )

    # Pre-populate every cache so the flush has something to remove.
    for owner, cache_attr, key, value in seeded_entries:
        getattr(owner, cache_attr).set(key, value)

    v2_catalog.invalidate_for_table("orders")

    # Look each cache up again through its module attribute after the flush.
    for owner, cache_attr, key, _value in seeded_entries:
        assert getattr(owner, cache_attr).get(key) is None
|
|
|
|
|
|
def test_invalidate_does_not_touch_persistent_bq_cache():
    """A row in the persistent BQ metadata cache outlives a registry-row
    invalidation; only an explicit ``POST /api/v2/metadata-cache/refresh``
    (or the scheduled refresh) should change it."""
    from app.api import v2_catalog

    table_id = "survives_invalidate"

    # Write the row on its own connection and release it before invalidating.
    write_conn = get_system_db()
    try:
        BqMetadataCacheRepository(write_conn).upsert_success(
            table_id,
            rows=42,
            size_bytes=4096,
            partition_by=None,
            clustered_by=None,
        )
    finally:
        write_conn.close()

    v2_catalog.invalidate_for_table(table_id)

    # Read back through a fresh connection.
    read_conn = get_system_db()
    try:
        cached = BqMetadataCacheRepository(read_conn).get(table_id)
    finally:
        read_conn.close()

    assert cached is not None
    assert cached["rows"] == 42
|
|
|
|
|
|
def test_register_table_invalidates(seeded_app):
    """Registering a table flushes the rows cache, so the next catalog
    request reflects it without waiting for the 5-min TTL."""
    from app.api import v2_catalog

    v2_catalog._table_rows_cache.set("all", [])

    http = seeded_app["client"]
    auth = {"Authorization": f"Bearer {seeded_app['admin_token']}"}
    registration = {
        "name": "new_t",
        "source_type": "keboola",
        "bucket": "in.c-x",
        "source_table": "t",
        "query_mode": "local",
    }
    http.post("/api/admin/register-table", json=registration, headers=auth)

    assert v2_catalog._table_rows_cache.get("all") is None
|
|
|
|
|
|
def test_update_table_invalidates(seeded_app):
    """Updating a registry row via PUT flushes the catalog rows cache."""
    from app.api import v2_catalog

    http = seeded_app["client"]
    auth = {"Authorization": f"Bearer {seeded_app['admin_token']}"}

    # Register the table first so there is a row to update.
    registration = {
        "name": "u_t",
        "source_type": "keboola",
        "bucket": "in.c-x",
        "source_table": "t",
        "query_mode": "local",
    }
    http.post("/api/admin/register-table", json=registration, headers=auth)

    # Seed the cache only after registration, so the assertion below
    # reflects the update alone.
    v2_catalog._table_rows_cache.set("all", ["pre-update"])
    http.put(
        "/api/admin/registry/u_t",
        json={"description": "new"},
        headers=auth,
    )

    assert v2_catalog._table_rows_cache.get("all") is None
|
|
|
|
|
|
def test_unregister_table_invalidates(seeded_app):
    """Deleting a registry row via DELETE flushes the catalog rows cache."""
    from app.api import v2_catalog

    http = seeded_app["client"]
    auth = {"Authorization": f"Bearer {seeded_app['admin_token']}"}

    # Register the table first so there is a row to delete.
    registration = {
        "name": "d_t",
        "source_type": "keboola",
        "bucket": "in.c-x",
        "source_table": "t",
        "query_mode": "local",
    }
    http.post("/api/admin/register-table", json=registration, headers=auth)

    # Seed the cache only after registration, so the assertion below
    # reflects the delete alone.
    v2_catalog._table_rows_cache.set("all", ["pre-delete"])
    http.delete("/api/admin/registry/d_t", headers=auth)

    assert v2_catalog._table_rows_cache.get("all") is None
|