## Summary
- Catalog enrichment for `query_mode='remote'` rows: `rows`, `size_bytes`, `partition_by`, `clustered_by` per table (BQ + Keboola providers).
- `/api/v2/schema/{id}` cache miss: 2 BQ jobs → 1 (-50%) via shared `fetch_bq_columns_full`.
- All four catalog/schema/sample/metadata caches flush on registry change; single-row re-warm scheduled.
- Automatic cache warmup at server startup (bounded concurrency, opt-out via `AGNES_SKIP_CACHE_WARMUP=1`).
- SSE-driven freshness toolbar on `/admin/tables` with progress bar, log, and per-row badge.
- New admin doc `docs/admin/query-modes.md` — single source of truth on `local` / `remote` / `materialized` choice.
Closes #155.
Closes #156.
## Test plan
- [x] 65+ targeted tests pass across 11 new test modules + 3 modified ones.
- [x] No DB migration; no wire-break; `MIN_COMPAT_CLI_VERSION` unchanged.
- [ ] Reviewer: register a remote BQ table via `/admin/tables`, observe the toolbar populates within ~2 s and the per-row badge transitions warming → fresh.
- [ ] Reviewer: trigger `Re-warm all`, verify SSE log scrolls and `cacheWarmupBar` progresses.
- [ ] Reviewer: edit a registered row's bucket, verify `agnes schema <id>` returns updated columns immediately (no 1-hour staleness).
- [ ] Reviewer: confirm `agnes admin register-table --query-mode remote` prints the new IAM-smoke-check hint.
## Notable design decisions
- BigQuery `INFORMATION_SCHEMA.TABLE_STORAGE` is the source for size+rows, and it is only valid at region scope (verified live 2026-05-07; a dataset-scoped variant doesn't exist). The region is resolved from `instance.yaml.data_source.bigquery.location`, then from `bq.client().get_dataset(...)`; otherwise we fall back to legacy `__TABLES__`.
- VIEW handling: TABLE_STORAGE returns no rows for views, so the lookup falls through to `__TABLES__` (also empty for views), yielding `TableMetadata(rows=None, size_bytes=None, partition_by=..., clustered_by=...)`. The null size signals analyst Claude to apply the existing CLAUDE.md guidance.
- `size_bytes` is `active_logical_bytes + long_term_logical_bytes` — full BQ scan reads both; reporting only active undercounts aged partitioned tables.
- Source-agnostic provider seam: per-source `connectors/<source>/metadata.py:fetch(MetadataRequest)`; dispatcher in `app/api/v2_catalog.py:_metadata_provider_for` lazily imports per source_type so a Keboola-only deployment doesn't pay the BQ-extension import cost.
- Warmup non-blocking: FastAPI `lifespan` schedules `asyncio.create_task(_warm_catalog_caches_bg)` before `yield`. Per-row failures isolated.
## Out of scope
- Profile / column histograms / dimension cardinality for remote tables (separate issue).
- Onboarding nudge ("you have 0 remote tables, consider registering some BQ ones") — separate UX call.
- Provider plug-in registration via entry-points (the dispatch table is a hardcoded if-tree today; one line per future source).
## Release
Bumps `pyproject.toml` 0.46.1 → 0.47.0 (main shipped 0.46.0 + 0.46.1 during this PR — see commit `d98976ec`). New CHANGELOG section under `## [0.47.0] — 2026-05-07`.
🤖 Generated with [Claude Code](https://claude.com/claude-code)
<!-- devin-review-badge-begin -->
---
<a href="https://app.devin.ai/review/keboola/agnes-the-ai-analyst/pull/223" target="_blank">
<picture>
<source media="(prefers-color-scheme: dark)" srcset="https://static.devin.ai/assets/gh-open-in-devin-review-dark.svg?v=1">
<img src="https://static.devin.ai/assets/gh-open-in-devin-review-light.svg?v=1" alt="Open in Devin Review">
</picture>
</a>
<!-- devin-review-badge-end -->
179 lines
6 KiB
Python
179 lines
6 KiB
Python
"""Catalog endpoint integration: per-table metadata enrichment for
|
|
remote rows."""
|
|
|
|
from unittest.mock import patch
|
|
|
|
from app.api._metadata_models import TableMetadata
|
|
|
|
|
|
def _register_table(seeded_app, **kwargs):
    """Insert one row into the table registry of the test system DB.

    ``seeded_app`` is accepted for call-site symmetry with the tests but is
    not read here. All remaining keyword arguments are forwarded unchanged to
    ``TableRegistryRepository.register``; when ``name`` is omitted, the value
    of ``id`` (or ``None`` if that is missing too) is used in its place.
    The DB connection is always closed, even if registration raises.
    """
    from src.db import get_system_db
    from src.repositories.table_registry import TableRegistryRepository

    # An explicit `name` wins; otherwise reuse the table id as the name.
    table_name = kwargs.pop("name", kwargs.get("id"))

    db = get_system_db()
    try:
        TableRegistryRepository(db).register(name=table_name, **kwargs)
    finally:
        db.close()
|
|
|
|
|
|
def test_remote_row_includes_metadata_fields(seeded_app, monkeypatch):
    """A remote BigQuery row in the catalog response exposes the metadata
    returned by the (patched) provider: ``rows``, ``size_bytes``,
    ``partition_by`` and ``clustered_by``, alongside the pre-existing
    ``query_mode`` field."""
    from app.api import v2_catalog

    # Start from empty caches so the table registered below is picked up.
    for cache in (v2_catalog._table_rows_cache, v2_catalog._metadata_cache):
        cache.clear()

    client = seeded_app["client"]
    auth_headers = {"Authorization": f"Bearer {seeded_app['admin_token']}"}

    provider_result = TableMetadata(
        rows=10000,
        size_bytes=2_000_000,
        partition_by="event_date",
        clustered_by=["country", "platform"],
    )

    _register_table(
        seeded_app,
        id="orders",
        source_type="bigquery",
        bucket="dwh_base",
        source_table="orders_2024",
        query_mode="remote",
    )

    with patch("connectors.bigquery.metadata.fetch", return_value=provider_result):
        r = client.get("/api/v2/catalog", headers=auth_headers)

    assert r.status_code == 200, r.text
    orders = next(row for row in r.json()["tables"] if row["id"] == "orders")

    expected = {
        # The four new enrichment fields.
        "rows": 10000,
        "size_bytes": 2_000_000,
        "partition_by": "event_date",
        "clustered_by": ["country", "platform"],
        # Existing fields still present.
        "query_mode": "remote",
    }
    for field, want in expected.items():
        assert orders[field] == want
|
|
|
|
|
|
def test_local_row_unaffected_by_provider_dispatch(seeded_app):
    """With only a query_mode='local' Keboola row registered by this test,
    a catalog request succeeds and never invokes the Keboola metadata
    provider."""
    from app.api import v2_catalog

    # Start from empty caches so the table registered below is picked up.
    for cache in (v2_catalog._table_rows_cache, v2_catalog._metadata_cache):
        cache.clear()

    client = seeded_app["client"]
    auth_headers = {"Authorization": f"Bearer {seeded_app['admin_token']}"}

    _register_table(
        seeded_app,
        id="users",
        source_type="keboola",
        bucket="in.c-crm",
        source_table="users",
        query_mode="local",
    )

    with patch("connectors.keboola.metadata.fetch") as mock_fetch:
        r = client.get("/api/v2/catalog", headers=auth_headers)

    assert r.status_code == 200, r.text
    mock_fetch.assert_not_called()
|
|
|
|
|
|
def test_provider_failure_returns_null_metadata(seeded_app):
    """When the provider yields ``None`` the catalog still answers 200 and
    the affected row carries ``null`` for every enrichment field instead of
    the request failing with a 500."""
    from app.api import v2_catalog

    # Start from empty caches so the table registered below is picked up.
    for cache in (v2_catalog._table_rows_cache, v2_catalog._metadata_cache):
        cache.clear()

    client = seeded_app["client"]
    auth_headers = {"Authorization": f"Bearer {seeded_app['admin_token']}"}

    _register_table(
        seeded_app,
        id="broken",
        source_type="bigquery",
        bucket="dwh_base",
        source_table="broken_t",
        query_mode="remote",
    )

    with patch("connectors.bigquery.metadata.fetch", return_value=None):
        r = client.get("/api/v2/catalog", headers=auth_headers)

    assert r.status_code == 200, r.text
    broken = next(row for row in r.json()["tables"] if row["id"] == "broken")

    # Each key must be present and explicitly null.
    for field in ("rows", "size_bytes", "partition_by", "clustered_by"):
        assert broken[field] is None
|
|
|
|
|
|
def test_zero_size_bytes_reports_small_not_unknown(seeded_app):
    """Regression test (Devin Review #1): a genuinely empty table must not
    be reported as having an unknown size.

    Per the original report, a truthiness check (`if cached.size_bytes:`)
    treated `size_bytes == 0` as missing and emitted
    `rough_size_hint=None` ("unknown") rather than `"small"`, the bucket
    `_bucket_size(0)` returns. The fix in `_size_hint_for_row` separates
    "size known to be zero" from "size is unknown" by testing
    `is not None`."""
    from app.api import v2_catalog

    # Start from empty caches so the table registered below is picked up.
    for cache in (v2_catalog._table_rows_cache, v2_catalog._metadata_cache):
        cache.clear()

    client = seeded_app["client"]
    auth_headers = {"Authorization": f"Bearer {seeded_app['admin_token']}"}

    zero_sized = TableMetadata(
        rows=0,
        size_bytes=0,
        partition_by=None,
        clustered_by=[],
    )

    _register_table(
        seeded_app,
        id="empty_t",
        source_type="bigquery",
        bucket="dwh_base",
        source_table="empty_t",
        query_mode="remote",
    )

    with patch("connectors.bigquery.metadata.fetch", return_value=zero_sized):
        r = client.get("/api/v2/catalog", headers=auth_headers)

    assert r.status_code == 200, r.text
    empty = next(row for row in r.json()["tables"] if row["id"] == "empty_t")

    # The whole point of this test: 0 bytes is NOT "unknown".
    assert empty["size_bytes"] == 0
    assert empty["rough_size_hint"] == "small", (
        f"size_bytes=0 should bucket to 'small', got {empty['rough_size_hint']}"
    )
|
|
|
|
|
|
def test_cache_hit_does_not_call_provider_twice(seeded_app):
    """Two back-to-back catalog requests trigger exactly one provider call;
    the second request is expected to be served from the metadata cache
    (described by the original author as a 15-minute window; the TTL itself
    is defined outside this file)."""
    from app.api import v2_catalog

    # Start from empty caches so the first request is a guaranteed miss.
    for cache in (v2_catalog._table_rows_cache, v2_catalog._metadata_cache):
        cache.clear()

    client = seeded_app["client"]
    auth_headers = {"Authorization": f"Bearer {seeded_app['admin_token']}"}

    _register_table(
        seeded_app,
        id="orders",
        source_type="bigquery",
        bucket="dwh_base",
        source_table="orders_2024",
        query_mode="remote",
    )

    provider_result = TableMetadata(rows=1, size_bytes=2)
    with patch(
        "connectors.bigquery.metadata.fetch", return_value=provider_result
    ) as mock_fetch:
        # Same request twice: a miss followed by an expected hit.
        for _ in range(2):
            client.get("/api/v2/catalog", headers=auth_headers)

    assert mock_fetch.call_count == 1
|