Closes #254 (agnes sample alias), #255 (wide-table render), #256 (single-flight on bq-metadata-refresh + run_id), #257 (init wording), #258 (progress bar clamp). Tier B trackers left open: #259 (init resume), #260 (stale .lock), #261 (schema cold-start), #262 (docker disk).
267 lines
8.9 KiB
Python
267 lines
8.9 KiB
Python
"""End-to-end tests for the three bq_metadata_refresh endpoints."""
|
|
|
|
from unittest.mock import patch
|
|
|
|
from app.api._metadata_models import TableMetadata
|
|
|
|
|
|
def _register_remote(seeded_app, table_id: str):
    """Register ``table_id`` in the table registry as a remote BigQuery table.

    The same value is used for the registry ``name``, ``id`` and
    ``source_table``.  ``seeded_app`` is not read in this helper; callers
    pass the fixture through (presumably so the app/DB is seeded before
    registration -- confirm against the fixture definition).

    The system DB connection is closed even when registration raises.
    """
    from src.db import get_system_db
    from src.repositories.table_registry import TableRegistryRepository

    registration = {
        "name": table_id,
        "id": table_id,
        "source_type": "bigquery",
        "bucket": "dwh_base",
        "source_table": table_id,
        "query_mode": "remote",
    }

    db = get_system_db()
    try:
        TableRegistryRepository(db).register(**registration)
    finally:
        db.close()
|
|
|
|
|
|
# ─── POST /api/admin/run-bq-metadata-refresh ──────────────────────────────
|
|
|
|
|
|
def test_run_refresh_walks_remote_rows_and_upserts(seeded_app):
    """A full refresh reports success and caches the fetched metadata.

    Two remote tables are registered, the BigQuery metadata fetch is
    patched to return a fixed ``TableMetadata``, and the test then checks
    both the response counters and the cached row for each table.
    """
    from src.db import get_system_db
    from src.repositories.bq_metadata_cache import BqMetadataCacheRepository

    table_ids = ("a_remote", "b_remote")
    for table_id in table_ids:
        _register_remote(seeded_app, table_id)

    stub_metadata = TableMetadata(
        rows=5, size_bytes=512, partition_by="d", clustered_by=["c"],
    )
    auth_headers = {"Authorization": f"Bearer {seeded_app['admin_token']}"}

    with patch("connectors.bigquery.metadata.fetch", return_value=stub_metadata):
        response = seeded_app["client"].post(
            "/api/admin/run-bq-metadata-refresh",
            headers=auth_headers,
        )

    assert response.status_code == 200, response.text
    summary = response.json()
    # ">=" because the seeded app may already contain other remote rows.
    assert summary["total"] >= 2
    assert summary["succeeded"] >= 2
    assert summary["failed"] == 0

    # Every registered table must now carry the stubbed metadata values.
    expected_columns = {
        "rows": 5,
        "size_bytes": 512,
        "partition_by": "d",
        "clustered_by": ["c"],
    }
    db = get_system_db()
    try:
        cache = BqMetadataCacheRepository(db)
        for table_id in table_ids:
            cached = cache.get(table_id)
            assert cached is not None
            for column, value in expected_columns.items():
                assert cached[column] == value
    finally:
        db.close()
|
|
|
|
|
|
def test_run_refresh_marks_error_on_provider_failure(seeded_app):
    """A failing metadata fetch is counted and recorded on the cache row.

    The fetch is patched to raise ``RuntimeError("BQ throttle")``.  The
    test asserts the endpoint still answers 200, reports at least one
    failure, and that the cache row for the table stores the error
    timestamp and message.
    """
    from src.db import get_system_db
    from src.repositories.bq_metadata_cache import BqMetadataCacheRepository

    _register_remote(seeded_app, "boom")

    auth_headers = {"Authorization": f"Bearer {seeded_app['admin_token']}"}
    failing_fetch = patch(
        "connectors.bigquery.metadata.fetch",
        side_effect=RuntimeError("BQ throttle"),
    )
    with failing_fetch:
        response = seeded_app["client"].post(
            "/api/admin/run-bq-metadata-refresh",
            headers=auth_headers,
        )

    assert response.status_code == 200
    assert response.json()["failed"] >= 1

    db = get_system_db()
    try:
        cached = BqMetadataCacheRepository(db).get("boom")
        assert cached is not None
        assert cached["error_at"] is not None
        # error_msg may be None; normalise so the substring check is safe.
        recorded_message = cached["error_msg"] or ""
        assert "BQ throttle" in recorded_message
    finally:
        db.close()
|
|
|
|
|
|
def test_run_refresh_requires_admin(seeded_app):
    """An unauthenticated POST to the refresh endpoint is rejected with 401."""
    client = seeded_app["client"]
    # Deliberately send no Authorization header.
    response = client.post("/api/admin/run-bq-metadata-refresh")
    assert response.status_code == 401
|
|
|
|
|
|
def test_run_refresh_returns_run_id_and_started_at(seeded_app):
    """Issue #256: the refresh response includes ``run_id`` and ``started_at``.

    These fields let server-side and client-side log streams be correlated
    against the same run.  The test asserts an 8-character ``run_id`` and a
    non-empty ``started_at``.
    """
    _register_remote(seeded_app, "for_run_id")
    stub_metadata = TableMetadata(rows=1, size_bytes=1)
    auth_headers = {"Authorization": f"Bearer {seeded_app['admin_token']}"}

    with patch("connectors.bigquery.metadata.fetch", return_value=stub_metadata):
        response = seeded_app["client"].post(
            "/api/admin/run-bq-metadata-refresh",
            headers=auth_headers,
        )

    assert response.status_code == 200, response.text
    payload = response.json()
    assert "run_id" in payload
    assert len(payload["run_id"]) == 8
    assert "started_at" in payload
    assert payload["started_at"]
|
|
|
|
|
|
def test_concurrent_refresh_returns_409_already_running(seeded_app):
    """Issue #256: a POST made while a refresh is in flight receives 409.

    The single-flight guard is a module-level ``asyncio.Lock``.  Rather
    than racing two real requests, the test holds that lock itself, fills
    in the module's refresh state, and asserts the endpoint answers 409
    with the in-flight run's ``run_id`` / ``started_at`` in the detail.
    """
    import asyncio

    from app.api import bq_metadata_refresh as mod

    _register_remote(seeded_app, "concurrent_t")
    c = seeded_app["client"]
    token = seeded_app["admin_token"]

    # Simulate "refresh in flight" by holding the module-level lock
    # ourselves and asserting the endpoint returns 409 immediately.
    # `asyncio.Lock` requires a running loop to acquire; use a fresh one.
    async def _hold_lock_and_call():
        async with mod._refresh_lock:
            mod._refresh_state["run_id"] = "abcd1234"
            mod._refresh_state["started_at"] = "2026-05-12T13:00:00+00:00"
            try:
                # Call via TestClient (sync) — locking is module-level so
                # the endpoint handler sees the lock held.
                return c.post(
                    "/api/admin/run-bq-metadata-refresh",
                    headers={"Authorization": f"Bearer {token}"},
                )
            finally:
                # Always clear the fake in-flight state so later tests do
                # not observe a phantom run.
                mod._refresh_state["run_id"] = None
                mod._refresh_state["started_at"] = None

    # Close the private loop explicitly: an unclosed loop leaks its
    # selector/file descriptors and triggers ResourceWarning.
    loop = asyncio.new_event_loop()
    try:
        r = loop.run_until_complete(_hold_lock_and_call())
    finally:
        loop.close()

    assert r.status_code == 409, r.text
    detail = r.json()["detail"]
    assert detail["reason"] == "already_running"
    assert detail["run_id"] == "abcd1234"
    assert detail["started_at"] == "2026-05-12T13:00:00+00:00"
|
|
|
|
|
|
# ─── POST /api/v2/metadata-cache/refresh?table= ───────────────────────────
|
|
|
|
|
|
def test_refresh_one_table_endpoint(seeded_app):
    """Refreshing a single remote table returns ``ok`` and updates its cache row.

    The metadata fetch is patched to return a fixed ``TableMetadata``; the
    test then checks the response status and the cached ``rows`` value.
    """
    from src.db import get_system_db
    from src.repositories.bq_metadata_cache import BqMetadataCacheRepository

    _register_remote(seeded_app, "single")

    c = seeded_app["client"]
    token = seeded_app["admin_token"]
    fake = TableMetadata(rows=99, size_bytes=999)
    with patch("connectors.bigquery.metadata.fetch", return_value=fake):
        r = c.post(
            "/api/v2/metadata-cache/refresh?table=single",
            headers={"Authorization": f"Bearer {token}"},
        )
    assert r.status_code == 200, r.text
    assert r.json()["status"] == "ok"

    conn = get_system_db()
    try:
        row = BqMetadataCacheRepository(conn).get("single")
        # Guard first so a missing row fails as an assertion, not as a
        # TypeError from subscripting None (matches the other tests here).
        assert row is not None
        assert row["rows"] == 99
    finally:
        conn.close()
|
|
|
|
|
|
def test_refresh_one_table_unknown_id_returns_404(seeded_app):
    """Refreshing a table id that was never registered yields 404."""
    auth_headers = {"Authorization": f"Bearer {seeded_app['admin_token']}"}
    response = seeded_app["client"].post(
        "/api/v2/metadata-cache/refresh?table=does_not_exist",
        headers=auth_headers,
    )
    assert response.status_code == 404
|
|
|
|
|
|
def test_refresh_one_table_rejects_non_remote(seeded_app):
    """Refreshing a table registered with ``query_mode="local"`` yields 400."""
    from src.db import get_system_db
    from src.repositories.table_registry import TableRegistryRepository

    # A Keboola-sourced, locally-queried table: not a remote row.
    local_registration = {
        "name": "local_t",
        "id": "local_t",
        "source_type": "keboola",
        "bucket": "in.c-x",
        "source_table": "t",
        "query_mode": "local",
    }
    db = get_system_db()
    try:
        TableRegistryRepository(db).register(**local_registration)
    finally:
        db.close()

    auth_headers = {"Authorization": f"Bearer {seeded_app['admin_token']}"}
    response = seeded_app["client"].post(
        "/api/v2/metadata-cache/refresh?table=local_t",
        headers=auth_headers,
    )
    assert response.status_code == 400
|
|
|
|
|
|
# ─── GET /api/v2/metadata-cache/status ────────────────────────────────────
|
|
|
|
|
|
def test_status_endpoint_returns_per_row_freshness(seeded_app):
    """The status endpoint reports thresholds and a freshness label per table.

    A success row for ``orders`` is written directly to the cache, then the
    test asserts that the response exposes both interval fields, that the
    fresh threshold is twice the scheduler interval, and that ``orders`` is
    reported as ``fresh``.
    """
    from src.db import get_system_db
    from src.repositories.bq_metadata_cache import BqMetadataCacheRepository

    db = get_system_db()
    try:
        cache = BqMetadataCacheRepository(db)
        cache.upsert_success(
            "orders", rows=1, size_bytes=1, partition_by=None, clustered_by=None,
        )
    finally:
        db.close()

    auth_headers = {"Authorization": f"Bearer {seeded_app['admin_token']}"}
    response = seeded_app["client"].get(
        "/api/v2/metadata-cache/status",
        headers=auth_headers,
    )
    assert response.status_code == 200, response.text

    status = response.json()
    assert "scheduler_interval_seconds" in status
    assert "fresh_threshold_seconds" in status
    interval = status["scheduler_interval_seconds"]
    assert status["fresh_threshold_seconds"] == 2 * interval

    # Pick out the entry for the row upserted above.
    orders_entries = (
        entry for entry in status["tables"] if entry["table_id"] == "orders"
    )
    orders = next(orders_entries)
    assert orders["freshness"] == "fresh"
|
|
|
|
|
|
def test_status_endpoint_does_not_require_admin(seeded_app):
    """Non-admin analyst tools (CLI, Claude Code) need this surface.

    Checks that an anonymous request is rejected and an authenticated one
    succeeds.
    """
    client = seeded_app["client"]
    status_url = "/api/v2/metadata-cache/status"

    # Authentication is still mandatory: an anonymous call must get 401.
    anonymous = client.get(status_url)
    assert anonymous.status_code == 401

    # Any authenticated user should be accepted.  The admin token is used
    # only because it is the easiest valid bearer seeded_app provides;
    # switch to a plain-user token once the harness exposes one.
    auth_headers = {"Authorization": f"Bearer {seeded_app['admin_token']}"}
    authenticated = client.get(status_url, headers=auth_headers)
    assert authenticated.status_code == 200
|