Closes #254 (agnes sample alias), #255 (wide-table render), #256 (single-flight on bq-metadata-refresh + run_id), #257 (init wording), #258 (progress bar clamp). Tier B trackers left open: #259 (init resume), #260 (stale .lock), #261 (schema cold-start), #262 (docker disk).
362 lines
14 KiB
Python
362 lines
14 KiB
Python
"""BigQuery metadata cache refresh — owner of the ``bq_metadata_cache``
|
||
write path.
|
||
|
||
Three endpoints share this module:
|
||
|
||
- ``POST /api/admin/run-bq-metadata-refresh`` — called by the scheduler
|
||
container (auth: shared scheduler token resolves to a synthetic admin
|
||
user). Walks remote rows in ``table_registry``, fetches each via the
|
||
BigQuery metadata provider, UPSERTs into ``bq_metadata_cache``.
|
||
|
||
- ``POST /api/v2/metadata-cache/refresh?table=<id>`` — admin-gated, for
|
||
operator on-demand refresh of a single row (e.g. after editing the
|
||
registry entry's ``bucket`` / ``source_table``).
|
||
|
||
- ``GET /api/v2/metadata-cache/status`` — auth required, NOT admin-only.
|
||
Returns per-row freshness so analyst tooling (CLI / Claude Code) can
|
||
decide whether to trust the cached numbers or wait for a refresh.
|
||
|
||
Why this lives outside the catalog endpoint
|
||
-------------------------------------------
|
||
Earlier releases inlined a per-row BigQuery fetch into ``GET /api/v2/catalog``.
|
||
On cold caches that became O(N) sequential BQ jobs API roundtrips inside
|
||
one HTTP request — easily 90 s+ on partitioned tables — and reliably blew
|
||
the CLI's 30 s ``httpx.ReadTimeout``. Moving the fetch off the hot path
|
||
into a scheduled refresh job (default every 4 h, configurable via
|
||
``SCHEDULER_BQ_METADATA_REFRESH_INTERVAL``) keeps the catalog response
|
||
under tens of milliseconds even at first boot, at the cost of metadata
|
||
being up to one refresh-interval stale. The freshness field surfaces
|
||
that explicitly.
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import asyncio
|
||
import logging
|
||
import os
|
||
import time
|
||
from datetime import datetime, timedelta, timezone
|
||
from typing import Any, Optional
|
||
|
||
import duckdb
|
||
from fastapi import APIRouter, Depends, HTTPException, Query
|
||
|
||
from app.auth.access import require_admin
|
||
from app.auth.dependencies import _get_db, get_current_user
|
||
from src.repositories.bq_metadata_cache import BqMetadataCacheRepository
|
||
from src.repositories.table_registry import TableRegistryRepository
|
||
|
||
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
|
||
# Router that the endpoint functions in this module attach to through the
# ``@router.post`` / ``@router.get`` decorators.
router = APIRouter()
|
||
|
||
|
||
# ─── Freshness thresholds ──────────────────────────────────────────────────
|
||
|
||
|
||
def _scheduler_interval_seconds() -> int:
    """Return the refresh interval, in seconds, that the scheduler is configured with.

    The value is read from ``SCHEDULER_BQ_METADATA_REFRESH_INTERVAL`` on
    every call. The env var is re-read here (rather than importing the
    scheduler module) because the scheduler lives in a sibling container
    and is not importable from the app.

    Returns:
        The parsed positive integer, or the 4 h default when the variable
        is unset, empty, not an integer, or not strictly positive.
    """
    default_interval = 4 * 60 * 60  # 4 h default

    configured = os.environ.get("SCHEDULER_BQ_METADATA_REFRESH_INTERVAL")
    if not configured:
        # Unset and empty-string are both treated as "not configured".
        return default_interval

    try:
        seconds = int(configured)
    except (TypeError, ValueError):
        return default_interval

    if seconds <= 0:
        return default_interval
    return seconds
|
||
|
||
|
||
def _fresh_threshold_seconds() -> int:
    """Return the age, in seconds, below which a cache row counts as ``fresh``.

    The window is twice the scheduler interval. A single refresh may fail
    (network blip, BQ throttle), and the last-known-good row should keep
    reading as ``fresh`` until two consecutive refresh slots have gone by
    without a success; only after that is the row reported as ``stale``.
    """
    refresh_interval = _scheduler_interval_seconds()
    return refresh_interval * 2
|
||
|
||
|
||
def compute_freshness(
    cache_row: Optional[dict[str, Any]],
    *,
    now: Optional[datetime] = None,
    fresh_threshold: Optional[int] = None,
) -> str:
    """Classify a cache row's freshness.

    Args:
        cache_row: Mapping with (at least) optional ``refreshed_at`` and
            ``error_at`` datetimes, or ``None`` when no row exists.
        now: Reference time; defaults to the current UTC time. A naive
            value is interpreted as UTC, the same way ``refreshed_at`` is.
        fresh_threshold: Freshness window in seconds; defaults to
            ``_fresh_threshold_seconds()``.

    Returns:
        One of:

        - ``never_fetched``: no row, or neither a success nor an error
          has been recorded.
        - ``error``: an error is recorded and there is no successful
          refresh to fall back on.
        - ``fresh``: last successful refresh is within the threshold.
        - ``stale``: last successful refresh is older than the threshold.

        A row that has a ``refreshed_at`` is classified purely by its age,
        even if ``error_at`` is also set, so the last-known-good numbers
        stay usable across failed attempts.
    """
    if cache_row is None:
        return "never_fetched"

    refreshed_at = cache_row.get("refreshed_at")
    if refreshed_at is None:
        return "error" if cache_row.get("error_at") is not None else "never_fetched"

    threshold = fresh_threshold if fresh_threshold is not None else _fresh_threshold_seconds()

    reference = now if now is not None else datetime.now(timezone.utc)
    # Comparing naive and aware datetimes raises TypeError, so both sides
    # are normalised to aware UTC before the comparison.
    if reference.tzinfo is None:
        reference = reference.replace(tzinfo=timezone.utc)
    # DuckDB returns naive datetimes for TIMESTAMP columns; treat as UTC.
    if refreshed_at.tzinfo is None:
        refreshed_at = refreshed_at.replace(tzinfo=timezone.utc)

    cutoff = reference - timedelta(seconds=threshold)
    return "fresh" if refreshed_at >= cutoff else "stale"
|
||
|
||
|
||
# ─── Single-row refresh primitive ──────────────────────────────────────────
|
||
|
||
|
||
def refresh_one(conn: duckdb.DuckDBPyConnection, row: dict[str, Any]) -> dict[str, Any]:
    """Refresh the cached BigQuery metadata for a single registry row.

    Blocking (non-async) function; the endpoints in this module run it
    via ``asyncio.to_thread``.

    Args:
        conn: DuckDB connection handed to ``BqMetadataCacheRepository``.
        row: Registry row. ``id`` is required; ``bucket`` defaults to an
            empty string and ``source_table`` defaults to ``id``.

    Returns:
        A small outcome dict keyed by ``table_id`` and ``status``:
        ``ok`` (plus ``rows`` / ``size_bytes`` / ``entity_type``),
        ``no_data``, or ``error`` (plus an ``error`` message).

    Invalid identifiers, provider exceptions and empty provider results
    are recorded through ``mark_error`` and reported in the outcome
    instead of being raised. ``mark_error`` is expected to leave the last
    successful values in place — that behaviour lives in the repository.
    """
    from app.api._metadata_models import MetadataRequest
    from connectors.bigquery import metadata as bq_metadata
    from src.identifier_validation import validate_quoted_identifier

    table_id = row["id"]
    bucket = row.get("bucket") or ""
    source_table = row.get("source_table") or table_id
    cache = BqMetadataCacheRepository(conn)

    # Reject unusable identifiers before building a provider request.
    if not validate_quoted_identifier(bucket, "bucket") or not validate_quoted_identifier(
        source_table, "source_table"
    ):
        cache.mark_error(table_id, "invalid bucket/source_table identifier")
        return dict(table_id=table_id, status="error", error="invalid identifier")

    request = MetadataRequest(
        table_id=table_id,
        bucket=bucket,
        source_table=source_table,
    )

    try:
        fetched = bq_metadata.fetch(request)
    except Exception as exc:
        # bq_metadata.fetch is documented as never-raises; this guard is
        # defense in depth so a regression on one row cannot take down an
        # entire scheduler tick.
        msg = f"{type(exc).__name__}: {exc}"
        logger.warning("bq metadata refresh failed for %s: %s", table_id, msg)
        cache.mark_error(table_id, msg)
        return dict(table_id=table_id, status="error", error=msg)

    if fetched is None:
        cache.mark_error(table_id, "provider returned no data")
        return dict(table_id=table_id, status="no_data")

    cache.upsert_success(
        table_id,
        rows=fetched.rows,
        size_bytes=fetched.size_bytes,
        partition_by=fetched.partition_by,
        clustered_by=fetched.clustered_by,
        entity_type=fetched.entity_type,
        known_columns=fetched.known_columns,
    )
    return dict(
        table_id=table_id,
        status="ok",
        rows=fetched.rows,
        size_bytes=fetched.size_bytes,
        entity_type=fetched.entity_type,
    )
|
||
|
||
|
||
def _list_remote_bq_rows(conn: duckdb.DuckDBPyConnection) -> list[dict[str, Any]]:
    """Return the registry rows that a bulk refresh should visit.

    A row qualifies when its ``query_mode`` is ``"remote"`` and its
    ``source_type`` is ``"bigquery"``; everything else returned by
    ``TableRegistryRepository.list_all`` is dropped.
    """
    selected: list[dict[str, Any]] = []
    for entry in TableRegistryRepository(conn).list_all():
        if entry.get("query_mode") != "remote":
            continue
        if entry.get("source_type") != "bigquery":
            continue
        selected.append(entry)
    return selected
|
||
|
||
|
||
def _refresh_concurrency() -> int:
    """Return how many rows a bulk refresh may process at the same time.

    Read from ``AGNES_BQ_METADATA_REFRESH_CONCURRENCY`` on every call.
    Falls back to 4 when the variable is unset, is not an integer, or is
    not strictly positive.
    """
    fallback = 4
    configured = os.environ.get("AGNES_BQ_METADATA_REFRESH_CONCURRENCY", "4")
    try:
        limit = int(configured)
    except (TypeError, ValueError):
        return fallback
    if limit <= 0:
        return fallback
    return limit
|
||
|
||
|
||
# ─── Single-flight state ──────────────────────────────────────────────────
|
||
#
|
||
# Module-level guard so a second concurrent
|
||
# ``POST /api/admin/run-bq-metadata-refresh`` doesn't fan out duplicate
|
||
# BQ work. Pre-0.52 the second call would happily run its own loop and
|
||
# do 2× BQ jobs-API traffic against the same set of tables for the same
|
||
# eventual UPSERT result — confirmed by stress test C on 2026-05-12.
|
||
# DuckDB MVCC kept the rows consistent, but BQ quota leaked.
|
||
#
|
||
# Semantics: while a refresh is running, additional callers get
|
||
# ``409 already_running`` with the in-flight ``run_id`` + ``started_at``
|
||
# so they can correlate against logs. Scheduler treats 409 as a no-op
|
||
# success (next tick will fire again — usually 4 h later — and find
|
||
# the lock free).
|
||
# Held for the duration of a bulk refresh. The bulk endpoint checks
# ``locked()`` up front and answers 409 rather than waiting on it.
_refresh_lock = asyncio.Lock()
|
||
# Identity of the in-flight bulk refresh: both keys are set when a run
# starts and reset to ``None`` when it finishes, and are echoed in the 409
# response body.
_refresh_state: dict[str, Any] = {"run_id": None, "started_at": None}
|
||
|
||
|
||
# ─── Endpoints ─────────────────────────────────────────────────────────────
|
||
|
||
|
||
@router.post("/api/admin/run-bq-metadata-refresh")
async def run_bq_metadata_refresh(
    user: dict = Depends(require_admin),
    conn: duckdb.DuckDBPyConnection = Depends(_get_db),
):
    """Refresh metadata for every remote BQ row in the registry.

    Called by the scheduler at ``SCHEDULER_BQ_METADATA_REFRESH_INTERVAL``
    (default 4 h). Single-flight guarded: if a refresh is already
    running (e.g. operator clicked "Re-warm all" while a scheduler tick
    is in flight, or two scheduler containers raced during an upgrade),
    the second caller gets ``409 already_running`` with the in-flight
    ``run_id`` + ``started_at`` so they can correlate against logs.
    The scheduler treats 409 as a no-op success.

    Bounded concurrency within a run (default 4, override via
    ``AGNES_BQ_METADATA_REFRESH_CONCURRENCY``) so a deployment with
    many remote tables doesn't fan out to dozens of parallel BQ jobs.

    Returns:
        Summary dict with ``run_id``, ``started_at``, ``total``,
        ``succeeded``, ``no_data``, ``failed`` and ``duration_ms``.

    Raises:
        HTTPException: 409 when another bulk refresh is in flight.
    """
    import uuid

    from src.db import get_system_db

    # No await sits between this check and acquiring the lock below, so
    # two requests on the same event loop cannot both pass it.
    if _refresh_lock.locked():
        # Issue #256: emit 409 instead of doing 2× BQ work.
        raise HTTPException(
            status_code=409,
            detail={
                "reason": "already_running",
                "run_id": _refresh_state.get("run_id"),
                "started_at": _refresh_state.get("started_at"),
                "hint": "A refresh is already in flight; this caller is a no-op.",
            },
        )

    async with _refresh_lock:
        run_id = uuid.uuid4().hex[:8]
        started_at = datetime.now(timezone.utc).isoformat()
        _refresh_state["run_id"] = run_id
        _refresh_state["started_at"] = started_at
        try:
            rows = _list_remote_bq_rows(conn)
            sem = asyncio.Semaphore(_refresh_concurrency())

            async def _one(row: dict[str, Any]) -> dict[str, Any]:
                async with sem:
                    # Each refresh_one call wants its own cursor; the singleton
                    # connection accessor returns a fresh cursor each call.
                    return await asyncio.to_thread(refresh_one, get_system_db(), row)

            t0 = time.monotonic()
            # return_exceptions=True: one failing row must not abort the rest.
            results = await asyncio.gather(
                *(_one(r) for r in rows), return_exceptions=True,
            )
            duration_ms = int((time.monotonic() - t0) * 1000)
        finally:
            # Always clear the in-flight marker, even if the run blew up.
            _refresh_state["run_id"] = None
            _refresh_state["started_at"] = None

    succeeded = no_data = failed = 0
    # gather() returns results in the same order as its inputs, so each
    # outcome can be paired with the row that produced it.
    for row, outcome in zip(rows, results):
        if isinstance(outcome, BaseException):
            # BaseException (not Exception) so a CancelledError surfaced by
            # gather is counted too; log it, otherwise the cause is lost.
            failed += 1
            logger.warning(
                "bq metadata refresh: run_id=%s table=%s raised %s: %s",
                run_id, row.get("id"), type(outcome).__name__, outcome,
            )
            continue
        status = outcome.get("status") if isinstance(outcome, dict) else None
        if status == "ok":
            succeeded += 1
        elif status == "no_data":
            no_data += 1
        elif status == "error":
            failed += 1

    logger.info(
        "bq metadata refresh: run_id=%s total=%d ok=%d no_data=%d failed=%d duration_ms=%d",
        run_id, len(rows), succeeded, no_data, failed, duration_ms,
    )
    return {
        "run_id": run_id,
        "started_at": started_at,
        "total": len(rows),
        "succeeded": succeeded,
        "no_data": no_data,
        "failed": failed,
        "duration_ms": duration_ms,
    }
|
||
|
||
|
||
@router.post("/api/v2/metadata-cache/refresh")
async def refresh_one_table(
    table: str = Query(..., description="Registry table_id to refresh"),
    user: dict = Depends(require_admin),
    conn: duckdb.DuckDBPyConnection = Depends(_get_db),
):
    """Refresh the cached metadata of one registry row on demand (admin only).

    Intended for use right after a registry edit (new ``bucket`` /
    ``source_table``) or an upstream BQ schema change, when the operator
    does not want to wait for the next scheduled tick.

    Returns:
        The outcome dict produced by ``refresh_one`` for that row.

    Raises:
        HTTPException: 404 when ``table`` is not in the registry; 400 when
            the row is not a remote BigQuery table.
    """
    from src.db import get_system_db

    registry_row = TableRegistryRepository(conn).get(table)
    if not registry_row:
        raise HTTPException(status_code=404, detail=f"Unknown table_id: {table}")

    is_remote_bigquery = (
        registry_row.get("query_mode") == "remote"
        and registry_row.get("source_type") == "bigquery"
    )
    if not is_remote_bigquery:
        raise HTTPException(
            status_code=400,
            detail="Manual metadata refresh is only meaningful for remote BigQuery tables",
        )

    # refresh_one is blocking, so it runs on a worker thread.
    outcome = await asyncio.to_thread(refresh_one, get_system_db(), registry_row)
    return outcome
|
||
|
||
|
||
@router.get("/api/v2/metadata-cache/status")
def metadata_cache_status(
    user: dict = Depends(get_current_user),
    conn: duckdb.DuckDBPyConnection = Depends(_get_db),
):
    """Report per-table cache status for any authenticated user.

    Deliberately not admin-gated: analyst tooling reads this to decide
    whether the catalog's ``rows`` / ``size_bytes`` can be trusted or the
    table should be treated as opaque until the next refresh.

    Returns:
        Dict with ``scheduler_interval_seconds``,
        ``fresh_threshold_seconds``, ``server_time`` (ISO 8601, UTC) and
        ``tables`` — one entry per cache row, including its ``freshness``
        classification.
    """

    def _iso(stamp):
        # Timestamps may be absent; serialise those as null.
        return stamp.isoformat() if stamp else None

    cache_rows = BqMetadataCacheRepository(conn).list_all()
    threshold = _fresh_threshold_seconds()
    # One reference time for the whole response so every row is judged
    # against the same instant.
    now = datetime.now(timezone.utc)
    interval = _scheduler_interval_seconds()

    tables = [
        {
            "table_id": entry["table_id"],
            "refreshed_at": _iso(entry.get("refreshed_at")),
            "rows": entry.get("rows"),
            "size_bytes": entry.get("size_bytes"),
            "partition_by": entry.get("partition_by"),
            "clustered_by": entry.get("clustered_by") or [],
            "entity_type": entry.get("entity_type"),
            "known_columns": entry.get("known_columns") or [],
            "error_at": _iso(entry.get("error_at")),
            "error_msg": entry.get("error_msg"),
            "freshness": compute_freshness(entry, now=now, fresh_threshold=threshold),
        }
        for entry in cache_rows
    ]

    return {
        "scheduler_interval_seconds": interval,
        "fresh_threshold_seconds": threshold,
        "server_time": now.isoformat(),
        "tables": tables,
    }
|