Three concrete changes addressing the "analyst Claude misuses the CLI" class of bugs (image.png table — issues #3 and #5, plus the recurrent "how big is this table" guesswork):
1. config/claude_md_template.txt — the template that `agnes init` writes to <workspace>/CLAUDE.md. It surfaces every catalog-row field with a why, adds a query_mode-based decision tree, scopes --estimate explicitly (snapshot create ONLY — this was the #1 first-try error), adds a note on the `agnes fetch` → `agnes snapshot create` rename, and includes a 6-row failure-mode table that maps each common error wording to its right next step.
2. app/api/v2_catalog.py — populate rough_size_hint for local + materialized rows from the on-disk parquet size, bucketed small/medium/large/very_large. It was hardcoded to null with a TODO, so the AI couldn't tell "is this 6.8 GB" without a failed --remote round-trip.
3. cli/update_check.py — the [update] banner survived the da→agnes rename and printed "[update] da X is out of date" on every command, training analysts to associate the binary with the old name.
Verified by rendering the template against representative contexts (33/33 tests pass) and by running every use case from the original screenshot through the real CLI against a dev VM.
134 lines
5 KiB
Python
134 lines
5 KiB
Python
"""GET /api/v2/catalog — list tables visible to caller (spec §3.1)."""
|
|
|
|
from __future__ import annotations
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from fastapi import APIRouter, Depends
|
|
import duckdb
|
|
|
|
from app.auth.dependencies import get_current_user, _get_db
|
|
from app.utils import get_data_dir as _get_data_dir
|
|
from src.rbac import can_access_table
|
|
from src.repositories.table_registry import TableRegistryRepository
|
|
from app.api.v2_cache import TTLCache
|
|
|
|
# Router for this module's endpoints; routes registered on it are mounted
# under the /api/v2 prefix and tagged "v2".
router = APIRouter(tags=["v2"], prefix="/api/v2")
|
|
|
|
# Global cache of the raw table_registry rows. RBAC is enforced PER REQUEST
|
|
# against this list, mirroring v2_schema.py / v2_sample.py — caching the
|
|
# RBAC-filtered payload per user used to leave revoked users seeing tables
|
|
# for up to TTL after a permission flip. Cache is single-keyed; the TTL
|
|
# matches the documented `api.catalog_cache_ttl_seconds` default at
|
|
# `config/instance.yaml.example`. The config knob isn't wired through yet
|
|
# (same status as schema/sample caches), so changing it in instance.yaml is
|
|
# a no-op — tracked separately.
|
|
# The one key under which the raw registry rows are stored.
_TABLE_ROWS_KEY = "all"
# Single-slot cache (maxsize=1) whose entry expires after 300 seconds.
_table_rows_cache = TTLCache(ttl_seconds=300, maxsize=1)
|
|
|
|
|
|
def _flavor_for(source_type: str) -> str:
    """Return the SQL dialect label for a table's ``source_type``.

    Only the exact string ``"bigquery"`` maps to the ``"bigquery"`` flavor;
    every other value, including the empty string, is reported as
    ``"duckdb"``.
    """
    if source_type != "bigquery":
        return "duckdb"
    return "bigquery"
|
|
|
|
|
|
def _examples_for(source_type: str) -> list[str]:
    """Return sample WHERE predicates for a table's ``source_type``.

    BigQuery tables get two illustrative predicates; every other source
    type gets an empty list. A new list object is built on each call.
    """
    if source_type != "bigquery":
        return []
    date_predicate = "event_date > DATE '2026-01-01'"
    dimension_predicate = "country_code = 'CZ' AND platform = 'web'"
    return [date_predicate, dimension_predicate]
|
|
|
|
|
|
def _fetch_hint(table_id: str, source_type: str) -> str:
    """Return a one-line hint telling the caller how to get at the table.

    For ``"bigquery"`` sources the hint is an ``agnes snapshot create``
    command template with ``table_id`` filled in; for every other source
    type it is a fixed note that the table is already local.
    """
    if source_type != "bigquery":
        return "already local — query directly via `agnes query`"
    return f"agnes snapshot create {table_id} --select <cols> --where '<BQ predicate>' --limit <N>"
|
|
|
|
|
|
# Coarse size buckets for `rough_size_hint`. Boundaries chosen so an analyst
|
|
# Claude can decide tool by inspection: anything `large` or worse implies
|
|
# `agnes snapshot create` over `agnes query --remote`. Numbers reflect the
|
|
# default `bq_max_scan_bytes` 5 GiB ceiling — at "large" you're already at
|
|
# half the per-query gate and a naive `--remote` is likely to refuse.
|
|
# (inclusive upper bound in bytes, label) pairs, in ascending order of bound.
# The first two rows carry the same label, so the 10 MiB row never changes
# the outcome of a first-match scan; it is kept so the tuple's value stays
# the same.
_SIZE_BUCKETS = (
    (10 * 1024**2, "small"),    # up to 10 MiB
    (100 * 1024**2, "small"),   # up to 100 MiB, still analyst-laptop scale
    (1024**3, "medium"),        # up to 1 GiB
    (10 * 1024**3, "large"),    # up to 10 GiB
)
|
|
|
|
|
|
def _bucket_size(byte_count: int) -> str:
    """Map a byte count to a coarse size label.

    Scans ``_SIZE_BUCKETS`` in order and returns the label of the first
    entry whose cap is greater than or equal to ``byte_count``. A count
    above every cap is labelled ``"very_large"``.
    """
    matching_labels = (
        label for cap, label in _SIZE_BUCKETS if byte_count <= cap
    )
    return next(matching_labels, "very_large")
|
|
|
|
|
|
def _materialized_size_hint(table_id: str, source_type: str, query_mode: str) -> str | None:
|
|
"""Return a rough size bucket for a row whose data is on the server's
|
|
local filesystem (any `query_mode` that produces a parquet — `local` and
|
|
`materialized`). Returns ``None`` for `remote` (size requires a BQ
|
|
INFORMATION_SCHEMA round-trip; tracked separately) and for tables whose
|
|
parquet hasn't been materialised yet so the AI gets ``null`` not a
|
|
misleading "small".
|
|
|
|
Layout matches the v2 extract.duckdb contract:
|
|
${DATA_DIR}/extracts/<source_type>/data/<table_id>.parquet
|
|
"""
|
|
if query_mode == "remote":
|
|
return None
|
|
if not source_type:
|
|
return None
|
|
try:
|
|
path = Path(_get_data_dir()) / "extracts" / source_type / "data" / f"{table_id}.parquet"
|
|
if not path.exists():
|
|
return None
|
|
return _bucket_size(path.stat().st_size)
|
|
except Exception:
|
|
# Filesystem stat() race / permissions / weird DATA_DIR — fall back
|
|
# to null rather than crash the whole catalog response.
|
|
return None
|
|
|
|
|
|
def build_catalog(conn: duckdb.DuckDBPyConnection, user: dict) -> dict:
    """Build the catalog payload listing the tables ``user`` may access.

    The raw registry rows are read through the module-level cache and
    loaded from ``TableRegistryRepository(conn).list_all()`` on a miss. The
    access check itself is never cached: ``can_access_table`` runs for every
    row on every call, so a revoked permission takes effect on the user's
    next request rather than after the cache TTL expires.

    Args:
        conn: DuckDB connection used for the registry read and the access
            checks.
        user: The authenticated caller, passed through to
            ``can_access_table``.

    Returns:
        A dict with ``"tables"`` (one entry per accessible row) and
        ``"server_time"`` (current UTC time in ISO 8601 format).
    """
    registry_rows = _table_rows_cache.get(_TABLE_ROWS_KEY)
    if registry_rows is None:
        registry_rows = TableRegistryRepository(conn).list_all()
        _table_rows_cache.set(_TABLE_ROWS_KEY, registry_rows)

    tables = []
    for row in registry_rows:
        table_id = row["id"]
        if not can_access_table(user, table_id, conn):
            continue

        # Missing or empty registry values fall back to "" / "local".
        source_type = row.get("source_type") or ""
        query_mode = row.get("query_mode") or "local"

        # NOTE(review): sql_flavor, where_examples and fetch_via are derived
        # from source_type alone; query_mode only feeds rough_size_hint.
        # Confirm that is intended for bigquery rows that are materialized.
        entry = {
            "id": table_id,
            "name": row.get("name") or table_id,
            "description": row.get("description") or "",
            "source_type": source_type,
            "query_mode": query_mode,
            "sql_flavor": _flavor_for(source_type),
            "where_examples": _examples_for(source_type),
            "fetch_via": _fetch_hint(table_id, source_type),
            "rough_size_hint": _materialized_size_hint(
                table_id, source_type, query_mode,
            ),
        }
        tables.append(entry)

    server_time = datetime.now(timezone.utc).isoformat()
    return {"tables": tables, "server_time": server_time}
|
|
|
|
|
|
@router.get("/catalog")
async def catalog(
    user: dict = Depends(get_current_user),
    conn: duckdb.DuckDBPyConnection = Depends(_get_db),
):
    # HTTP entry point for the router's "/catalog" route. The caller and the
    # DuckDB connection are resolved by the declared dependencies, and the
    # payload is assembled by build_catalog().
    payload = build_catalog(conn=conn, user=user)
    return payload
|